# One-Class XGBoost

### Model 1 - Frequency of words

##### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import feather

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif

#### Read FTR

In [2]:
# Load the feature table from the Feather file (all columns, threaded read).
data = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,
    use_threads=True,
)

In [3]:
# Separate the target column from the features, then preview the feature frame.
# Note: `data` is rebound here, so this cell cannot be re-run without re-reading the file.
data_y = data.loc[:, 'polarityClass']
data = data.drop(columns='polarityClass')
data.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
trainX, testX, trainy, testy = train_test_split(
    data,
    data_y,
    test_size=0.3,
    random_state=2,
)

In [5]:
# Inspect the target dtype: an unsigned type cannot store the -1 label used below.
testy.dtype

dtype('uint8')

In [6]:
# Switch to a signed dtype so the target can later hold the -1 "anomaly" label.
trainy = trainy.astype(np.int8)
print(trainy.dtype)

int8


In [7]:
# Same signed conversion for the test target, so -1 is representable.
testy = testy.astype(np.int8)
print(testy.dtype)

int8


In [8]:
# Binarise the target one-vs-rest for class 0:
#   1  -> "normal"  (original class 0)
#   -1 -> "anomaly" (any other class)
# The labels are rebuilt from the untouched `data_y` (looked up by row index) rather than
# rewritten in place. The in-place version was not re-runnable: after one pass no 0 is
# left in the series, so running the cell a second time marked every row as -1.
trainy = pd.Series(
    np.where(data_y.loc[trainy.index] == 0, 1, -1),
    index=trainy.index,
    name=trainy.name,
    dtype='int8',
)

testy = pd.Series(
    np.where(data_y.loc[testy.index] == 0, 1, -1),
    index=testy.index,
    name=testy.name,
    dtype='int8',
)

In [9]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
testy.value_counts()

-1    12832
 1     2041
Name: polarityClass, dtype: int64

In [10]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
trainy.value_counts()

-1    30021
 1     4682
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 0

In [11]:
# Train a binary XGBoost classifier for "class 0 vs. rest".
# No hyper-parameter search happens here: the estimator runs with the library defaults.
# NOTE(review): newer xgboost releases may reject targets that are not 0..n_classes-1;
# the -1/1 labels could need remapping there - confirm against the installed version.
model = XGBClassifier().fit(trainX, trainy)
model





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [12]:
# Predict labels for the held-out rows with the currently bound `model`.
# `model` is rebound in later sections, so run this right after the class-0 fit cell.
predictions = model.predict(testX)

# Cross-tabulate true vs. predicted labels; margins=True appends the "All" totals.
confusion = pd.crosstab(
    index=testy,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)



Predicted     -1    1    All
Actual                      
-1         12621  211  12832
1           1298  743   2041
All        13919  954  14873


In [13]:
# Overall accuracy on the test split.
# NOTE(review): the classes are imbalanced - in the recorded run, always predicting -1
# would already score about 0.863 (12832 / 14873), so read this next to the per-class report.
score = accuracy_score(y_true=testy, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.898541


In [14]:
# Per-class precision / recall / F1 for the class-0-vs-rest classifier.
print(classification_report(y_true=testy, y_pred=predictions))

              precision    recall  f1-score   support

          -1       0.91      0.98      0.94     12832
           1       0.78      0.36      0.50      2041

    accuracy                           0.90     14873
   macro avg       0.84      0.67      0.72     14873
weighted avg       0.89      0.90      0.88     14873



#### Read FTR

In [2]:
# Load a fresh copy of the feature table for the class-1 experiment
# (same Feather file as the class-0 section; all columns, threaded read).
data2 = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,
    use_threads=True,
)

In [3]:
# Separate the target column from the features, then preview the feature frame.
# Note: `data2` is rebound here, so this cell cannot be re-run without re-reading the file.
data2_y = data2.loc[:, 'polarityClass']
data2 = data2.drop(columns='polarityClass')
data2.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train2X, test2X, train2y, test2y = train_test_split(
    data2,
    data2_y,
    test_size=0.3,
    random_state=2,
)

In [5]:
# Switch to a signed dtype so the target can later hold the -1 "anomaly" label.
train2y = train2y.astype(np.int8)
print(train2y.dtype)

int8


In [6]:
# Same signed conversion for the test target, so -1 is representable.
test2y = test2y.astype(np.int8)
print(test2y.dtype)

int8


In [7]:
# Binarise the target one-vs-rest for class 1:
#   1  -> "normal"  (original class 1)
#   -1 -> "anomaly" (any other class)
# The labels are rebuilt from the untouched `data2_y` (looked up by row index), so the
# cell yields the same result however many times it is run. This also drops the old
# `== 1 -> 1` assignment, which never changed anything.
train2y = pd.Series(
    np.where(data2_y.loc[train2y.index] == 1, 1, -1),
    index=train2y.index,
    name=train2y.name,
    dtype='int8',
)

test2y = pd.Series(
    np.where(data2_y.loc[test2y.index] == 1, 1, -1),
    index=test2y.index,
    name=test2y.name,
    dtype='int8',
)

In [8]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
test2y.value_counts()

-1    13773
 1     1100
Name: polarityClass, dtype: int64

In [9]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
train2y.value_counts()

-1    32193
 1     2510
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 1

In [10]:
# Train a binary XGBoost classifier for "class 1 vs. rest".
# No hyper-parameter search happens here: the estimator runs with the library defaults.
# NOTE(review): newer xgboost releases may reject targets that are not 0..n_classes-1;
# the -1/1 labels could need remapping there - confirm against the installed version.
model = XGBClassifier().fit(train2X, train2y)
model





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
# Predict labels for the held-out rows with the currently bound `model`.
# `model` is shared by every section, so run this right after the class-1 fit cell.
predictions = model.predict(test2X)

# Cross-tabulate true vs. predicted labels; margins=True appends the "All" totals.
confusion = pd.crosstab(
    index=test2y,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)



Predicted     -1    1    All
Actual                      
-1         13716   57  13773
1           1044   56   1100
All        14760  113  14873


In [12]:
# Overall accuracy on the test split.
# NOTE(review): the classes are heavily imbalanced - in the recorded run, always predicting
# -1 would score about 0.926 (13773 / 14873), so accuracy alone says little here.
score = accuracy_score(y_true=test2y, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.925973


In [13]:
# Per-class precision / recall / F1 for the class-1-vs-rest classifier.
print(classification_report(y_true=test2y, y_pred=predictions))

              precision    recall  f1-score   support

          -1       0.93      1.00      0.96     13773
           1       0.50      0.05      0.09      1100

    accuracy                           0.93     14873
   macro avg       0.71      0.52      0.53     14873
weighted avg       0.90      0.93      0.90     14873



#### Read FTR

In [14]:
# Load a fresh copy of the feature table for the class-2 experiment
# (same Feather file as the earlier sections; all columns, threaded read).
data3 = pd.read_feather(
    '../output/dataframe1.ftr',
    columns=None,
    use_threads=True,
)

In [15]:
# Separate the target column from the features, then preview the feature frame.
# Note: `data3` is rebound here, so this cell cannot be re-run without re-reading the file.
data3_y = data3.loc[:, 'polarityClass']
data3 = data3.drop(columns='polarityClass')
data3.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
train3X, test3X, train3y, test3y = train_test_split(
    data3,
    data3_y,
    test_size=0.3,
    random_state=2,
)

In [17]:
# Switch to a signed dtype so the target can later hold the -1 "anomaly" label.
train3y = train3y.astype(np.int8)
print(train3y.dtype)

int8


In [18]:
# Same signed conversion for the test target, so -1 is representable.
test3y = test3y.astype(np.int8)
print(test3y.dtype)

int8


In [19]:
# Binarise the target one-vs-rest for class 2:
#   1  -> "normal"  (original class 2)
#   -1 -> "anomaly" (any other class)
# The labels are rebuilt from the untouched `data3_y` (looked up by row index) rather than
# rewritten in place. The in-place version was not re-runnable: after one pass no 2 is
# left in the series, so running the cell a second time marked every row as -1.
train3y = pd.Series(
    np.where(data3_y.loc[train3y.index] == 2, 1, -1),
    index=train3y.index,
    name=train3y.name,
    dtype='int8',
)

test3y = pd.Series(
    np.where(data3_y.loc[test3y.index] == 2, 1, -1),
    index=test3y.index,
    name=test3y.name,
    dtype='int8',
)

In [20]:
# Class balance of the test labels after relabelling (1 = normal, -1 = anomaly).
test3y.value_counts()

 1    11732
-1     3141
Name: polarityClass, dtype: int64

In [21]:
# Class balance of the training labels after relabelling (1 = normal, -1 = anomaly).
train3y.value_counts()

 1    27511
-1     7192
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 2

In [22]:
# Train a binary XGBoost classifier for "class 2 vs. rest".
# No hyper-parameter search happens here: the estimator runs with the library defaults.
# NOTE(review): newer xgboost releases may reject targets that are not 0..n_classes-1;
# the -1/1 labels could need remapping there - confirm against the installed version.
model = XGBClassifier().fit(train3X, train3y)
model





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Predict labels for the held-out rows with the currently bound `model`.
# `model` is shared by every section, so run this right after the class-2 fit cell.
predictions = model.predict(test3X)

# Cross-tabulate true vs. predicted labels; margins=True appends the "All" totals.
confusion = pd.crosstab(
    index=test3y,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(confusion)

In [None]:
# Overall accuracy on the test split.
# NOTE(review): the classes are imbalanced - in the recorded label counts, always predicting
# 1 would score about 0.789 (11732 / 14873), so read this next to the per-class report.
score = accuracy_score(y_true=test3y, y_pred=predictions)
print('Accuracy:{0:f}'.format(score))

In [None]:
# Per-class precision / recall / F1 for the class-2-vs-rest classifier.
print(classification_report(y_true=test3y, y_pred=predictions))