# One-Class Support Vector Machines

### Modelo 1 - Frequency of words

##### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import feather


from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif

#### Read FTR

In [2]:
#read ftr
data = pd.read_feather( '../output/dataframe1.ftr', columns=None, use_threads=True)

In [3]:
data_y = data['polarityClass']
data = data.drop(columns=['polarityClass']) 
data.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
trainX, testX, trainy, testy = train_test_split(data, data_y, test_size=0.3, random_state=2)

In [5]:
testy.dtypes

dtype('uint8')

In [7]:
trainy = trainy.astype('int8')
print(trainy.dtypes)

int8


In [8]:
testy = testy.astype('int8')
print(testy.dtypes)

int8


In [9]:
trainy[trainy != 0] = -1 #anomaly
trainy[trainy == 0] = 1 #normal

testy[testy != 0] = -1 #anomaly
testy[testy == 0] = 1 #normal

In [10]:
testy.value_counts()

-1    12832
 1     2041
Name: polarityClass, dtype: int64

In [11]:
trainy.value_counts()

-1    30021
 1     4682
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 0

In [12]:
# Create model with best parameters
model = OneClassSVM(gamma='scale', nu=0.01)

# Fit the best model
model.fit(trainX,trainy)
model

OneClassSVM(nu=0.01)

In [13]:
predictions = model.predict(testX)# Calculate the absolute errors

# Confusion matrix
confusion = pd.crosstab(testy,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion )

Predicted   -1      1    All
Actual                      
-1         675  12157  12832
1          131   1910   2041
All        806  14067  14873


In [14]:
# Accuracy
score = accuracy_score(testy,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.173805


In [15]:
# Report
print(classification_report(testy,predictions))

              precision    recall  f1-score   support

          -1       0.84      0.05      0.10     12832
           1       0.14      0.94      0.24      2041

    accuracy                           0.17     14873
   macro avg       0.49      0.49      0.17     14873
weighted avg       0.74      0.17      0.12     14873



#### Read FTR

In [16]:
#read ftr
data2 = pd.read_feather( '../output/dataframe1.ftr', columns=None, use_threads=True)

In [17]:
data2_y = data2['polarityClass']
data2 = data2.drop(columns=['polarityClass']) 
data2.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
train2X, test2X, train2y, test2y = train_test_split(data2, data2_y, test_size=0.3, random_state=2)

In [19]:
train2y = train2y.astype('int8')
print(train2y.dtypes)

int8


In [20]:
test2y = test2y.astype('int8')
print(test2y.dtypes)

int8


In [21]:
train2y[train2y != 1] = -1 #anomaly
train2y[train2y == 1] = 1 #normal

test2y[test2y != 1] = -1 #anomaly
test2y[test2y == 1] = 1 #normal

In [22]:
test2y.value_counts()

-1    13773
 1     1100
Name: polarityClass, dtype: int64

In [23]:
train2y.value_counts()

-1    32193
 1     2510
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 1

In [24]:
# Create model with best parameters
model = OneClassSVM(gamma='scale', nu=0.01)

# Fit the best model
model.fit(train2X,train2y)
model

OneClassSVM(nu=0.01)

In [25]:
predictions = model.predict(test2X)# Calculate the absolute errors

# Confusion matrix
confusion = pd.crosstab(test2y,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion )

Predicted   -1      1    All
Actual                      
-1         714  13059  13773
1           92   1008   1100
All        806  14067  14873


In [26]:
# Accuracy
score = accuracy_score(test2y,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.115780


In [27]:
# Report
print(classification_report(test2y,predictions))

              precision    recall  f1-score   support

          -1       0.89      0.05      0.10     13773
           1       0.07      0.92      0.13      1100

    accuracy                           0.12     14873
   macro avg       0.48      0.48      0.12     14873
weighted avg       0.83      0.12      0.10     14873



#### Read FTR

In [28]:
#read ftr
data3 = pd.read_feather( '../output/dataframe1.ftr', columns=None, use_threads=True)

In [29]:
data3_y = data3['polarityClass']
data3 = data3.drop(columns=['polarityClass']) 
data3.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,maxes,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,3,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,3,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
train3X, test3X, train3y, test3y = train_test_split(data3, data3_y, test_size=0.3, random_state=2)

In [31]:
train3y = train3y.astype('int8')
print(train3y.dtypes)

int8


In [32]:
test3y = test3y.astype('int8')
print(test3y.dtypes)

int8


In [33]:
train3y[train3y != 2] = -1 #anomaly
train3y[train3y == 2] = 1 #normal

test3y[test3y != 2] = -1 #anomaly
test3y[test3y == 2] = 1 #normal

In [34]:
test3y.value_counts()

 1    11732
-1     3141
Name: polarityClass, dtype: int64

In [35]:
train3y.value_counts()

 1    27511
-1     7192
Name: polarityClass, dtype: int64

### Create and fit the model CLASS 2

In [36]:
# Create model with best parameters
model = OneClassSVM(gamma='scale', nu=0.01)

# Fit the best model
model.fit(train3X,train3y)
model

OneClassSVM(nu=0.01)

In [37]:
predictions = model.predict(test3X)# Calculate the absolute errors

# Confusion matrix
confusion = pd.crosstab(test3y,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion )

Predicted   -1      1    All
Actual                      
-1         223   2918   3141
1          583  11149  11732
All        806  14067  14873


In [38]:
# Accuracy
score = accuracy_score(test3y,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.764607


In [39]:
# Report
print(classification_report(test3y,predictions))

              precision    recall  f1-score   support

          -1       0.28      0.07      0.11      3141
           1       0.79      0.95      0.86     11732

    accuracy                           0.76     14873
   macro avg       0.53      0.51      0.49     14873
weighted avg       0.68      0.76      0.71     14873

