In [29]:
import pandas as pd
import numpy as np
import json
import random
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier,EasyEnsembleClassifier,RUSBoostClassifier
from gensim.parsing.porter import PorterStemmer


In [30]:
with open('data_full.json', 'r') as file:
    all_data = json.load(file)
#     print(len(dic))
    
oos_train_df = pd.DataFrame(all_data["oos_train"])
oos_val_df = pd.DataFrame(all_data["oos_val"])
oos_test_df = pd.DataFrame(all_data["oos_test"])

ins_train_df = pd.DataFrame(all_data["train"])
ins_val_df = pd.DataFrame(all_data["val"])
ins_test_df = pd.DataFrame(all_data["test"])
# ins_train_df.describe()

In [31]:
seed = 2
random.seed(a=seed)
arr = random.sample([i for i in range(150)], k=20)
names = pd.unique(ins_train_df[1])
picked = [names[i] for i in arr]
# print(names)
in_train = ins_train_df.loc[ins_train_df[1].isin(picked)]
in_train.describe()

in_test = ins_test_df.loc[ins_test_df[1].isin(picked)]
in_val = ins_val_df.loc[ins_val_df[1].isin(picked)]

In [32]:
train = pd.concat([in_train, oos_train_df])
test = pd.concat([in_test, oos_test_df])
val = pd.concat([in_val, oos_val_df])

In [33]:
from gensim.utils import simple_preprocess
train['token'] = [simple_preprocess(line, deacc=True) for line in train[0]] 
test['token'] = [simple_preprocess(line, deacc=True) for line in test[0]] 
val['token'] = [simple_preprocess(line, deacc=True) for line in val[0]] 

In [34]:
in_train.head(10)
input_features, y_train = train['token'], train[1]
X_val, y_val = val['token'], val[1]
X_test, y_test = test['token'], test[1]

In [35]:
porter_stemmer = PorterStemmer()
new_input = pd.DataFrame(input_features).reset_index(drop=True)
a = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_input['token']]
# blah

new_test = pd.DataFrame(X_test).reset_index(drop=True)
b = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_test['token']]

new_val = pd.DataFrame(X_val).reset_index(drop=True)
c = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_val['token']]
# str(new_input[0][0]).split(" ")
# porter_stemmer.stem("alerts")

In [36]:
train_features = [' '.join(first) for first in a]
# input_features[0]

test_features = [' '.join(first) for first in b]

val_features = [' '.join(first) for first in c]

In [37]:
# vectorizer = CountVectorizer(analyzer=lambda x:x)
vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3))
# X_train
X_train = vectorizer.fit_transform(train_features)

In [38]:
X_val = vectorizer.transform(val_features)
X_test = vectorizer.transform(test_features)

In [39]:

le = preprocessing.LabelEncoder()

le.fit(y_train)
encoded_y = le.transform(y_train)
# np.unique(encoded_y)

In [40]:
y_true_val = le.transform(y_val)
y_true_test = le.transform(y_test)

In [41]:
svc_l = SVC(kernel='linear')
#Rbf kernel
svc_r = SVC(kernel='rbf')
##Multinomial Naive Bayes
mnb = MultinomialNB()
###Random Forest
rfc = RandomForestClassifier()

In [42]:
models = [(svc_l, "SVM - Linear"), (svc_r, "SVM - RBF kernel"), (mnb, "Multinomial Naive Bayes"), (rfc, "Random Forest Classifier")]
# print("\t Recall\t Precision")
for model, name in models:
    model.fit(X_train, encoded_y)
    y_pred = model.predict(X_val)
    y_test = model.predict(X_test)
    print(name)
    print(classification_report(y_true_test, y_test, labels=[1,0], digits=4))
#     print(recall_test_score, prec_test_score)

SVM - Linear
              precision    recall  f1-score   support

           1     0.4902    0.8333    0.6173        30
           0     0.8824    1.0000    0.9375        30

   micro avg     0.6471    0.9167    0.7586        60
   macro avg     0.6863    0.9167    0.7774        60
weighted avg     0.6863    0.9167    0.7774        60

SVM - RBF kernel
              precision    recall  f1-score   support

           1     0.5227    0.7667    0.6216        30
           0     0.8529    0.9667    0.9062        30

   micro avg     0.6667    0.8667    0.7536        60
   macro avg     0.6878    0.8667    0.7639        60
weighted avg     0.6878    0.8667    0.7639        60

Multinomial Naive Bayes
              precision    recall  f1-score   support

           1     0.2727    0.8000    0.4068        30
           0     0.2857    1.0000    0.4444        30

   micro avg     0.2798    0.9000    0.4269        60
   macro avg     0.2792    0.9000    0.4256        60
weighted avg     0.2