In [25]:
import pandas as pd
import numpy as np
import json
import random
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier,EasyEnsembleClassifier,RUSBoostClassifier
from gensim.parsing.porter import PorterStemmer

In [26]:
with open('data_full.json', 'r') as file:
    all_data = json.load(file)
#     print(len(dic))
    
oos_train_df = pd.DataFrame(all_data["oos_train"])
oos_val_df = pd.DataFrame(all_data["oos_val"])
oos_test_df = pd.DataFrame(all_data["oos_test"])

ins_train_df = pd.DataFrame(all_data["train"])
ins_val_df = pd.DataFrame(all_data["val"])
ins_test_df = pd.DataFrame(all_data["test"])
# ins_train_df.describe()

In [27]:
seed = 2
random.seed(a=seed)
arr = random.sample([i for i in range(150)], k=20)
names = pd.unique(ins_train_df[1])
picked = [names[i] for i in arr]
# print(names)
in_train = ins_train_df.loc[ins_train_df[1].isin(picked)]
in_train.describe()

in_test = ins_test_df.loc[ins_test_df[1].isin(picked)]
in_val = ins_val_df.loc[ins_val_df[1].isin(picked)]

In [28]:
train = pd.concat([in_train, oos_train_df])
test = pd.concat([in_test, oos_test_df])
val = pd.concat([in_val, oos_val_df])

In [29]:
from gensim.utils import simple_preprocess
train['token'] = [simple_preprocess(line, deacc=True) for line in train[0]] 
test['token'] = [simple_preprocess(line, deacc=True) for line in test[0]] 
val['token'] = [simple_preprocess(line, deacc=True) for line in val[0]] 

In [30]:
in_train.head(10)
input_features, y_train = train['token'], train[1]
X_val, y_val = val['token'], val[1]
X_test, y_test = test['token'], test[1]

In [31]:
porter_stemmer = PorterStemmer()
new_input = pd.DataFrame(input_features).reset_index(drop=True)
a = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_input['token']]
# blah

new_test = pd.DataFrame(X_test).reset_index(drop=True)
b = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_test['token']]

new_val = pd.DataFrame(X_val).reset_index(drop=True)
c = [[porter_stemmer.stem(word) for word in str(text).split(" ")] for text in new_val['token']]
# str(new_input[0][0]).split(" ")
# porter_stemmer.stem("alerts")

In [32]:
train_features = [' '.join(first) for first in a]
# input_features[0]

test_features = [' '.join(first) for first in b]

val_features = [' '.join(first) for first in c]

In [33]:
# vectorizer = CountVectorizer(analyzer=lambda x:x)
vectorizer = CountVectorizer(min_df=1, ngram_range=(1,3))
# X_train
X_train = vectorizer.fit_transform(train_features)

In [34]:
X_val = vectorizer.transform(val_features)
X_test = vectorizer.transform(test_features)

In [35]:
encoded_y = np.where(y_train!='oos', 0, 1)
y_true_val = np.where(y_val!='oos', 0, 1)
y_true_test = np.where(y_test!='oos', 0, 1)

In [36]:
brfc = BalancedRandomForestClassifier(sampling_strategy='not minority',random_state=111)
rusbc = RUSBoostClassifier(base_estimator=LogisticRegression(),sampling_strategy='not minority',random_state=111)
eec = EasyEnsembleClassifier(n_estimators=30,base_estimator=LogisticRegression(),replacement=True,sampling_strategy='not minority',random_state=111)
models = [(brfc,'Balanced Random Forest'),
          (rusbc,'Random Undersampling + Adaboost'),
          (eec,'Easy Ensemble')]

for model, name in models:
    model.fit(X_train, encoded_y)
#     y_pred = model.predict(X_val)
    y_test = model.predict(X_test)
    print(name)
    print(classification_report(y_true_test, y_test, labels=[1,0], digits=4))

Balanced Random Forest
              precision    recall  f1-score   support

           1     0.7739    0.9210    0.8411      1000
           0     0.8073    0.5517    0.6554       600

    accuracy                         0.7825      1600
   macro avg     0.7906    0.7363    0.7483      1600
weighted avg     0.7865    0.7825    0.7715      1600

Random Undersampling + Adaboost
              precision    recall  f1-score   support

           1     0.8182    0.5670    0.6698      1000
           0     0.5226    0.7900    0.6291       600

    accuracy                         0.6506      1600
   macro avg     0.6704    0.6785    0.6494      1600
weighted avg     0.7073    0.6506    0.6545      1600

Easy Ensemble
              precision    recall  f1-score   support

           1     0.8360    0.6780    0.7488      1000
           0     0.5919    0.7783    0.6724       600

    accuracy                         0.7156      1600
   macro avg     0.7139    0.7282    0.7106      1600
weigh