In [8]:
import pandas as pd
import numpy as np
import pickle

connection = pickle.load(open('../data/connection_clean.pkl', 'rb'))
subject = pickle.load(open('../data/subject_clean.pkl', 'rb'))
objective = pickle.load(open('../data/objective_clean.pkl', 'rb'))

In [9]:
# TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
connection_tfidf = tfidf.fit_transform(connection['text'])
subject_tfidf = tfidf.fit_transform(subject['text'])
objective_tfidf = tfidf.fit_transform(objective['text'])

# append tfidf to dataframe
# connection['tfidf'] = list(connection_tfidf.toarray())
# subject['tfidf'] = list(subject_tfidf.toarray())
# objective['tfidf'] = list(objective_tfidf.toarray())

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

def make_logistic(x_train, x_test, y_train, y_test, _C = 1.0, multi = False):
    if multi:
        clf = MultiOutputClassifier(LogisticRegression(C=_C, solver='lbfgs', max_iter=1000))
    else:
        clf = LogisticRegression(C=_C, solver='lbfgs', max_iter=1000)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)
    print(classification_report(y_test, y_pred))
    return y_pred, y_pred_proba

In [23]:
from sklearn.ensemble import RandomForestClassifier

def make_rf(x_train, x_test, y_train, y_test, n_estimators=100, multi = False):
    if multi:
        clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=n_estimators))
    else:
        clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)
    print(classification_report(y_test, y_pred))
    return y_pred, y_pred_proba

In [24]:
from sklearn.svm import SVC

def make_svm(x_train, x_test, y_train, y_test, _C = 1.0, multi = False):
    if multi:
        clf = MultiOutputClassifier(SVC(C=_C, probability=True))
    else:
        clf = SVC(C=_C, probability=True)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)
    print(classification_report(y_test, y_pred))
    return y_pred, y_pred_proba

In [25]:
from sklearn.neighbors import KNeighborsClassifier

def make_knn(x_train, x_test, y_train, y_test, n_neighbors=5, multi = False):
    if multi:
        clf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=n_neighbors))
    else:
        clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_proba = clf.predict_proba(x_test)
    print(classification_report(y_test, y_pred))
    return y_pred, y_pred_proba

---

### CONNECTION

In [14]:
X = connection_tfidf
y = connection['connection']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
print('connection: LR')
y_pred_LR, y_pred_proba_LR = make_logistic(x_train, x_test, y_train, y_test)
print('connection: RF')
y_pred_RF, y_pred_proba_RF = make_rf(x_train, x_test, y_train, y_test)
print('connection: SVM')
y_pred_SVM, y_pred_proba_SVM = make_svm(x_train, x_test, y_train, y_test)

connection: LR
              precision    recall  f1-score   support

  Disclosure       1.00      0.03      0.07        29
 Inquisition       0.56      1.00      0.72        36

    accuracy                           0.57        65
   macro avg       0.78      0.52      0.39        65
weighted avg       0.76      0.57      0.43        65

connection: RF
              precision    recall  f1-score   support

  Disclosure       1.00      0.10      0.19        29
 Inquisition       0.58      1.00      0.73        36

    accuracy                           0.60        65
   macro avg       0.79      0.55      0.46        65
weighted avg       0.77      0.60      0.49        65

connection: SVM
              precision    recall  f1-score   support

  Disclosure       1.00      0.03      0.07        29
 Inquisition       0.56      1.00      0.72        36

    accuracy                           0.57        65
   macro avg       0.78      0.52      0.39        65
weighted avg       0.76     

In [15]:
# tune hyperparameters
from sklearn.model_selection import GridSearchCV

# LR
parameters = {'C':[0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_LR, y_pred_proba_LR = make_logistic(x_train, x_test, y_train, y_test, _C=clf.best_params_['C'])


{'C': 100}
0.6310344827586206
              precision    recall  f1-score   support

  Disclosure       1.00      0.24      0.39        29
 Inquisition       0.62      1.00      0.77        36

    accuracy                           0.66        65
   macro avg       0.81      0.62      0.58        65
weighted avg       0.79      0.66      0.60        65



In [16]:
# RF
parameters = {'n_estimators':[10, 100, 1000]}
clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_RF, y_pred_proba_RF = make_rf(x_train, x_test, y_train, y_test, n_estimators=clf.best_params_['n_estimators'])

{'n_estimators': 10}
0.6379310344827587
              precision    recall  f1-score   support

  Disclosure       0.80      0.14      0.24        29
 Inquisition       0.58      0.97      0.73        36

    accuracy                           0.60        65
   macro avg       0.69      0.56      0.48        65
weighted avg       0.68      0.60      0.51        65



In [17]:
# SVM
parameters = {'C':[0.1, 1, 10, 100]}
clf = GridSearchCV(SVC(probability=True), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_SVM, y_pred_proba_SVM = make_svm(x_train, x_test, y_train, y_test, _C=clf.best_params_['C'])

{'C': 10}
0.5705747126436782
              precision    recall  f1-score   support

  Disclosure       1.00      0.10      0.19        29
 Inquisition       0.58      1.00      0.73        36

    accuracy                           0.60        65
   macro avg       0.79      0.55      0.46        65
weighted avg       0.77      0.60      0.49        65



In [18]:
# knn
parameters = {'n_neighbors':[5, 10, 20]}
clf = GridSearchCV(KNeighborsClassifier(), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_knn, y_pred_proba_knn = make_knn(x_train, x_test, y_train, y_test, n_neighbors=clf.best_params_['n_neighbors'])

{'n_neighbors': 20}
0.6108045977011494
              precision    recall  f1-score   support

  Disclosure       0.52      0.86      0.65        29
 Inquisition       0.76      0.36      0.49        36

    accuracy                           0.58        65
   macro avg       0.64      0.61      0.57        65
weighted avg       0.66      0.58      0.56        65



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


---

## SUBJECT

In [26]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
subject['subject'] = le.fit_transform(subject['subject'])

In [28]:
X = subject_tfidf
y = subject['subject']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=subject['subject'])
print('subject: LR')
y_pred_LR, y_pred_proba_LR = make_logistic(x_train, x_test, y_train, y_test)
print('subject: RF')
y_pred_RF, y_pred_proba_RF = make_rf(x_train, x_test, y_train, y_test)
print('subject: SVM')
y_pred_SVM, y_pred_proba_SVM = make_svm(x_train, x_test, y_train, y_test)

subject: LR


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.52      1.00      0.69        34
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         5

    accuracy                           0.52        65
   macro avg       0.13      0.25      0.17        65
weighted avg       0.27      0.52      0.36        65

subject: RF
              precision    recall  f1-score   support

           0       0.52      1.00      0.69        34
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         5

    accuracy                           0.52        65
   macro avg       0.13      0.25      0.17        65
weighted avg       0.27      0.52      0.36        65

subject: SVM
              precision    recall  f1-score   support

           0       0.52      1.00      0.69       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
# tune
# LR
parameters = {'C':[0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_LR, y_pred_proba_LR = make_logistic(x_train, x_test, y_train, y_test, _C=clf.best_params_['C'])

{'C': 100}
0.5501149425287356
              precision    recall  f1-score   support

           0       0.52      0.97      0.67        34
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         5

    accuracy                           0.51        65
   macro avg       0.13      0.24      0.17        65
weighted avg       0.27      0.51      0.35        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
parameters = {'n_estimators':[10, 100, 1000]}
clf = GridSearchCV(RandomForestClassifier(), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_RF, y_pred_proba_RF = make_rf(x_train, x_test, y_train, y_test, n_estimators=clf.best_params_['n_estimators'])

{'n_estimators': 100}
0.543448275862069
              precision    recall  f1-score   support

           0       0.52      1.00      0.69        34
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         5

    accuracy                           0.52        65
   macro avg       0.13      0.25      0.17        65
weighted avg       0.27      0.52      0.36        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
parameters = {'C':[0.1, 1, 10, 100]}
clf = GridSearchCV(SVC(probability=True), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_SVM, y_pred_proba_SVM = make_svm(x_train, x_test, y_train, y_test, _C=clf.best_params_['C'])

{'C': 1}
0.543448275862069
              precision    recall  f1-score   support

           0       0.52      1.00      0.69        34
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00        22
           3       0.00      0.00      0.00         5

    accuracy                           0.52        65
   macro avg       0.13      0.25      0.17        65
weighted avg       0.27      0.52      0.36        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# knn
parameters = {'n_neighbors':[5, 10, 15, 20]}
clf = GridSearchCV(KNeighborsClassifier(), parameters, cv=5)
clf.fit(x_train, y_train)
print(clf.best_params_)
print(clf.best_score_)
y_pred_KNN, y_pred_proba_KNN = make_knn(x_train, x_test, y_train, y_test, n_neighbors=clf.best_params_['n_neighbors'])

{'n_neighbors': 5}
0.5767816091954022
              precision    recall  f1-score   support

           0       0.57      0.76      0.65        34
           1       0.00      0.00      0.00         4
           2       0.50      0.41      0.45        22
           3       0.00      0.00      0.00         5

    accuracy                           0.54        65
   macro avg       0.27      0.29      0.28        65
weighted avg       0.46      0.54      0.49        65



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mo

---

### OBJECTIVE

In [28]:
X = objective_tfidf
# cols 1-13 are the 13 objectives
y = objective.iloc[:,1:14]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# LR
print('objective: LR')
y_pred_LR, y_pred_proba_LR = make_logistic(x_train, x_test, y_train, y_test, multi=True)
# RF
print('objective: RF')
y_pred_RF, y_pred_proba_RF = make_rf(x_train, x_test, y_train, y_test, multi=True)
# SVM
print('objective: SVM')
y_pred_SVM, y_pred_proba_SVM = make_svm(x_train, x_test, y_train, y_test, multi=True)
# knn
print('objective: KNN')
y_pred_KNN, y_pred_proba_KNN = make_knn(x_train, x_test, y_train, y_test, multi=True)


objective: LR


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       1.00      0.03      0.07        29
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         5
          11       0.00      0.00      0.00         5
          12       0.00      0.00      0.00         2

   micro avg       1.00      0.01      0.02        79
   macro avg       0.08      0.00      0.01        79
weighted avg       0.37      0.01      0.02        79
 samples avg       0.02      0.01      0.01        79

objective: RF


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       1.00      0.03      0.07        29
           3       0.00      0.00      0.00         5
           4       1.00      0.17      0.29         6
           5       0.00      0.00      0.00         7
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00        12
           8       0.00      0.00      0.00         1
           9       1.00      0.50      0.67         2
          10       0.00      0.00      0.00         5
          11       1.00      0.20      0.33         5
          12       0.00      0.00      0.00         2

   micro avg       1.00      0.05      0.10        79
   macro avg       0.31      0.07      0.10        79
weighted avg       0.53      0.05      0.08        79
 samples avg       0.03      0.03      0.03        79

objective: SVM
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
