In [1]:
import numpy as np
# import pandas as pd

In [2]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

In [4]:
import pickle

In [5]:
import pprint
import os
import sys
sys.path.append('../')
from src.load_data import load_data

## Load Data

In [6]:
train_data, valid_data, test_data, metadata = load_data(project_folder='/scratch/sekiz/Master_Thesis')

In [7]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [8]:
scoring = 'f1_macro'
n_jobs=30


In [9]:
tra_sents = np.array([sentence['sentence']
     for article in train_data
     for sentence in article['sentences']])
y_tra = np.array([sentence['label'] for article in train_data for sentence in article['sentences']])

In [10]:
opt_sents = np.array([sentence['sentence']
     for article in (train_data + valid_data)
     for sentence in article['sentences']])

In [11]:
y_opt = np.array([sentence['label'] for article in (train_data + valid_data) for sentence in article['sentences']])

In [12]:
test_sents =  np.array([sentence['sentence']
     for article in test_data
     for sentence in article['sentences']])

In [13]:
y_test = np.array([sentence['label'] for article in test_data for sentence in article['sentences']])

In [14]:
len(tra_sents), len(y_tra), len(test_sents), len(y_test)

(3582, 3582, 441, 441)

# Feature Extraction

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.001
    max_df: 0.6
    stop_words: num_stopwords

In [16]:
vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.6, stop_words=number_stopwords)
tfidf_vectors = vectorizer.fit_transform(opt_sents)

In [17]:
tra_vectors = vectorizer.transform(tra_sents)
test_vectors = vectorizer.transform(test_sents)

#### Saving Feature Vectors

In [18]:
import pickle

In [19]:
feature_path = 'Data/features_mindf_001_maxdf_6_number_stopwords.pickle'

In [20]:
with open(feature_path, 'wb') as file_:
    pickle.dump(tfidf_vectors, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
tfidf_vectors.shape

(3981, 3023)

# Classifier Training
- With hyper-parameter optimization

In [22]:
opt_results = {}
opt_results_path = 'Results/new_results_tfidf_only.pickle'

### Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier


In [41]:
classifier = DecisionTreeClassifier(criterion='gini')

# Decision Tree
params = {
    'max_depth': [None] + [*range(15, 35, 5)],
    'min_samples_split': [*range(50, 200, 40)],
    'min_samples_leaf': [*range(5, 14, 2)],
    'max_features': [None, 'sqrt', 'log2']
}

dt_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
dt_clf = dt_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(dt_clf.best_estimator_)
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)

Best Score
0.7285093845062899
Best Params
{'min_samples_split': 170, 'max_features': None, 'max_depth': None, 'min_samples_leaf': 5}


In [46]:
DT = DecisionTreeClassifier(criterion='gini', **dt_clf.best_params_)

In [48]:
DT.fit(tra_vectors, y_tra)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=170,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [49]:
y_pred = DT.predict(test_vectors)

In [50]:
opt_results['DecisionTree'] = {}
opt_results['DecisionTree']['GridSearchCV'] = dt_clf
opt_results['DecisionTree']['classif_report'] = classification_report(y_test, y_pred)

In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       325
           1       0.71      0.44      0.54       116

    accuracy                           0.80       441
   macro avg       0.77      0.69      0.71       441
weighted avg       0.79      0.80      0.79       441



In [52]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### RandomForestClassifier

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
classifier = RandomForestClassifier(criterion='gini')

params = {
    'n_estimators': [30, 70, 100, 150], 
    'max_depth': [None] + [*range(65, 120, 15)], 
    'min_samples_split': [25, 30, 40, 45, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
rf_clf = rf_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(rf_clf.best_estimator_)
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)

Best Score
0.7374745064057092
Best Params
{'min_samples_split': 25, 'max_features': 'sqrt', 'max_depth': None, 'n_estimators': 70, 'bootstrap': False}


In [56]:
RF = RandomForestClassifier(criterion='gini', **rf_clf.best_params_)

In [57]:
RF.fit(tra_vectors, y_tra)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=25,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [58]:
y_pred = RF.predict(test_vectors)

In [60]:
opt_results['RandomForest'] = {}
opt_results['RandomForest']['GridSearchCV'] = rf_clf
opt_results['RandomForest']['classif_report'] = classification_report(y_test, y_pred)

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.97      0.88       325
           1       0.80      0.37      0.51       116

    accuracy                           0.81       441
   macro avg       0.80      0.67      0.69       441
weighted avg       0.81      0.81      0.78       441



In [62]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

###  SVC

In [63]:
from sklearn.svm import SVC

In [64]:
classifier =  SVC()
params = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'C': [0.025, 0.25, 0.5, 1, 2, 3],
}

svc_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(svc_clf.best_estimator_)
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)

Best Score
0.7574574723616575
Best Params
{'kernel': 'linear', 'C': 2}


In [65]:
SV = SVC(**svc_clf.best_params_)

In [66]:
SV.fit(tra_vectors, y_tra)

SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [68]:
y_pred = SV.predict(test_vectors)

In [69]:
opt_results['SVC'] = {}
opt_results['SVC']['GridSearchCV'] = svc_clf
opt_results['SVC']['classif_report'] = classification_report(y_test, y_pred)

In [70]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.91      0.86       325
           1       0.63      0.45      0.53       116

    accuracy                           0.79       441
   macro avg       0.73      0.68      0.69       441
weighted avg       0.77      0.79      0.77       441



In [71]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### MLPClassifier

In [72]:
from sklearn.neural_network import MLPClassifier

In [None]:
classifier = MLPClassifier()

params = {
    'hidden_layer_sizes': [(10,5), (20,10), (20), (30,20)], 
    'activation': ['tanh', 'relu'], 
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.01, 0.001, 0.1],
    'max_iter': [50, 200, 400]
}
                        
mlp_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
mlp_clf = mlp_clf.fit(tfidf_vectors, y_opt)



# print('Best Estimator')
# print(mlp_clf.best_estimator_)
print('Best Score')
print(mlp_clf.best_score_)
print('Best Params')
print(mlp_clf.best_params_)

In [None]:
ML = MLPClassifier(**mlp_clf.best_params_)

In [None]:
ML.fit(tra_vectors, y_tra)

In [None]:
y_pred = ML.predict(test_vectors)

In [None]:
opt_results['MLP'] = {}
opt_results['MLP']['GridSearchCV'] = mlp_clf
opt_results['MLP']['classif_report'] = classification_report(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

# Read the pickle

In [21]:
with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)


In [22]:
opt_results.keys()

dict_keys(['RandomForest', 'DecisionTree', 'MLP', 'SVC'])