In [1]:
import numpy as np
# import pandas as pd

In [2]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

In [4]:
import pickle

In [5]:
import pprint
import os
import sys
sys.path.append('../')
from src.load_data import load_data

## Load Data

In [6]:
train_data, valid_data, test_data, metadata = load_data()

In [7]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [8]:
scoring = 'f1_macro'
n_jobs=10


In [9]:
tra_sents = np.array([sentence['sentence']
     for article in train_data
     for sentence in article['sentences']])
y_tra = np.array([sentence['label'] for article in train_data for sentence in article['sentences']])

In [10]:
opt_sents = np.array([sentence['sentence']
     for article in (train_data + valid_data)
     for sentence in article['sentences']])

In [11]:
y_opt = np.array([sentence['label'] for article in (train_data + valid_data) for sentence in article['sentences']])

In [12]:
test_sents =  np.array([sentence['sentence']
     for article in test_data
     for sentence in article['sentences']])

In [13]:
y_test = np.array([sentence['label'] for article in test_data for sentence in article['sentences']])

In [14]:
len(tra_sents), len(y_tra), len(test_sents), len(y_test)

(3582, 3582, 441, 441)

# Feature Extraction

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.001
    max_df: 0.6
    stop_words: num_stopwords

In [16]:
vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.12, stop_words=number_stopwords)
tfidf_vectors = vectorizer.fit_transform(opt_sents)

In [17]:
tra_vectors = vectorizer.transform(tra_sents)
test_vectors = vectorizer.transform(test_sents)

In [18]:
len(vectorizer.vocabulary_)

3010

#### Saving Feature Vectors

In [19]:
import pickle

feature_path = 'Data/features_mindf_001_maxdf_0_12_number_stopwords.pickle'

with open(feature_path, 'wb') as file_:
    pickle.dump(tfidf_vectors, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [20]:
tfidf_vectors.shape

(3981, 3010)

# Classifier Training
- With hyper-parameter optimization

In [21]:
opt_results = {}
# opt_results_path = 'Results/new_results_tfidf_only_mindf_001_maxdf_0_12.pickle'

### Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier


In [24]:
classifier = DecisionTreeClassifier(criterion='gini')

# Decision Tree
params = {
    'max_depth': [None] + [*range(15, 35, 5)],
    'min_samples_split': [*range(50, 200, 40)],
    'min_samples_leaf': [*range(5, 14, 2)],
    'max_features': [None, 'sqrt', 'log2']
}

dt_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
dt_clf = dt_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(dt_clf.best_estimator_)
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)

Best Score
0.7579141422864226
Best Params
{'max_depth': None, 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 170}


In [25]:
DT = DecisionTreeClassifier(criterion='gini', **dt_clf.best_params_)

In [26]:
DT.fit(tra_vectors, y_tra)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=170,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
y_pred = DT.predict(test_vectors)

In [28]:
opt_results['DecisionTree'] = {}
opt_results['DecisionTree']['GridSearchCV'] = dt_clf
opt_results['DecisionTree']['classif_report'] = classification_report(y_test, y_pred)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       325
           1       0.79      0.60      0.68       116

   micro avg       0.85      0.85      0.85       441
   macro avg       0.83      0.77      0.79       441
weighted avg       0.85      0.85      0.85       441



with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### RandomForestClassifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:
classifier = RandomForestClassifier(criterion='gini')

params = {
    'n_estimators': [30, 50, 70, 100, 150], 
    'max_depth': [None] + [*range(65, 120, 15)], 
    'min_samples_split': [10, 20, 25, 30, 40, 45, 50, 100],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
rf_clf = rf_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(rf_clf.best_estimator_)
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)

Best Score
0.7629026539994425
Best Params
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 100, 'n_estimators': 50}


In [24]:
RF = RandomForestClassifier(criterion='gini', **rf_clf.best_params_, random_state=0)

In [25]:
RF.fit(tra_vectors, y_tra)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
y_pred = RF.predict(test_vectors)

In [27]:
opt_results['RandomForest'] = {}
opt_results['RandomForest']['GridSearchCV'] = rf_clf
opt_results['RandomForest']['classif_report'] = classification_report(y_test, y_pred)

In [28]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88       325
           1       0.71      0.46      0.55       116

   micro avg       0.81      0.81      0.81       441
   macro avg       0.77      0.69      0.72       441
weighted avg       0.80      0.81      0.79       441



with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

###  SVC

In [37]:
from sklearn.svm import SVC

In [38]:
classifier =  SVC()
params = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'C': [0.025, 0.25, 0.5, 1, 2, 3, 4, 5],
}

svc_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(svc_clf.best_estimator_)
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)

Best Score
0.749810978944412
Best Params
{'C': 3, 'kernel': 'linear'}


In [39]:
SV = SVC(**svc_clf.best_params_)

In [40]:
SV.fit(tra_vectors, y_tra)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [41]:
y_pred = SV.predict(test_vectors)

In [42]:
opt_results['SVC'] = {}
opt_results['SVC']['GridSearchCV'] = svc_clf
opt_results['SVC']['classif_report'] = classification_report(y_test, y_pred)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       325
           1       0.64      0.47      0.54       116

   micro avg       0.79      0.79      0.79       441
   macro avg       0.73      0.69      0.70       441
weighted avg       0.78      0.79      0.78       441



with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### MLPClassifier

In [48]:
from sklearn.neural_network import MLPClassifier

In [49]:
classifier = MLPClassifier()

params = {
    'hidden_layer_sizes': [(30), (50), (60), (70), (80), (100), (20)], 
    'activation': ['relu'], 
    'learning_rate': ['constant', 'adaptive'],
    'learning_rate_init': [0.01, 0.001, 0.1],
    'max_iter': [50, 100, 200, 300, 400, 500]
}
                        
mlp_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
mlp_clf = mlp_clf.fit(tfidf_vectors, y_opt)



# print('Best Estimator')
# print(mlp_clf.best_estimator_)
print('Best Score')
print(mlp_clf.best_score_)
print('Best Params')
print(mlp_clf.best_params_)

KeyboardInterrupt: 

In [None]:
ML = MLPClassifier(**mlp_clf.best_params_, random_state=42)

In [None]:
ML.fit(tra_vectors, y_tra)

In [None]:
y_pred = ML.predict(test_vectors)

In [None]:
opt_results['MLP'] = {}
opt_results['MLP']['GridSearchCV'] = mlp_clf
opt_results['MLP']['classif_report'] = classification_report(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

# Read the pickle

with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)


In [70]:
opt_results.keys()

dict_keys(['DecisionTree', 'RandomForest', 'SVC', 'MLP'])