In [20]:
import numpy as np
# import pandas as pd

In [21]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score as scikit_f1_score

In [23]:
import pickle

In [24]:
import pprint
import os
import sys
sys.path.append('../')
from src.load_data import load_data

## Load Data

In [7]:
train_data, valid_data, test_data, metadata = load_data('./..')

In [8]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [9]:
scoring = 'f1_macro'
n_jobs=10


In [10]:
tra_sents = np.array([sentence['sentence']
     for article in train_data
     for sentence in article['sentences']])
y_tra = np.array([sentence['label'] for article in train_data for sentence in article['sentences']])

In [11]:
opt_sents = np.array([sentence['sentence']
     for article in (train_data + valid_data)
     for sentence in article['sentences']])

In [12]:
y_opt = np.array([sentence['label'] for article in (train_data + valid_data) for sentence in article['sentences']])

In [13]:
test_sents =  np.array([sentence['sentence']
     for article in test_data
     for sentence in article['sentences']])

In [14]:
y_test = np.array([sentence['label'] for article in test_data for sentence in article['sentences']])

In [15]:
len(tra_sents), len(y_tra), len(test_sents), len(y_test)

(3582, 3582, 441, 441)

# Feature Extraction

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.12, stop_words=number_stopwords)
tfidf_vectors = vectorizer.fit_transform(opt_sents)

In [18]:
tra_vectors = vectorizer.transform(tra_sents)
test_vectors = vectorizer.transform(test_sents)

In [19]:
len(vectorizer.vocabulary_)

3010

#### Saving Feature Vectors

In [19]:
import pickle

In [20]:
feature_path = 'Data/070919_features_mindf_0_001_maxdf_0_12_number_stopwords.pickle'

In [24]:
with open(feature_path, 'wb') as file_:
    pickle.dump(tfidf_vectors, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
tfidf_vectors.shape

(3981, 3010)

# Classifier Training
- With hyper-parameter optimization

In [25]:
opt_results = {}
opt_results_path = 'Results/results_070919_tfidf_only_mindf_001_maxdf_0_12_numberstoplwords.pickle'

### Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier


In [24]:
classifier = DecisionTreeClassifier(criterion='gini')

# Decision Tree
params = {
    'max_depth': [None] + [*range(15, 35, 5)],
    'min_samples_split': [*range(50, 200, 20)],
    'min_samples_leaf': [*range(3, 14, 2)],
    'max_features': [None, 'sqrt', 'log2']
}

dt_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
dt_clf = dt_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(dt_clf.best_estimator_)
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)

Best Score
0.757883389796395
Best Params
{'max_features': None, 'min_samples_leaf': 5, 'max_depth': None, 'min_samples_split': 150}


In [25]:
DT = DecisionTreeClassifier(criterion='gini', **dt_clf.best_params_)

In [26]:
DT.fit(tra_vectors, y_tra)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=150,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [27]:
y_pred = DT.predict(test_vectors)

In [28]:
opt_results['DecisionTree'] = {}
opt_results['DecisionTree']['GridSearchCV'] = dt_clf
opt_results['DecisionTree']['classif_report'] = classification_report(y_test, y_pred)

In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       325
           1       0.79      0.60      0.68       116

   micro avg       0.85      0.85      0.85       441
   macro avg       0.83      0.77      0.79       441
weighted avg       0.85      0.85      0.85       441



In [None]:
scikit_f1_score(y_test, y_pred, 'macro')

In [30]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### RandomForestClassifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier(criterion='gini')


params = {
    'n_estimators': [50] + [*range(100, 1000, 100)], 
    'max_depth': [None] + [*range(65, 115, 15)], 
    'min_samples_split': [10, 20, 30, 40, 50, 100, 120, 150],
    'max_features': ['sqrt', 'log2', 0.1, 0.2],
    'bootstrap': [True, False]
}
# params = {
#     'n_estimators': [30, 50, 70, 100, 150], 
#     'max_depth': [None] + [*range(65, 120, 5)], 
#     'min_samples_split': [10, 20, 25, 30, 40, 45, 50, 100, 120, 150],
#     'max_features': ['sqrt', 'log2'],
#     'bootstrap': [True, False]
# }

rf_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
rf_clf = rf_clf.fit(tfidf_vectors, y_opt)

In [59]:
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)

Best Score
0.7732871525956247
Best Params
{'n_estimators': 300, 'max_depth': None, 'max_features': 0.2, 'bootstrap': True, 'min_samples_split': 10}


In [60]:
RF = RandomForestClassifier(criterion='gini', **rf_clf.best_params_, random_state=0)

In [61]:
RF.fit(tra_vectors, y_tra)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [62]:
y_pred = RF.predict(test_vectors)

In [38]:
opt_results['RandomForest'] = {}
opt_results['RandomForest']['GridSearchCV'] = rf_clf
opt_results['RandomForest']['classif_report'] = classification_report(y_test, y_pred)

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90       325
           1       0.76      0.59      0.67       116

   micro avg       0.84      0.84      0.84       441
   macro avg       0.81      0.76      0.78       441
weighted avg       0.84      0.84      0.84       441



In [65]:
scikit_f1_score(y_test, y_pred, average='macro')

0.7822222222222223

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

###  SVC

In [50]:
from sklearn.svm import SVC

In [None]:
classifier =  SVC()
params = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'C': [0.025, 0.25, 0.5, 1, 2, 3, 4, 5, 6, 7],
}

svc_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(tfidf_vectors, y_opt)

In [48]:
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)

Best Score
0.749810978944412
Best Params
{'kernel': 'linear', 'C': 3}


In [51]:
SV = SVC(**svc_clf.best_params_, random_state=0)

In [52]:
SV.fit(tra_vectors, y_tra)

SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [53]:
y_pred = SV.predict(test_vectors)

In [None]:
opt_results['SVC'] = {}
opt_results['SVC']['GridSearchCV'] = svc_clf
opt_results['SVC']['classif_report'] = classification_report(y_test, y_pred)

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       325
           1       0.64      0.47      0.54       116

   micro avg       0.79      0.79      0.79       441
   macro avg       0.73      0.69      0.70       441
weighted avg       0.78      0.79      0.78       441



In [55]:
scikit_f1_score(y_test, y_pred, average='macro')

0.7025513196480938

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

# Read the pickle

In [56]:
with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)



In [57]:
opt_results.keys()

dict_keys(['RandomForest', 'DecisionTree', 'SVC'])