In [23]:
import numpy as np
# import pandas as pd

In [24]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

In [26]:
import pickle

In [27]:
import pprint
import os
import sys
sys.path.append('../')
from src.load_data import load_data

### Load Optimized Classifiers

In [115]:
opt_results = {}
opt_results_path = 'Results/results_070919_tfidf_only_mindf_001_maxdf_0_12_numberstoplwords.pickle'
contextual_classifiers_path = 'Results/results_110919_contextual_scikit.pickle'

In [154]:
with open(contextual_classifiers_path, 'rb') as file_:
    opt_results = pickle.load(file_)

opt_results.keys()

dict_keys(['single_sentence', 'contextual_'])

## Load Data

In [185]:
train_data, valid_data, test_data, metadata = load_data(project_folder='./../')

In [186]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [187]:
scoring = 'f1_macro'
n_jobs=5

In [188]:
tra_sents = np.array([sentence['sentence']
     for article in train_data
     for sentence in article['sentences']])
y_tra = np.array([sentence['label'] for article in train_data for sentence in article['sentences']])

In [189]:
test_sents =  np.array([sentence['sentence']
     for article in test_data
     for sentence in article['sentences']])

In [190]:
y_test = np.array([sentence['label'] for article in test_data for sentence in article['sentences']])

In [191]:
len(tra_sents), len(y_tra), len(test_sents), len(y_test)

(3582, 3582, 441, 441)

In [192]:
def get_contextual_sentences(data_, ww):

    X = []
    for article in data_:
        sent_list = article['sentences']
        for i, sentence in enumerate(sent_list):
            merged_sents = [sent_list[i]['sentence'].replace('\n', '').strip()]
            for w in range(1, ww+1):
                if i - w >= 0:
                    merged_sents = [sent_list[i-w]['sentence'].replace('\n', '').strip()] + merged_sents
                if i + w < len(sent_list):
                    merged_sents.append(sent_list[i+w]['sentence'].replace('\n', '').strip())
            X.append(' '.join(merged_sents))

    return np.array(X)

In [249]:
ww=5
X_tra = get_contextual_sentences(train_data, ww)
X_val = get_contextual_sentences(valid_data, ww)
X_test = get_contextual_sentences(test_data, ww)

In [250]:
len(X_tra[5])

1891

In [251]:
def get_y(data_, to_categorize):

    y = np.array([sentence['label'] for article in data_ for sentence in article['sentences']])
    if to_categorize:
        y = to_categorical(y)
    
    return y

In [252]:
y_tra = get_y(train_data, False)
y_val = get_y(valid_data, False)
y_test = get_y(test_data, False)

In [253]:
X_tra_val = np.concatenate((X_tra, X_val))

# Feature Extraction

In [254]:
from sklearn.feature_extraction.text import TfidfVectorizer

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.001
    max_df: 0.6
    stop_words: num_stopwords

In [255]:
vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.12, stop_words=number_stopwords)
tfidf_vectors = vectorizer.fit_transform(X_tra_val)

In [256]:
tra_vectors = vectorizer.transform(X_tra)
test_vectors = vectorizer.transform(X_test)

In [257]:
len(vectorizer.vocabulary_)

10642

#### Saving Feature Vectors

In [258]:
import pickle

feature_path = 'Data/features_mindf_0_001_maxdf_0_12_number_stopwords.pickle'

with open(feature_path, 'wb') as file_:
    pickle.dump(tfidf_vectors, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [259]:
tfidf_vectors.shape

(3981, 10642)

# Classifier Training
- With hyper-parameter optimization

### Decision Tree

In [260]:
from sklearn.tree import DecisionTreeClassifier

In [261]:
dt_clf = opt_results['single_sentence']['DecisionTree']['GridSearchCV']
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)


Best Score
0.7580231977602293
Best Params
{'min_samples_split': 170, 'max_features': None, 'max_depth': None, 'min_samples_leaf': 5}


In [262]:
DT = DecisionTreeClassifier(criterion='gini', **dt_clf.best_params_, random_state=5)

In [263]:
DT.fit(tra_vectors, y_tra)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=170,
            min_weight_fraction_leaf=0.0, presort=False, random_state=5,
            splitter='best')

In [264]:
y_pred = DT.predict(test_vectors)

In [247]:
# with window size ww=1 prevcurrnext
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82       381
           1       0.22      0.43      0.30        60

   micro avg       0.72      0.72      0.72       441
   macro avg       0.56      0.60      0.56       441
weighted avg       0.80      0.72      0.75       441



In [148]:
# with window size ww=2 prevprevcurrnextnext
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83       363
           1       0.34      0.50      0.40        78

   micro avg       0.74      0.74      0.74       441
   macro avg       0.61      0.64      0.62       441
weighted avg       0.78      0.74      0.76       441



In [243]:
# with window size ww=3 3prevcurr3next
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.79      0.83       360
           1       0.34      0.49      0.41        81

   micro avg       0.73      0.73      0.73       441
   macro avg       0.61      0.64      0.62       441
weighted avg       0.78      0.73      0.75       441



In [265]:
# with window size ww=5 ..5prevcurr5next..
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.82      0.75      0.79       356
           1       0.24      0.33      0.28        85

   micro avg       0.67      0.67      0.67       441
   macro avg       0.53      0.54      0.53       441
weighted avg       0.71      0.67      0.69       441



In [266]:
if 'DecisionTree' not in opt_results['contextual_']:
    opt_results['contextual_']['DecisionTree'] = {}
opt_results['contextual_']['DecisionTree']['window_width_' + str(ww)] = {}
opt_results['contextual_']['DecisionTree']['window_width_' + str(ww)]['classif_report'] = classification_report(y_pred, y_test)
opt_results['contextual_']['DecisionTree']['window_width_' + str(ww)]['classifier'] = DT

In [267]:
opt_results['contextual_']['DecisionTree'].keys()

dict_keys(['window_width_1', 'window_width_2', 'window_width_3', 'window_width_5'])

In [268]:
with open(contextual_classifiers_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### RandomForestClassifier

In [33]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = RandomForestClassifier(criterion='gini')


params = {
    'n_estimators': [50] + [*range(100, 1000, 100)], 
    'max_depth': [None] + [*range(65, 120, 5)], 
    'min_samples_split': [10, 20, 30, 40, 45, 50, 100, 120, 150],
    'max_features': ['sqrt', 'log2', int(3010*0.1), int(3010*0.2), int(3010*0.3)],
    'bootstrap': [True, False]
}
# params = {
#     'n_estimators': [30, 50, 70, 100, 150], 
#     'max_depth': [None] + [*range(65, 120, 5)], 
#     'min_samples_split': [10, 20, 25, 30, 40, 45, 50, 100, 120, 150],
#     'max_features': ['sqrt', 'log2'],
#     'bootstrap': [True, False]
# }

rf_clf = GridSearchCV(classifier, params, cv=5, refit=False, scoring=scoring, n_jobs=n_jobs)
rf_clf = rf_clf.fit(tfidf_vectors, y_opt)

# print('Best Estimator')
# print(rf_clf.best_estimator_)
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)

In [None]:
RF = RandomForestClassifier(criterion='gini', **rf_clf.best_params_, random_state=0)

In [None]:
RF.fit(tra_vectors, y_tra)

In [None]:
y_pred = RF.predict(test_vectors)

In [None]:
opt_results['RandomForest'] = {}
opt_results['RandomForest']['GridSearchCV'] = rf_clf
opt_results['RandomForest']['classif_report'] = classification_report(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))