In [82]:
import numpy as np
import pandas as pd
import pickle

In [83]:
from sklearn.feature_selection import SelectPercentile, chi2

In [84]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict

In [85]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [87]:
from scipy.sparse import hstack

In [88]:
from sklearn.feature_selection import SelectKBest

In [89]:
from sklearn.metrics import classification_report

## Load Data

In [90]:
# Read the adjudicated sentence-classification spreadsheet into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [92]:
# Keep only the rows that carry a label.
# `notna()` is the pandas-native null test; unlike np.isnan on a raw array it
# also works when the column is read with object dtype (e.g. None mixed with numbers).
df = df[df['label'].notna()]

In [93]:
# NOTE(review): self-assignment is a no-op and leaves `df` unchanged; it looks
# like the remnant of a removed subsetting step and can be deleted.
df = df

In [95]:
# Part-of-speech tag inventory (not referenced by the other visible cells).
# The second entry used to repeat '-LRB-'; it is now the closing-bracket tag
# '-RRB-', so each tag occurs exactly once and the length stays at 55.
pos_tagset = ['-LRB-', '-RRB-', ',', ':', '\'\'', '""', '#', '``', '$',
              'ADD', 'AFX', 'BES', 'CC', 'CD', 'DT', 'EX', 'FW', 'GW',
              'HVS', 'HYPH', 'IN',
              'JJ', 'JJR', 'JJS', 'LS', 'MD',
              'NFP', 'NIL', 'NN', 'NNS', 'NNP', 'NNPS',
              'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', '_SP',
              'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
              'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX']

In [96]:
len(pos_tagset)

55

In [97]:
df.shape

(8337, 5)

In [98]:
# Label vector taken from the 'label' column, in the same row order as `df`.
y = np.array(df['label'])

In [93]:
### Named Entity Features

import spacy

# spaCy pipeline used by bag_of_entities (read there as a global).
# NOTE(review): whether the 'en' shortcut resolves depends on the installed
# spaCy version and linked models — confirm for the target environment.
nlp = spacy.load('en')

# Entity labels that are counted; the list order fixes the position of each
# count in the vector returned by bag_of_entities.
ner_tagset = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
              'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
              'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

def bag_of_entities(sentence, ner_tagset):
    """Count the named entities of each type found in ``sentence``.

    The module-level spaCy pipeline ``nlp`` is run on ``sentence`` and the
    labels of the detected entity spans are tallied.

    Parameters
    ----------
    sentence : text passed to ``nlp``.
    ner_tagset : sequence of entity labels; its order defines the layout of
        the returned vector.

    Returns
    -------
    numpy.ndarray of floats with one count per entry of ``ner_tagset``.
    Entity labels that do not appear in ``ner_tagset`` are ignored.
    """
    found_labels = [span.label_ for span in nlp(sentence).ents]
    # list.count compares with ==, the same test as an explicit equality scan.
    return np.array([float(found_labels.count(tag)) for tag in ner_tagset])

# One entity-count vector per sentence, in DataFrame row order.
ner_features = [bag_of_entities(sentence, ner_tagset) for sentence in df['sentence']]

# Classifiers

In [98]:
# EXPERIMENT 1
# Optimized TF-IDF (stop words, very rare and very common terms removed) with
# chi2 feature selection, combined with the NER count features.

    #feature_path = 'Data/feature_list_optimized_Tf_idf_ner_features_sparse_matrix.pickle'
    #X = pd.read_pickle(feature_path)

# The classifier has to be created in this cell. It previously existed only as
# a commented-out line, so on a fresh kernel the cross-validation call below
# raised NameError (or reused whatever `classifier` another cell had left behind).
classifier = SVC(C=2, kernel='linear', gamma='auto')

# Drop English stop words, terms occurring in fewer than 0.2% of the sentences
# and terms occurring in more than 95% of them.
vectorizer = TfidfVectorizer(min_df=0.002, max_df=0.95, stop_words='english')
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

# Keep the 80% of TF-IDF features with the highest chi2 score.
# NOTE(review): vectorizer and selector are fitted on every row, including the
# rows later used as held-out folds, so the scores below are optimistic.
selected_tfidf_vectors = SelectPercentile(chi2, percentile=80).fit_transform(tfidf_vectors, y)

# Append the dense NER count columns to the sparse TF-IDF matrix.
X = hstack((selected_tfidf_vectors, np.array(ner_features)))

# Out-of-fold predictions from 5-fold cross-validation.
y_true, y_pred = y, cross_val_predict(classifier, X, y, n_jobs=5, cv=5)

print(classification_report(y_true, y_pred))

print('\nPrecision:', str(precision_score(y_true, y_pred, average='weighted')))
print('\nRecall:', str(recall_score(y_true, y_pred, average='weighted')))
print('\nF1-score:', str(f1_score(y_true, y_pred, average='weighted')))
print('\nAccuracy:', str(accuracy_score(y_true, y_pred)))

              precision    recall  f1-score   support

         0.0       0.89      0.96      0.93      6876
         1.0       0.64      0.44      0.52      1299
         2.0       0.37      0.06      0.11       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.63      0.49      0.52      8337
weighted avg       0.84      0.86      0.85      8337


Precision: 0.842500530625976

Recall: 0.863260165527168

F1-score: 0.8467817539220636

Accuracy: 0.863260165527168


In [99]:
# EXPERIMENT 2
# Default TF-IDF with chi2 feature selection, combined with the NER count features.

    #feature_path = 'Data/feature_list_Tf_idf_ner_features_sparse_matrix.pickle'
    #X = pd.read_pickle(feature_path)

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

# Keep the 70% of TF-IDF features with the highest chi2 score.
# NOTE(review): vectorizer and selector are fitted on every row, including the
# rows later used as held-out folds, so the scores below are optimistic.
selected_tfidf_vectors = SelectPercentile(chi2, percentile=70).fit_transform(tfidf_vectors, y)

# Append the dense NER count columns to the sparse TF-IDF matrix.
X = hstack((selected_tfidf_vectors, np.array(ner_features)))

# Create the classifier before its only use. An earlier cross_val_predict call
# placed above this line ran with a `classifier` that this cell had not yet
# defined and repeated the whole 5-fold fit; it has been removed.
classifier = SVC(C=2, kernel='linear',gamma='auto')

y_true, y_pred = y, cross_val_predict(classifier, X, y, n_jobs=5, cv=5)

print(classification_report(y_true, y_pred))

print('\nPrecision:', str(precision_score(y_true, y_pred, average='weighted')))
print('\nRecall:', str(recall_score(y_true, y_pred, average='weighted')))
print('\nF1-score:', str(f1_score(y_true, y_pred, average='weighted')))
print('\nAccuracy:', str(accuracy_score(y_true, y_pred)))


              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.66      0.40      0.50      1299
         2.0       0.68      0.10      0.18       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.74      0.49      0.54      8337
weighted avg       0.85      0.86      0.85      8337


Precision: 0.8471684848202728

Recall: 0.8642197433129423

F1-score: 0.8450480045161078

Accuracy: 0.8642197433129423


In [101]:
# EXPERIMENT 3
# Features: TF-IDF vectors only, with English stop words and very rare / very
# common terms filtered out by the vectorizer. No feature selection.

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=0.002)
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))


              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.62      0.43      0.51      1299
         2.0       0.28      0.05      0.08       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.60      0.48      0.50      8337
weighted avg       0.84      0.86      0.84      8337


Precision: 0.8364146507357367

Recall: 0.8593019071608492

F1-score: 0.8424377593873059

Accuracy: 0.8593019071608492


In [103]:
# EXPERIMENT 4
# Features: TF-IDF vectors only, with English stop words and very rare / very
# common terms filtered out by the vectorizer, then reduced by chi2 selection.

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=0.002)
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

# Keep the 70% of terms with the highest chi2 score against the labels.
selected_tfidf_vectors = SelectPercentile(score_func=chi2, percentile=70).fit_transform(tfidf_vectors, y)

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, selected_tfidf_vectors, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))


              precision    recall  f1-score   support

         0.0       0.89      0.96      0.93      6876
         1.0       0.65      0.44      0.52      1299
         2.0       0.35      0.05      0.09       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.63      0.48      0.51      8337
weighted avg       0.84      0.86      0.85      8337


Precision: 0.8428508662221785

Recall: 0.8643396905361641

F1-score: 0.8470915834045074

Accuracy: 0.8643396905361641


In [104]:
# EXPERIMENT 5
# Features: TF-IDF vectors only, from a vectorizer with default settings.
# No feature selection.

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))




              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.66      0.41      0.51      1299
         2.0       0.78      0.13      0.22       162

   micro avg       0.87      0.87      0.87      8337
   macro avg       0.78      0.50      0.55      8337
weighted avg       0.85      0.87      0.85      8337


Precision: 0.8503150320112609

Recall: 0.8652992683219384

F1-score: 0.8475539610612581

Accuracy: 0.8652992683219384


In [105]:
# EXPERIMENT 6
# Features: TF-IDF vectors only, from a vectorizer with default settings,
# reduced by chi2 feature selection.

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

# Keep the 80% of terms with the highest chi2 score against the labels.
selected_tfidf_vectors = SelectPercentile(score_func=chi2, percentile=80).fit_transform(tfidf_vectors, y)

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, selected_tfidf_vectors, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))




              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.67      0.41      0.51      1299
         2.0       0.77      0.12      0.21       162

   micro avg       0.87      0.87      0.87      8337
   macro avg       0.78      0.50      0.55      8337
weighted avg       0.85      0.87      0.85      8337


Precision: 0.8512740379778139

Recall: 0.8662588461077126

F1-score: 0.847714654003909

Accuracy: 0.8662588461077126


In [112]:
# EXPERIMENT 7
# Features: TF-IDF vectors only, from a vectorizer with default settings,
# reduced by chi2 feature selection using SelectKBest instead of SelectPercentile.

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

# Keep the 9000 terms with the highest chi2 score against the labels.
selected_tfidf_vectors = SelectKBest(score_func=chi2, k=9000).fit_transform(tfidf_vectors, y)

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, selected_tfidf_vectors, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.70      0.40      0.51      1299
         2.0       0.78      0.11      0.19       162

   micro avg       0.87      0.87      0.87      8337
   macro avg       0.79      0.50      0.54      8337
weighted avg       0.85      0.87      0.85      8337


Precision: 0.8545733100435131

Recall: 0.8686577905721482

F1-score: 0.8489762945031933

Accuracy: 0.8686577905721482


In [None]:
#############################

In [101]:
# Stop-word list made of numeric tokens: '0' .. '10000', then '0' prefixed to
# each of 0 .. 99 ('00', '01', ..., '099'), and finally '000'.
number_stopwords = list(map(str, range(10001)))
number_stopwords.extend('0' + digits for digits in map(str, range(100)))
number_stopwords.append('000')

In [20]:
# EXPERIMENT 8
# TF-IDF document-frequency sweep with numeric tokens as stop words. Every
# (min_df, max_df) pair is scored with a 5-fold cross-validated linear SVC.
min_df_list = [0]  # not part of this run: 0.001, 0.003, 0.005, 0.007
max_df_list = [1.0, 0.80, 0.6]
for i in min_df_list:
    for j in max_df_list:
        vectorizer = TfidfVectorizer(stop_words=number_stopwords, max_df=j, min_df=i)
        tfidf_vectors = vectorizer.fit_transform(df['sentence'])
        classifier = SVC(kernel='linear', C=3, gamma='auto')
        y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
        y_true = y

        # Label each report with the cut-offs that produced it (i = min_df, j = max_df).
        print('----------------------------------')
        print('i:   ', i)
        print('j:   ', j)
        print(classification_report(y_true, y_pred))


----------------------------------
i:    0
j:    1.0
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.63      0.45      0.53      1299
         2.0       0.77      0.14      0.24       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.76      0.52      0.56      8337
weighted avg       0.85      0.86      0.85      8337

----------------------------------
i:    0
j:    0.8
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.63      0.45      0.53      1299
         2.0       0.77      0.14      0.24       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.76      0.52      0.56      8337
weighted avg       0.85      0.86      0.85      8337

----------------------------------
i:    0
j:    0.6
              precision    recall  f1-score   support

         0.0       0.89 

In [21]:
# EXPERIMENT 8
# TF-IDF document-frequency sweep with numeric tokens as stop words. Every
# (min_df, max_df) pair is scored with a 5-fold cross-validated linear SVC.
min_df_list = [0.001]  # not part of this run: 0.003, 0.005, 0.007
max_df_list = [1.0, 0.80, 0.6]
for i in min_df_list:
    for j in max_df_list:
        vectorizer = TfidfVectorizer(stop_words=number_stopwords, max_df=j, min_df=i)
        tfidf_vectors = vectorizer.fit_transform(df['sentence'])
        classifier = SVC(kernel='linear', C=3, gamma='auto')
        y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
        y_true = y

        # Label each report with the cut-offs that produced it (i = min_df, j = max_df).
        print('----------------------------------')
        print('i:   ', i)
        print('j:   ', j)
        print(classification_report(y_true, y_pred))


----------------------------------
i:    0.001
j:    1.0
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92      6876
         1.0       0.61      0.46      0.52      1299
         2.0       0.57      0.17      0.27       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.69      0.53      0.57      8337
weighted avg       0.84      0.86      0.85      8337

----------------------------------
i:    0.001
j:    0.8
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92      6876
         1.0       0.61      0.46      0.52      1299
         2.0       0.57      0.17      0.27       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.69      0.53      0.57      8337
weighted avg       0.84      0.86      0.85      8337

----------------------------------
i:    0.001
j:    0.6
              precision    recall  f1-score   support

         0.0

In [22]:
# EXPERIMENT 8
# TF-IDF document-frequency sweep with numeric tokens as stop words. Every
# (min_df, max_df) pair is scored with a 5-fold cross-validated linear SVC.
min_df_list = [0.003]  # original trailing note listed: 0.003, 0.005, 0.007
max_df_list = [1.0, 0.80, 0.6]
for i in min_df_list:
    for j in max_df_list:
        vectorizer = TfidfVectorizer(stop_words=number_stopwords, max_df=j, min_df=i)
        tfidf_vectors = vectorizer.fit_transform(df['sentence'])
        classifier = SVC(kernel='linear', C=3, gamma='auto')
        y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
        y_true = y

        # Label each report with the cut-offs that produced it (i = min_df, j = max_df).
        print('----------------------------------')
        print('i:   ', i)
        print('j:   ', j)
        print(classification_report(y_true, y_pred))


----------------------------------
i:    0.003
j:    1.0
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.63      0.43      0.51      1299
         2.0       0.60      0.15      0.25       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.71      0.51      0.56      8337
weighted avg       0.84      0.86      0.85      8337

----------------------------------
i:    0.003
j:    0.8
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.63      0.43      0.51      1299
         2.0       0.60      0.15      0.25       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.71      0.51      0.56      8337
weighted avg       0.84      0.86      0.85      8337

----------------------------------
i:    0.003
j:    0.6
              precision    recall  f1-score   support

         0.0

In [23]:
# EXPERIMENT 8
# TF-IDF document-frequency sweep with numeric tokens as stop words. Every
# (min_df, max_df) pair is scored with a 5-fold cross-validated linear SVC.
min_df_list = [0.005]# , 0.003, 0.005, 0.007
max_df_list = [1.0, 0.80, 0.6]
for i in min_df_list:
    for j in max_df_list:
        vectorizer = TfidfVectorizer(min_df=i, max_df=j, stop_words=number_stopwords)
        tfidf_vectors = vectorizer.fit_transform(df['sentence'])
        # `Random()` was an undefined name (NameError when the cell is re-run).
        # Use the same SVC as the other sweep cells of this experiment.
        classifier = SVC(C=3, kernel='linear',gamma='auto')
        y_true, y_pred = y, cross_val_predict(classifier, tfidf_vectors, y, n_jobs=5, cv=5)

        # Label each report with the cut-offs that produced it (i = min_df, j = max_df).
        print('----------------------------------')
        print('i:   ',  i)
        print('j:   ',  j)
        print(classification_report(y_true, y_pred))


----------------------------------
i:    0.005
j:    1.0
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.65      0.41      0.51      1299
         2.0       0.64      0.17      0.27       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.73      0.52      0.57      8337
weighted avg       0.85      0.86      0.85      8337

----------------------------------
i:    0.005
j:    0.8
              precision    recall  f1-score   support

         0.0       0.89      0.97      0.93      6876
         1.0       0.65      0.41      0.51      1299
         2.0       0.64      0.17      0.27       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.73      0.52      0.57      8337
weighted avg       0.85      0.86      0.85      8337

----------------------------------
i:    0.005
j:    0.6
              precision    recall  f1-score   support

         0.0

In [72]:

from sklearn.ensemble import RandomForestClassifier
# EXPERIMENT 8 (random-forest variant)
# Same TF-IDF document-frequency sweep, scored with a default
# RandomForestClassifier under 5-fold cross-validation.
# NOTE(review): no random_state is set, so repeated runs can give different numbers.
min_df_list = [0.005, 0.0002, 0.03, 0.007]
max_df_list = [1.0, 0.80, 0.6]
for i in min_df_list:
    for j in max_df_list:
        vectorizer = TfidfVectorizer(stop_words=number_stopwords, max_df=j, min_df=i)
        tfidf_vectors = vectorizer.fit_transform(df['sentence'])
        classifier = RandomForestClassifier()
        y_pred = cross_val_predict(classifier, tfidf_vectors, y, cv=5, n_jobs=5)
        y_true = y

        # Label each report with the cut-offs that produced it (i = min_df, j = max_df).
        print('----------------------------------')
        print('i:   ', i)
        print('j:   ', j)
        print(classification_report(y_true, y_pred))


----------------------------------
i:    0.005
j:    1.0
              precision    recall  f1-score   support

         0.0       0.86      0.98      0.92      6876
         1.0       0.64      0.27      0.38      1299
         2.0       0.33      0.02      0.05       162

   micro avg       0.85      0.85      0.85      8337
   macro avg       0.61      0.42      0.45      8337
weighted avg       0.82      0.85      0.82      8337

----------------------------------
i:    0.005
j:    0.8
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92      6876
         1.0       0.64      0.29      0.40      1299
         2.0       0.21      0.02      0.03       162

   micro avg       0.85      0.85      0.85      8337
   macro avg       0.57      0.43      0.45      8337
weighted avg       0.82      0.85      0.82      8337

----------------------------------
i:    0.005
j:    0.6
              precision    recall  f1-score   support

         0.0

In [103]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report
# Shared settings for the grid searches below: model-selection metric and the
# number of parallel workers handed to GridSearchCV.
scoring = 'f1_macro'
n_jobs=20


In [80]:
# Grid search over TF-IDF settings feeding a default random forest.
# (A chi2 SelectPercentile step and its percentile grid are currently left out.)
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier()),
])

params = {
    'tfidf__max_df': (0.999, 0.98, 0.8, 0.60),
    'tfidf__min_df': (0, 0.00009, 0.0001, 0.0003, 0.0005, 0.0009, 0.001, 0.002),
    'tfidf__stop_words': ('english', None, number_stopwords),
}

# The pipeline receives the raw sentences, so the vectorizer is re-fitted
# inside every cross-validation split.
rf_clf = GridSearchCV(estimator=classifier, param_grid=params, scoring=scoring, cv=5, n_jobs=n_jobs)
rf_clf.fit(df['sentence'], y)

print(rf_clf.best_estimator_)
print(rf_clf.best_params_)


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.001,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
{'tfidf__max_df': 0.6, 'tfidf__min_df': 0.001, 'tfidf__stop_words': 'english'}


In [None]:
print(rf_clf.best_score_)

In [99]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Grid search over TF-IDF settings feeding a default MLP classifier.
# (A chi2 SelectPercentile step and its percentile grid are currently left out.)
classifier = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MLPClassifier()),
])

params = {
    'tfidf__max_df': (0.999, 0.98, 0.8, 0.60),
    'tfidf__min_df': (0, 0.00009, 0.0001, 0.0003, 0.0005, 0.0009, 0.001, 0.002),
    'tfidf__stop_words': ('english', None, number_stopwords),
}

# The pipeline receives the raw sentences, so the vectorizer is re-fitted
# inside every cross-validation split.
mlp_clf = GridSearchCV(estimator=classifier, param_grid=params, scoring=scoring, cv=5, n_jobs=n_jobs)
mlp_clf.fit(df['sentence'], y)

print(mlp_clf.best_estimator_)
print(mlp_clf.best_params_)


In [None]:
## Only Ner Features

In [113]:
### Named Entity Features

import spacy

# spaCy pipeline used by bag_of_entities (read there as a global).
# NOTE(review): this cell repeats an earlier cell of the notebook; one copy is enough.
nlp = spacy.load('en')

# Entity labels that are counted; the list order fixes the position of each
# count in the vector returned by bag_of_entities.
ner_tagset = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
              'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
              'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

def bag_of_entities(sentence, ner_tagset):
    """Count the named entities of each type found in ``sentence``.

    The module-level spaCy pipeline ``nlp`` is run on ``sentence`` and the
    labels of the detected entity spans are tallied.

    Parameters
    ----------
    sentence : text passed to ``nlp``.
    ner_tagset : sequence of entity labels; its order defines the layout of
        the returned vector.

    Returns
    -------
    numpy.ndarray of floats with one count per entry of ``ner_tagset``.
    Entity labels that do not appear in ``ner_tagset`` are ignored.
    """
    found_labels = [span.label_ for span in nlp(sentence).ents]
    # list.count compares with ==, the same test as an explicit equality scan.
    return np.array([float(found_labels.count(tag)) for tag in ner_tagset])



In [114]:
# One entity-count vector per sentence, in DataFrame row order.
ner_features = [bag_of_entities(sentence, ner_tagset) for sentence in df['sentence']]


In [91]:
### NER count vectors as the only features

# Observation from the recorded run: with these features alone the classifier
# predicts label 0 for every sentence, so it is of no practical use.

classifier = SVC(kernel='linear', C=2, gamma='auto')

# Out-of-fold predictions from 5-fold cross-validation.
y_pred = cross_val_predict(classifier, ner_features, y, cv=5, n_jobs=5)
y_true = y

print(classification_report(y_true, y_pred))

# Support-weighted averages over the classes, followed by plain accuracy.
print('\nPrecision:', precision_score(y_true, y_pred, average='weighted'))
print('\nRecall:', recall_score(y_true, y_pred, average='weighted'))
print('\nF1-score:', f1_score(y_true, y_pred, average='weighted'))
print('\nAccuracy:', accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90      6876
         1.0       0.00      0.00      0.00      1299
         2.0       0.00      0.00      0.00       162

   micro avg       0.82      0.82      0.82      8337
   macro avg       0.27      0.33      0.30      8337
weighted avg       0.68      0.82      0.75      8337


Precision: 0.6802242853374814

Recall: 0.8247571068729759

F1-score: 0.7455504985024102

Accuracy: 0.8247571068729759


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# First Results

### Decision Tree

In [27]:

# Summary of the search object `dt_clf` (original note: "based on mean accuracy").
# NOTE(review): `dt_clf` is not created in any visible cell — confirm where it comes from.
for heading, attr_name in (('Best Estimator', 'best_estimator_'),
                           ('Best Score', 'best_score_'),
                           ('Best Params', 'best_params_'),
                           ('cv_results_', 'cv_results_')):
    print(heading)
    print(getattr(dt_clf, attr_name))

Best Estimator
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=9, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best Score
0.84
Best Params
{'max_features': None, 'min_samples_leaf': 9, 'min_samples_split': 50, 'max_depth': None}
cv_results_
{'std_score_time': array([1.28795703e-04, 1.10691725e-04, 8.41046672e-06, 1.74941409e-06,
       1.16328722e-05, 1.88697423e-06, 3.76248389e-06, 1.72675633e-05,
       7.86911347e-06, 4.93382465e-06, 1.90996944e-05, 2.07123045e-05,
       4.10190833e-06, 6.79321435e-06, 1.12217929e-05, 2.43193495e-05,
       5.99299139e-06, 7.38251109e-06, 2.72792533e-05, 3.14640268e-06,
       4.81959487e-06, 4.76503255e-06, 7.32531142e-07, 1.07695549e-05,
       7.85696843e-06, 8.24059597e-06, 6.59174555e-06, 1.12

###  SVC

In [30]:

# Summary of the search object `svc_clf` (original note: "based on mean accuracy").
# NOTE(review): `svc_clf` is not created in any visible cell — confirm where it comes from.
for heading, attr_name in (('Best Estimator', 'best_estimator_'),
                           ('Best Score', 'best_score_'),
                           ('Best Params', 'best_params_'),
                           ('cv_results_', 'cv_results_')):
    print(heading)
    print(getattr(svc_clf, attr_name))

Best Estimator
SVC(C=2, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best Score
0.84
Best Params
{'C': 2, 'gamma': 'auto', 'kernel': 'linear'}
cv_results_
{'std_score_time': array([2.16889139e-04, 1.38510533e-04, 3.09726198e-05, 2.14263900e-05,
       5.57989130e-05, 3.43477029e-05, 4.65228880e-05, 1.95758931e-05,
       5.32739153e-05, 3.34231896e-05, 6.71920239e-05, 3.00049190e-05,
       4.78759705e-05, 2.76447962e-05, 2.60779796e-05, 2.18303946e-05,
       4.77634241e-05, 9.25258980e-05, 4.46474933e-05, 2.34298391e-05,
       5.39324960e-05, 3.87582089e-05, 5.74755150e-05, 3.06679330e-05,
       2.54161810e-05, 2.39011980e-05, 3.93314065e-05, 2.01666606e-05,
       4.59351864e-05, 6.69503823e-05, 4.48189896e-05, 2.30451013e-05,
       4.53205060e-05, 3.54260165e-05, 1.28438689e-04, 3.09367744e-05,
       5.333538

### KNeighborsClassifier

In [33]:
# Summary of the search object `knn_clf` (original note: "based on mean accuracy").
# NOTE(review): `knn_clf` is not created in any visible cell — confirm where it comes from.
for heading, attr_name in (('Best Estimator', 'best_estimator_'),
                           ('Best Score', 'best_score_'),
                           ('Best Params', 'best_params_'),
                           ('cv_results_', 'cv_results_')):
    print(heading)
    print(getattr(knn_clf, attr_name))

Best Estimator
KNeighborsClassifier(algorithm='auto', leaf_size=20, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=1,
           weights='uniform')
Best Score
0.83
Best Params
{'p': 1, 'leaf_size': 20, 'n_neighbors': 3}
cv_results_
{'std_score_time': array([0.00579178, 0.00111061, 0.00834335]), 'std_test_score': array([0.01413566, 0.03152852, 0.03309715]), 'std_train_score': array([0.01146557, 0.00829204, 0.008288  ]), 'split3_train_score': array([0.86335404, 0.86956522, 0.86956522]), 'split0_train_score': array([0.88607595, 0.87341772, 0.87974684]), 'params': [{'p': 1, 'leaf_size': 20, 'n_neighbors': 3}, {'p': 2, 'leaf_size': 20, 'n_neighbors': 3}, {'p': 3, 'leaf_size': 20, 'n_neighbors': 3}], 'split1_train_score': array([0.8625, 0.85  , 0.8625]), 'mean_train_score': array([0.86507233, 0.86500963, 0.87125993]), 'split4_train_score': array([0.85093168, 0.86956522, 0.88198758]), 'split3_test_score': array([0.84615385, 0.84615385, 0.82051282]), 'param_n_

### RandomForestClassifier

In [40]:
# Summary of the search object `rf_clf` (original note: "based on mean accuracy").
for heading, attr_name in (('Best Estimator', 'best_estimator_'),
                           ('Best Score', 'best_score_'),
                           ('Best Params', 'best_params_'),
                           ('cv_results_', 'cv_results_')):
    print(heading)
    print(getattr(rf_clf, attr_name))

Best Estimator
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=50,
            min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Score
0.84
Best Params
{'bootstrap': True, 'min_samples_split': 50, 'n_estimators': 70, 'max_features': None, 'min_samples_leaf': 5, 'max_depth': 15}
cv_results_
{'std_train_score': array([0.00804008, 0.00710157, 0.010039  , ..., 0.00356013, 0.00356013,
       0.00356013]), 'std_score_time': array([3.68444781e-04, 1.05745174e-04, 1.56553896e-04, ...,
       9.10548490e-05, 2.06558075e-04, 2.58913505e-04]), 'std_test_score': array([0.01870339, 0.01870339, 0.01870339, ..., 0.01413566, 0.01413566,
       0.01413566]), 'param_n_estimators': masked_array(dat

### MLPClassifier

In [44]:
# Summary of the search object `mlp_clf`: best estimator, score, parameters
# and the full cross-validation result table.
for heading, attr_name in (('Best Estimator', 'best_estimator_'),
                           ('Best Score', 'best_score_'),
                           ('Best Params', 'best_params_'),
                           ('cv_results_', 'cv_results_')):
    print(heading)
    print(getattr(mlp_clf, attr_name))

Best Estimator
MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 30), learning_rate='invscaling',
       learning_rate_init=0.01, max_iter=400, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
Best Score
0.845
Best Params
{'max_iter': 400, 'activation': 'logistic', 'learning_rate': 'invscaling', 'hidden_layer_sizes': (50, 30), 'learning_rate_init': 0.01}
cv_results_
{'std_train_score': array([0.        , 0.        , 0.        , 0.18647808, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00248447,
       0.        , 0.        , 0.21667163, 0.02200405, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02958088, 0.        , 0.        , 0.  

# Last Results

# Result Analysis

In [4]:
import pickle

In [45]:
opt_results_path = 'Results/optimization_results_tfidf_only.pickle'

In [46]:
# Load the stored optimization results from the pickle at `opt_results_path`.
# NOTE(security): unpickling can execute arbitrary code contained in the file;
# only load pickles that come from a trusted run.
with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)

In [54]:
import pprint

In [58]:
pprint.pprint(opt_results['DecisionTree']['classif_report'])

('              precision    recall  f1-score   support\n'
 '\n'
 '         0.0       0.89      0.93      0.91      6876\n'
 '         1.0       0.53      0.45      0.48      1299\n'
 '         2.0       0.36      0.10      0.16       162\n'
 '\n'
 '   micro avg       0.84      0.84      0.84      8337\n'
 '   macro avg       0.59      0.50      0.52      8337\n'
 'weighted avg       0.83      0.84      0.83      8337\n')


In [80]:

opt_results.keys()

dict_keys(['RandomForest', 'SVC', 'DecisionTree'])

In [60]:
# Load the stored optimization results whose file name refers to TF-IDF + NER + POS.
# NOTE(security): unpickling can execute arbitrary code contained in the file;
# only load pickles that come from a trusted run.
opt_results_path_ner_pos = 'Results/optimization_results_tfidf_ner_pos_IPYNB.pickle'
with open(opt_results_path_ner_pos, 'rb') as file_:
    opt_results_ner_pos = pickle.load(file_)

In [61]:
opt_results_ner_pos.keys()

dict_keys(['RandomForest', 'DecisionTree', 'SVC', 'KNeighbors'])

In [62]:
opt_results_ner_pos['RandomForest']['GridSearchCV'].best_params_

{'clf__bootstrap': False,
 'clf__max_depth': 110,
 'clf__max_features': 'sqrt',
 'clf__min_samples_split': 40,
 'clf__n_estimators': 30,
 'feat_sel__percentile': 10}

In [68]:
pprint.pprint(opt_results_ner_pos['SVC']['classif_report'])

('              precision    recall  f1-score   support\n'
 '\n'
 '         0.0       0.90      0.95      0.92      6876\n'
 '         1.0       0.62      0.47      0.53      1299\n'
 '         2.0       0.54      0.12      0.20       162\n'
 '\n'
 '   micro avg       0.86      0.86      0.86      8337\n'
 '   macro avg       0.69      0.52      0.55      8337\n'
 'weighted avg       0.85      0.86      0.85      8337\n')


In [78]:
# Display the stored KNeighbors grid-search object itself. The former trailing
# `.best` is not a GridSearchCV attribute and raised AttributeError when the
# cell was re-run; the recorded output is the repr of the search object.
opt_results_ner_pos['KNeighbors']['GridSearchCV']

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=10, score_func=<function chi2 at 0x2b438a135ae8>)), ('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid='warn', n_jobs=20,
       param_grid={'clf__p': [1, 2, 3], 'feat_sel__percentile': (10, 90, 100), 'clf__leaf_size': [20, 30, 40, 50, 60], 'clf__n_neighbors': [3, 5, 9, 13, 19, 25, 35, 55, 63]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=0)

In [79]:
opt_results_ner_pos.keys()

dict_keys(['RandomForest', 'DecisionTree', 'SVC', 'KNeighbors'])