In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

## Load Data

In [4]:
# Load the adjudicated sentence-classification workbook; the 'label' and
# 'sentence' columns of this frame are used by the cells below.
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [5]:
# Keep only the rows that carry a label. Series.notna() works for every dtype,
# whereas np.isnan raises TypeError when the column is read as object dtype.
df = df[df['label'].notna()]

In [6]:
# Target vector: the 'label' column as a NumPy array (built after the
# unlabeled rows have been dropped from df).
y = np.array(df['label'])

In [7]:
# Numeric tokens handed to the vectorizer as stop words:
# '0'..'10000', the zero-prefixed forms '00'..'099', and '000'.
number_stopwords = [
    *map(str, range(10001)),
    *('0' + str(i) for i in range(100)),
    '000',
]

In [8]:
# Metric optimised by every GridSearchCV in this notebook.
scoring = 'f1_macro'
# Number of parallel workers passed to GridSearchCV and cross_val_predict.
n_jobs=20


# Feature Extraction

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

The best TfidfVectorizer parameters found are listed below:
    min_df: 0.0001
    max_df: 0.6
    stop_words: number_stopwords

In [13]:
# TF-IDF features of the sentences, using the document-frequency bounds and
# the numeric stop-word list chosen above.
vectorizer = TfidfVectorizer(
    stop_words=number_stopwords,
    min_df=0.0001,
    max_df=0.6,
)
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

#### Saving Feature Vectors

In [14]:
import pickle

In [15]:
# Location of the pickled TF-IDF matrix (written and then re-read below).
feature_path = 'Data/features_mindf_0001_maxdf_6_number_stopwords.pickle'

In [17]:
# Serialise the TF-IDF matrix to feature_path; it is read back into X below.
with open(feature_path, mode='wb') as feature_file:
    pickle.dump(tfidf_vectors, feature_file, pickle.HIGHEST_PROTOCOL)

In [18]:
# Reload the persisted feature matrix; X is the input every classifier below is fitted on.
X = pd.read_pickle(feature_path)

#### Memory Cleaning

In [None]:
# Release the large intermediates: the features now live in X and the labels
# in y. (`nlp` is never created in this notebook, so deleting it would raise
# NameError on a fresh kernel.)
del tfidf_vectors
del df

# Classifier Training
- With hyper-parameter optimization

https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin

In [22]:
# Per-classifier results (fitted GridSearchCV + classification report),
# keyed by classifier name and checkpointed to opt_results_path after each model.
opt_results = {}
opt_results_path = 'Results/optimization_results_tfidf_only.pickle'

#### Important Note on the Scoring of Parameter Optimization
GridSearchCV uses mean accuracy by default. However, we chose "f1_macro" scoring for hyper-parameter optimization, because mean accuracy and f1_micro measure performance over all labels pooled together, disregarding the type of the label. With accuracy or f1_micro we get high scores because most of the labels are 0 and the classifier predicts most samples as 0. When the score for label 0 is high, the overall result becomes high as well, even though the other labels perform poorly. Such a high score does not mean our classifier performs better, because we are actually interested in getting high scores on labels 1 and 2.


### Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
# Chi-squared feature selection followed by a decision tree.
# NOTE(review): no random_state is set on the tree, so results can vary between runs.
classifier = Pipeline(steps=[
    ('feat_sel', SelectPercentile(chi2)),
    ('clf', DecisionTreeClassifier(criterion='gini')),
])

In [25]:
# Decision Tree: search space for the pipeline (step-name prefixes route each
# parameter to the matching pipeline step).
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__max_depth': [None, 15, 20, 25, 30],
    'clf__min_samples_split': [50, 90, 130, 170],
    'clf__min_samples_leaf': [5, 7, 9, 11, 13],
    'clf__max_features': [None, 'sqrt', 'log2'],
}

# 5-fold grid search scored with `scoring`; fit() returns the search object itself.
dt_clf = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    scoring=scoring,
    cv=5,
    n_jobs=n_jobs,
)
dt_clf.fit(X, y)

In [26]:
# Summarise the decision-tree search: winning pipeline, its CV score and parameters.
print(
    'Best Estimator', dt_clf.best_estimator_,
    'Best Score', dt_clf.best_score_,
    'Best Params', dt_clf.best_params_,
    sep='\n',
)

Best Estimator
Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=100, score_func=<function chi2 at 0x2b651a28b488>)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
Best Score
0.5189803679237573
Best Params
{'clf__max_features': None, 'clf__max_depth': None, 'clf__min_samples_leaf': 5, 'feat_sel__percentile': 100, 'clf__min_samples_split': 50}


In [27]:
# Out-of-fold predictions (5 folds) for the best decision-tree pipeline.
y_true = y
y_pred = cross_val_predict(dt_clf.best_estimator_, X, y, cv=5, n_jobs=n_jobs)

In [28]:
# Keep the fitted search and its classification report under one key.
opt_results['DecisionTree'] = {
    'GridSearchCV': dt_clf,
    'classif_report': classification_report(y_true, y_pred),
}

In [29]:
# Per-class precision / recall / F1 for the cross-validated predictions.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.93      0.91      6876
         1.0       0.53      0.45      0.48      1299
         2.0       0.36      0.10      0.16       162

   micro avg       0.84      0.84      0.84      8337
   macro avg       0.59      0.50      0.52      8337
weighted avg       0.83      0.84      0.83      8337



In [30]:
# Checkpoint everything collected so far (overwrites the previous file).
with open(opt_results_path, mode='wb') as results_file:
    pickle.dump(opt_results, results_file, pickle.HIGHEST_PROTOCOL)

0.8481468154012235
{'max_depth': 20, 'min_samples_split': 100, 'max_features': None, 'min_samples_leaf': 9}
699
<function _passthrough_scorer at 0x2ba712f58950>
5
0.30017995834350586

### RandomForestClassifier

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Chi-squared feature selection followed by a random forest.
# NOTE(review): no random_state is set on the forest, so results can vary between runs.
classifier = Pipeline(steps=[
    ('feat_sel', SelectPercentile(chi2)),
    ('clf', RandomForestClassifier(criterion='gini')),
])

In [33]:
# Random-forest search space for the pipeline defined above.
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__n_estimators': [30, 70, 100, 150],
    'clf__max_depth': [None, 65, 80, 95, 110],
    'clf__min_samples_split': [25, 30, 40, 45, 50, 100],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__bootstrap': [True, False],
}

# 5-fold grid search scored with `scoring`; fit() returns the search object itself.
rf_clf = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    scoring=scoring,
    cv=5,
    n_jobs=n_jobs,
)
rf_clf.fit(X, y)

Best Estimator
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Score
0.860501379393067
Best Params
{'bootstrap': False, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 30, 'min_samples_split': 40, 'min_samples_leaf': 1}

In [34]:
# Summarise the random-forest search: winning pipeline, its CV score and parameters.
print(
    'Best Estimator', rf_clf.best_estimator_,
    'Best Score', rf_clf.best_score_,
    'Best Params', rf_clf.best_params_,
    sep='\n',
)

Best Estimator
Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=10, score_func=<function chi2 at 0x2b651a28b488>)), ('clf', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_im...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
Best Score
0.47254814698253395
Best Params
{'clf__max_features': 'sqrt', 'clf__bootstrap': False, 'feat_sel__percentile': 10, 'clf__max_depth': None, 'clf__min_samples_split': 45, 'clf__n_estimators': 30}


In [35]:
# Out-of-fold predictions (5 folds) for the best random-forest pipeline.
y_true = y
y_pred = cross_val_predict(rf_clf.best_estimator_, X, y, cv=5, n_jobs=n_jobs)

In [36]:
# Keep the fitted search and its classification report under one key.
opt_results['RandomForest'] = {
    'GridSearchCV': rf_clf,
    'classif_report': classification_report(y_true, y_pred),
}

In [37]:
# Per-class precision / recall / F1 for the cross-validated predictions.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93      6876
         1.0       0.66      0.36      0.47      1299
         2.0       0.25      0.01      0.01       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.60      0.45      0.47      8337
weighted avg       0.83      0.86      0.84      8337



In [38]:
# Checkpoint everything collected so far (overwrites the previous file).
with open(opt_results_path, mode='wb') as results_file:
    pickle.dump(opt_results, results_file, pickle.HIGHEST_PROTOCOL)

###  SVC

In [39]:
from sklearn.svm import SVC

In [40]:
# Chi-squared feature selection followed by a support-vector classifier
# (kernel and C are chosen by the grid search below).
classifier = Pipeline(steps=[
    ('feat_sel', SelectPercentile(chi2)),
    ('clf', SVC()),
])

In [41]:
# SVC search space for the pipeline defined above.
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'clf__C': [0.025, 0.25, 0.5, 1, 2, 3],
}

# 5-fold grid search scored with `scoring`; fit() returns the search object itself.
svc_clf = GridSearchCV(
    estimator=classifier,
    param_grid=params,
    scoring=scoring,
    cv=5,
    n_jobs=n_jobs,
)
svc_clf.fit(X, y)

In [42]:
# Summarise the SVC search: winning pipeline, its CV score and parameters.
print(
    'Best Estimator', svc_clf.best_estimator_,
    'Best Score', svc_clf.best_score_,
    'Best Params', svc_clf.best_params_,
    sep='\n',
)

Best Estimator
Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=90, score_func=<function chi2 at 0x2b651a28b488>)), ('clf', SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])
Best Score
0.5527827921091951
Best Params
{'clf__kernel': 'linear', 'clf__C': 3, 'feat_sel__percentile': 90}


In [43]:
# Out-of-fold predictions (5 folds) for the best SVC pipeline.
y_true = y
y_pred = cross_val_predict(svc_clf.best_estimator_, X, y, cv=5, n_jobs=n_jobs)

In [44]:
# Keep the fitted search and its classification report under one key.
opt_results['SVC'] = {
    'GridSearchCV': svc_clf,
    'classif_report': classification_report(y_true, y_pred),
}

In [45]:
# Per-class precision / recall / F1 for the cross-validated predictions.
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92      6876
         1.0       0.63      0.45      0.52      1299
         2.0       0.75      0.13      0.22       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.76      0.51      0.56      8337
weighted avg       0.85      0.86      0.85      8337



In [46]:
# Checkpoint everything collected so far (overwrites the previous file).
with open(opt_results_path, mode='wb') as results_file:
    pickle.dump(opt_results, results_file, pickle.HIGHEST_PROTOCOL)

0.8642197433129423
{'C': 2, 'gamma': 'auto', 'kernel': 'linear'} 'C':[0.025, 0.25, 0.5, 1, 2, 3, 5, 8, 10, 15, 20], 
48
<function _passthrough_scorer at 0x2ba712f58950>
5
9.254388332366943

### KNeighborsClassifier

In [54]:
from sklearn.neighbors import KNeighborsClassifier

In [55]:
# Chi-squared feature selection followed by a k-nearest-neighbours classifier.
classifier_knn = Pipeline(steps=[
    ('feat_sel', SelectPercentile(chi2)),
    ('clf', KNeighborsClassifier()),
])

In [None]:
# p: Power parameter for the Minkowski metric. When p = 1,
#    this is equivalent to using manhattan_distance (l1),
#    and euclidean_distance (l2) for p = 2.
#    For arbitrary p, minkowski_distance (l_p) is used.

params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__n_neighbors': [3, 5, 9, 13, 19, 25, 35, 55, 63],
    'clf__leaf_size': [20, 30, 40, 50, 60],
    'clf__p': [1, 2, 3]
}

# 5-fold grid search over the KNN pipeline, fitted on a dense copy of X.
# .toarray() yields a plain ndarray; .todense() would return an np.matrix,
# which current scikit-learn releases refuse as estimator input.
knn_clf = GridSearchCV(classifier_knn, params, cv=5, scoring=scoring, n_jobs=n_jobs)
knn_clf = knn_clf.fit(X.toarray(), y)



In [None]:
# Summarise the KNN search: winning pipeline, its CV score and parameters.
print(
    'Best Estimator', knn_clf.best_estimator_,
    'Best Score', knn_clf.best_score_,
    'Best Params', knn_clf.best_params_,
    sep='\n',
)

In [None]:
# Out-of-fold predictions (5 folds) for the best KNN pipeline. The KNN grid
# search is fitted on dense input, so evaluate on a dense array as well: the
# Minkowski metric with p=3 is not supported for sparse input, so passing the
# sparse X could fail depending on which p was selected.
y_true = y
y_pred = cross_val_predict(knn_clf.best_estimator_, X.toarray(), y, n_jobs=n_jobs, cv=5)

In [None]:
# Keep the fitted search and its classification report under one key.
opt_results['KNeighbors'] = {
    'GridSearchCV': knn_clf,
    'classif_report': classification_report(y_true, y_pred),
}

In [None]:
# Per-class precision / recall / F1 for the cross-validated predictions.
print(classification_report(y_true, y_pred))

In [None]:
# Checkpoint everything collected so far (overwrites the previous file).
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

# Read the checkpoint back. pickle.load takes no `protocol` argument (the
# protocol is detected from the stream), so passing one raises TypeError.
# Only unpickle files produced by this notebook: pickle can execute arbitrary code.
with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)

In [None]:
# Show which classifiers have results stored in the reloaded dictionary.
opt_results.keys()

### MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Chi-squared feature selection followed by a multi-layer perceptron.
# The TF-IDF step stays disabled because X already holds the TF-IDF matrix.
# NOTE(review): no random_state is set on the MLP, so results can vary between runs.
classifier = Pipeline(steps=[
    # ('tfidf', TfidfVectorizer()),
    ('feat_sel', SelectPercentile(chi2)),
    ('clf', MLPClassifier()),
])

In [None]:
# Search space for the MLP pipeline. The tfidf__* entries stay commented out
# because the pipeline has no 'tfidf' step (X is the precomputed TF-IDF matrix).
params = {
  #  'tfidf__max_df':(0.999, 0.60),
  #  'tfidf__min_df':(0.0009, 0.001, 0.003),
  #  'tfidf__stop_words': ('english',None, number_stopwords),
    'feat_sel__percentile': (10, 90, 100),
    # (20,) is a single hidden layer of 20 units; a bare (20) is just the int 20.
    'clf__hidden_layer_sizes': [(10,5), (20,10), (20,), (30,20), (50,30)],
    'clf__activation': ['tanh', 'relu', 'logistic'],
    'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'clf__learning_rate_init': [0.01, 0.001, 0.1],
    'clf__max_iter': [50, 200, 400]
}

# 5-fold grid search over `classifier` (the MLP pipeline), scored with `scoring`.
mlp_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
mlp_clf = mlp_clf.fit(X, y)

In [None]:
# Summarise the MLP search: winning pipeline, its CV score and parameters.
# The fitted search object for this section is `mlp_clf`; no `clf` exists in
# the notebook, so referencing it would raise NameError.
print('Best Estimator')
print(mlp_clf.best_estimator_)
print('Best Score')
print(mlp_clf.best_score_)
print('Best Params')
print(mlp_clf.best_params_)

In [None]:
# Out-of-fold predictions (5 folds) for the best MLP pipeline.
y_true = y
y_pred = cross_val_predict(mlp_clf.best_estimator_, X, y, cv=5, n_jobs=n_jobs)

In [None]:
# Keep the fitted search and its classification report under one key.
opt_results['MLP'] = {
    'GridSearchCV': mlp_clf,
    'classif_report': classification_report(y_true, y_pred),
}

In [None]:
# Per-class precision / recall / F1 for the cross-validated predictions.
print(classification_report(y_true, y_pred))

In [None]:
# Checkpoint everything collected so far (overwrites the previous file).
with open(opt_results_path, mode='wb') as results_file:
    pickle.dump(opt_results, results_file, pickle.HIGHEST_PROTOCOL)

### Kmeans

# All classifiers