In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [3]:
df = df[np.logical_not(np.isnan(np.array(df['label'])))]

In [4]:
y = np.array(df['label'])

# Feature Extraction

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vectorizer = TfidfVectorizer(min_df=0.002, max_df=0.95, stop_words='english')
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

In [8]:
tfidf_vectors

<8337x1320 sparse matrix of type '<class 'numpy.float64'>'
	with 68153 stored elements in Compressed Sparse Row format>

### Feature Selection on TF-IDF Vectors

In [9]:
from sklearn.feature_selection import SelectPercentile, chi2

In [10]:
tfidf_vectors = SelectPercentile(chi2, percentile=80).fit_transform(tfidf_vectors, y)

In [11]:
tfidf_vectors.shape

(8337, 1056)

### Named Entity Features

In [12]:
import spacy

In [13]:
nlp = spacy.load('en')

In [14]:
ner_tagset = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
              'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
              'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

In [15]:
def bag_of_entities(sentence, ner_tagset):
    entities = [token.label_ for token in nlp(sentence).ents]
    # frequency word count
    bag = np.zeros(len(ner_tagset))
    for ent in entities:
        for i, entity in enumerate(ner_tagset):
            if ent==entity:
                bag[i] += 1
    return np.array(bag)

In [None]:
ner_features = []
for i,row in df.iterrows():
    ner_features.append(bag_of_entities(row['sentence'], ner_tagset))

### Combining TF-IDF Vectors and Named Entity Features

In [17]:
from scipy.sparse import hstack

In [None]:
X = hstack((tfidf_vectors, np.array(ner_features)))

#### Saving Feature Vectors

In [18]:
import pickle

In [19]:
feature_path = 'Data/feature_list_optimized_Tf_idf_ner_features_sparse_matrix.pickle'

In [20]:
X = pd.read_pickle(feature_path)

In [21]:
type(X)

scipy.sparse.coo.coo_matrix

#### Memory Cleaning

In [22]:
del tfidf_vectors
del df
del ner_features
del nlp

# Classifier Training
- With hyper-parameter optimization

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
opt_results = {}

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
# Decision Tree
params = {
    'max_depth': [None] + [*range(15, 35, 5)],
    'min_samples_split': [*range(50, 200, 40)],
    'min_samples_leaf': [*range(5, 14, 2)],
    'max_features': [None, 'sqrt', 'log2']
}

dt = DecisionTreeClassifier(criterion='gini')
dt_clf = GridSearchCV(dt, params, cv=5)
dt_clf = dt_clf.fit(X, y)

opt_results['DecisionTree'] = dt_clf

In [27]:
print('Best Estimator')
print(dt_clf.best_estimator_)
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)
print('cv_results_')
print(dt_clf.cv_results_)

Best Estimator
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best Score
0.8518651793210987
Best Params
{'min_samples_split': 50, 'min_samples_leaf': 5, 'max_features': None, 'max_depth': 15}
cv_results_
{'param_min_samples_split': masked_array(data=[50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 130, 170, 50, 90, 130, 170, 50, 90, 130, 170,
                   50, 90, 1

0.8481468154012235
{'max_depth': 20, 'min_samples_split': 100, 'max_features': None, 'min_samples_leaf': 9}
699
<function _passthrough_scorer at 0x2ba712f58950>
5
0.30017995834350586

### RandomForestClassifier

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
#RandomForestClassifier
#RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)

#max_depth 
#min_samples_leaf 
#min_weight_fraction_leaf 
#max_features 
#max_leaf_nodes 
#min_impurity_decrease 
#min_impurity_split 
#oob_score
#warm_start 
#class_weight

params = {
    'n_estimators': [30, 70, 100, 150], 
    'max_depth': [None] + [*range(65, 120, 15)], 
    'min_samples_split': [25, 30, 40, 45, 50, 100],
    #'min_samples_leaf': [*range(5, 14, 2)],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}
          
rf = RandomForestClassifier(criterion='gini')
rf_clf = GridSearchCV(rf, params, cv=5)
rf_clf = rf_clf.fit(X, y)

opt_results['RandomForest'] = rf_clf

Best Estimator
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Score
0.860501379393067
Best Params
{'bootstrap': False, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 30, 'min_samples_split': 40, 'min_samples_leaf': 1}

In [None]:
print('Best Estimator')
print(rf_clf.best_estimator_)
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)
print('cv_results_')
print(rf_clf.cv_results_)

###  SVC

In [30]:
from sklearn.svm import SVC

In [31]:
params = {
    'kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'C': [0.025, 0.25, 0.5, 1, 2, 3],
    'gamma': ['auto', 2, 3]
}
          
svc = SVC()
svc_clf = GridSearchCV(svc, params, cv=5)
svc_clf = svc_clf.fit(X, y)

opt_results['SVC'] = svc_clf

In [32]:
print('Best Estimator')
print(svc_clf.best_estimator_)
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)
print('cv_results_')
print(svc_clf.cv_results_)

Best Estimator
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Best Score
0.863260165527168
Best Params
{'C': 1, 'kernel': 'linear', 'gamma': 'auto'}
cv_results_
{'params': [{'C': 0.025, 'kernel': 'linear', 'gamma': 'auto'}, {'C': 0.025, 'kernel': 'poly', 'gamma': 'auto'}, {'C': 0.025, 'kernel': 'rbf', 'gamma': 'auto'}, {'C': 0.025, 'kernel': 'sigmoid', 'gamma': 'auto'}, {'C': 0.025, 'kernel': 'linear', 'gamma': 2}, {'C': 0.025, 'kernel': 'poly', 'gamma': 2}, {'C': 0.025, 'kernel': 'rbf', 'gamma': 2}, {'C': 0.025, 'kernel': 'sigmoid', 'gamma': 2}, {'C': 0.025, 'kernel': 'linear', 'gamma': 3}, {'C': 0.025, 'kernel': 'poly', 'gamma': 3}, {'C': 0.025, 'kernel': 'rbf', 'gamma': 3}, {'C': 0.025, 'kernel': 'sigmoid', 'gamma': 3}, {'C': 0.25, 'kernel': 'linear', 'gamma': 'auto'}, {'C': 0.25, 'kernel': 'poly', 'gamma':

0.8642197433129423
{'C': 2, 'gamma': 'auto', 'kernel': 'linear'} 'C':[0.025, 0.25, 0.5, 1, 2, 3, 5, 8, 10, 15, 20], 
48
<function _passthrough_scorer at 0x2ba712f58950>
5
9.254388332366943

### KNeighborsClassifier

In [33]:
from sklearn.neighbors import KNeighborsClassifier

In [34]:
# p: Power parameter for the Minkowski metric. When p = 1, 
#    this is equivalent to using manhattan_distance (l1), 
#    and euclidean_distance (l2) for p = 2. 
#    For arbitrary p, minkowski_distance (l_p) is used.

params = {
    'n_neighbors': [3, 5, 9, 13, 19, 25, 35, 55, 63], 
    'leaf_size': [20, 30, 40, 50, 60],
    'p': [1, 2, 3]
}
          
knn = KNeighborsClassifier()
knn_clf = GridSearchCV(knn, params, cv=5)
knn_clf = knn_clf.fit(X.todense(), y) # KNN takes dense input in scikit-learn

opt_results['KNeighbors'] = knn_clf

KeyboardInterrupt: 

In [None]:
print('Best Estimator')
print(knn_clf.best_estimator_)
print('Best Score')
print(knn_clf.best_score_)
print('Best Params')
print(knn_clf.best_params_)
print('cv_results_')
print(knn_clf.cv_results_)

### MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
params = {
    'hidden_layer_sizes': [(10,5), (20,10), (20), (30,20), (50,30)], 
    'activation': ['tanh', 'relu', 'logistic'], 
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.01, 0.001, 0.1],
    'max_iter': [50, 200, 400]
}
                        
mlp = MLPClassifier()
mlp_clf = GridSearchCV(mlp, params, cv=5)
mlp_clf = mlp_clf.fit(X, y)

opt_results['MLP'] = mlp_clf

In [None]:
print('Best Estimator')
print(clf.best_estimator_)
print('Best Score')
print(clf.best_score_)
print('Best Params')
print(clf.best_params_)
print('cv_results_')
print(clf.cv_results_)

### Kmeans

# All classifiers