In [18]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

## Load Data

In [5]:
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [6]:
df = df[np.logical_not(np.isnan(np.array(df['label'])))]

In [7]:
y = np.array(df['label'])

In [10]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000']

In [19]:
scoring = 'f1_macro'
n_jobs=20


# Feature Extraction

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [43]:
classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
       # ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', SVC(kernel='linear', C=3))
        ])

params = {
    'tfidf__max_df':(0.999, 0.98, 0.8, 0.60),
    'tfidf__min_df':(0, 0.00001, 0.00009, 0.0001 ,0.0003),
    'tfidf__stop_words': ('english',None, number_stopwords),
    #'feat_sel__percentile': (10, 90, 100),
}
          
svc_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(df['sentence'], y)

In [45]:
svc_clf.best_params_
# C=3

0.5818901470181063

In [47]:
classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
       # ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', SVC(kernel='linear', C=2))
        ])

params = {
    'tfidf__max_df':(0.999, 0.98, 0.8, 0.60),
    'tfidf__min_df':(0, 0.00009, 0.0001 ,0.0003, 0.0005, 0.0009, 0.001, 0.002),
    'tfidf__stop_words': ('english',None, number_stopwords),
    #'feat_sel__percentile': (10, 90, 100),
}
          
svc_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(df['sentence'], y)

svc_clf.best_estimator_
# C=2

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.0003,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [48]:
classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
       # ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', SVC(kernel='linear', C=5))
        ])

params = {
    'tfidf__max_df':(0.999, 0.98, 0.8, 0.60),
    'tfidf__min_df':(0, 0.00009, 0.0001 ,0.0003, 0.0005, 0.0009, 0.001, 0.002),
    'tfidf__stop_words': ('english',None, number_stopwords),
    #'feat_sel__percentile': (10, 90, 100),
}
          
svc_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(df['sentence'], y)
svc_clf.best_estimator_
# C=5

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.0001
    max_df: 0.6
    stop_words: num_stopwords

In [50]:
vectorizer = TfidfVectorizer(min_df=0.0001, max_df=0.6, stop_words=number_stopwords)
tfidf_vectors = vectorizer.fit_transform(df['sentence'])

### TAG Features

In [52]:
import spacy

In [53]:
nlp = spacy.load('en')

In [54]:
ner_tagset = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
              'WORK_OF_ART', 'LAW', 'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
              'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

pos_tagset = ['-LRB-', '-LRB-', ',', ':', '\'\'', '""', '#', '``', '$', 
              'ADD', 'AFX', 'BES', 'CC', 'CD', 'DT', 'EX', 'FW', 'GW',
              'HVS', 'HYPH', 'IN',
              'JJ', 'JJR', 'JJS', 'LS', 'MD',
              'NFP', 'NIL', 'NN', 'NNS', 'NNP', 'NNPS',
              'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', '_SP',
              'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 
              'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX']

In [55]:
def bag_of_tags(sentence, ner_tagset, pos_tagset):
    enriched_sentence = nlp(sentence)
    pos_tags = [token.tag_ for token in enriched_sentence if not token.is_stop]
    entities = [token.label_ for token in enriched_sentence.ents]
    
    
    # frequency word count
    ner_bag = np.zeros(len(ner_tagset))
    for ent in entities:
        for i, entity in enumerate(ner_tagset):
            if ent==entity:
                ner_bag[i] += 1
                
    pos_bag = np.zeros(len(pos_tagset))            
    for pos in pos_tags:
        for i, postag in enumerate(pos_tagset):
            if pos==postag:
                pos_bag[i] += 1            
    
    return np.concatenate((np.array(ner_bag), np.array(pos_bag)))

In [56]:
tag_features = []
for i,row in df.iterrows():
    tag_features.append(bag_of_tags(row['sentence'], ner_tagset, pos_tagset))

### Combining TF-IDF Vectors and Named Entity Features

In [57]:
from scipy.sparse import hstack

In [58]:
X = hstack((tfidf_vectors, np.array(tag_features)))

#### Saving Feature Vectors

In [61]:
import pickle

In [62]:
feature_path = 'Data/features_mindf_0001_maxdf_6_number_stopwords_pos_ner.pickle'

In [63]:
with open(feature_path, 'wb') as file_:
    pickle.dump(X, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [64]:
X = pd.read_pickle(feature_path)

#### Memory Cleaning

In [66]:
del tfidf_vectors
del df
del tag_features
del nlp

# Classifier Training
- With hyper-parameter optimization

https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin

In [67]:
opt_results = {}
opt_results_path = 'Results/optimization_results_tfidf_ner_pos_IPYNB.pickle'

#### Important Note on the Scoring of Parameter Optimization
GridSearchCV uses mean accuracy by default. However, we have chosen "f1_macro" scoring for hyper-parameter optimization, because mean accuracy or f1_micro is measuring the performance on the total labels, disregarding the type of the label. So, when we use accuracy or f1_micro, we get high scores because most of the labels are 0 and classifier predicts most of the labels as 0. When the accuracy score for the label 0 is high, the overall result becomes high as well, eventhough the other labels perform low. And this kind of high score doesn't mean our classifier performs better because we are actually interested in getting high scores on label 1 and 2.


### Decision Tree

In [69]:
from sklearn.tree import DecisionTreeClassifier

In [72]:
classifier = Pipeline([
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', DecisionTreeClassifier(criterion='gini'))
        ])

In [73]:
# Decision Tree
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__max_depth': [None] + [*range(15, 35, 5)],
    'clf__min_samples_split': [*range(50, 200, 40)],
    'clf__min_samples_leaf': [*range(5, 14, 2)],
    'clf__max_features': [None, 'sqrt', 'log2']
}

dt_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
dt_clf = dt_clf.fit(X, y)

In [74]:
print('Best Estimator')
print(dt_clf.best_estimator_)
print('Best Score')
print(dt_clf.best_score_)
print('Best Params')
print(dt_clf.best_params_)

Best Estimator
Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=100, score_func=<function chi2 at 0x2b880eab66a8>)), ('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=30,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=50,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])
Best Score
0.5145538309167568
Best Params
{'clf__min_samples_split': 50, 'clf__max_depth': 30, 'feat_sel__percentile': 100, 'clf__min_samples_leaf': 5, 'clf__max_features': None}


In [75]:
y_true, y_pred = y, cross_val_predict(dt_clf.best_estimator_, X, y, n_jobs=n_jobs, cv=5)

In [76]:
opt_results['DecisionTree'] = {}
opt_results['DecisionTree']['GridSearchCV'] = dt_clf
opt_results['DecisionTree']['classif_report'] = classification_report(y_true, y_pred)

In [77]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.94      0.92      6876
         1.0       0.55      0.43      0.49      1299
         2.0       0.47      0.09      0.15       162

   micro avg       0.85      0.85      0.85      8337
   macro avg       0.64      0.49      0.52      8337
weighted avg       0.83      0.85      0.83      8337



In [78]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

0.8481468154012235
{'max_depth': 20, 'min_samples_split': 100, 'max_features': None, 'min_samples_leaf': 9}
699
<function _passthrough_scorer at 0x2ba712f58950>
5
0.30017995834350586

### RandomForestClassifier

In [79]:
from sklearn.ensemble import RandomForestClassifier

In [80]:
classifier = Pipeline([
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', RandomForestClassifier(criterion='gini'))
        ])

In [83]:
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__n_estimators': [30, 70, 100, 150], 
    'clf__max_depth': [None] + [*range(65, 120, 15)], 
    'clf__min_samples_split': [25, 30, 40, 45, 50, 100],
    'clf__max_features': ['sqrt', 'log2'],
    'clf__bootstrap': [True, False]
}

rf_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
rf_clf = rf_clf.fit(X, y)

In [84]:
print('Best Estimator')
print(rf_clf.best_estimator_)
print('Best Score')
print(rf_clf.best_score_)
print('Best Params')
print(rf_clf.best_params_)

Best Estimator
Pipeline(memory=None,
     steps=[('feat_sel', SelectPercentile(percentile=10, score_func=<function chi2 at 0x2b880eab66a8>)), ('clf', RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=110, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_imp...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])
Best Score
0.44045504385156103
Best Params
{'clf__min_samples_split': 40, 'clf__bootstrap': False, 'clf__max_depth': 110, 'clf__n_estimators': 30, 'feat_sel__percentile': 10, 'clf__max_features': 'sqrt'}


In [85]:
y_true, y_pred = y, cross_val_predict(rf_clf.best_estimator_, X, y, n_jobs=n_jobs, cv=5)

In [86]:
opt_results['RandomForest'] = {}
opt_results['RandomForest']['GridSearchCV'] = rf_clf
opt_results['RandomForest']['classif_report'] = classification_report(y_true, y_pred)

  'precision', 'predicted', average, warn_for)


In [87]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.87      0.99      0.92      6876
         1.0       0.70      0.27      0.39      1299
         2.0       0.00      0.00      0.00       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.52      0.42      0.44      8337
weighted avg       0.82      0.86      0.82      8337



In [88]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

Best Estimator
RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=80, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=40,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Best Score
0.860501379393067
Best Params
{'bootstrap': False, 'max_features': 'auto', 'max_depth': 80, 'n_estimators': 30, 'min_samples_split': 40, 'min_samples_leaf': 1}

###  SVC

In [89]:
from sklearn.svm import SVC

In [90]:
classifier = Pipeline([
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', SVC())
        ])

In [None]:
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'clf__C': [0.025, 0.25, 0.5, 1, 2, 3],
}
          
svc_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring)
svc_clf = svc_clf.fit(X, y)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
print('Best Estimator')
print(svc_clf.best_estimator_)
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)

In [None]:
y_true, y_pred = y, cross_val_predict(svc_clf.best_estimator_, X, y, n_jobs=n_jobs, cv=5)

In [None]:
opt_results['SVC'] = {}
opt_results['SVC']['GridSearchCV'] = svc_clf
opt_results['SVC']['classif_report'] = classification_report(y_true, y_pred)

In [None]:
print(classification_report(y_true, y_pred))

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

0.8642197433129423
{'C': 2, 'gamma': 'auto', 'kernel': 'linear'} 'C':[0.025, 0.25, 0.5, 1, 2, 3, 5, 8, 10, 15, 20], 
48
<function _passthrough_scorer at 0x2ba712f58950>
5
9.254388332366943

### KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
classifier_knn = Pipeline([
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', KNeighborsClassifier())
        ])


In [None]:
params = {
    'feat_sel__percentile': (10, 90, 100),
    'clf__n_neighbors': [3, 5, 9, 13, 19, 25, 35, 55, 63], 
    'clf__leaf_size': [20, 30, 40, 50, 60],
    'clf__p': [1, 2, 3]
}
          
knn_clf = GridSearchCV(classifier_knn, params, cv=5, scoring=scoring, n_jobs=n_jobs)
knn_clf = knn_clf.fit(X.todense(), y)



In [None]:
print('Best Estimator')
print(knn_clf.best_estimator_)
print('Best Score')
print(knn_clf.best_score_)
print('Best Params')
print(knn_clf.best_params_)


In [None]:
y_true, y_pred = y, cross_val_predict(knn_clf.best_estimator_, X, y, n_jobs=n_jobs, cv=5)

In [106]:
opt_results.keys()

dict_keys(['SVC', 'KNeighbors', 'DecisionTree', 'RandomForest'])

In [None]:
burda kaldı

In [103]:
opt_results['KNeighbors'] = {}
opt_results['KNeighbors']['GridSearchCV'] = knn_clf
opt_results['KNeighbors']['classif_report'] = classification_report(y_true, y_pred)

In [104]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92      6876
         1.0       0.62      0.47      0.53      1299
         2.0       0.54      0.12      0.20       162

   micro avg       0.86      0.86      0.86      8337
   macro avg       0.69      0.52      0.55      8337
weighted avg       0.85      0.86      0.85      8337



In [105]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### MLPClassifier

In [112]:
from sklearn.neural_network import MLPClassifier

In [115]:
classifier = Pipeline([
      #  ('tfidf', TfidfVectorizer()),
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', MLPClassifier())
        ])

In [None]:
params = {
  #  'tfidf__max_df':(0.999, 0.60),
  #  'tfidf__min_df':(0.0009, 0.001, 0.003),
  #  'tfidf__stop_words': (None, number_stopwords),
    'feat_sel__percentile': (10, 90, 100),
    'clf__hidden_layer_sizes': [(10,5), (20,10), (20), (30,20), (50,30)], 
    'clf__activation': ['tanh', 'relu', 'logistic'], 
    'clf__learning_rate': ['constant', 'invscaling', 'adaptive'],
    'clf__learning_rate_init': [0.01, 0.001, 0.1],
    'clf__max_iter': [50, 200, 400]
}
                        
mlp_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
mlp_clf = mlp_clf.fit(X, y)

In [None]:
print('Best Estimator')
print(clf.best_estimator_)
print('Best Score')
print(clf.best_score_)
print('Best Params')
print(clf.best_params_)

In [None]:
y_true, y_pred = y, cross_val_predict(mlp_clf.best_estimator_, X, y, n_jobs=n_jobs, cv=5)

In [None]:
opt_results['MLP'] = {}
opt_results['MLP']['GridSearchCV'] = mlp_clf
opt_results['MLP']['classif_report'] = classification_report(y_true, y_pred)

In [None]:
print(classification_report(y_true, y_pred))

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

### Kmeans

# All classifiers