In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report

In [4]:
import spacy

## Load Data

In [81]:
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [82]:
df = df[np.logical_not(np.isnan(np.array(df['label'])))]

In [83]:
for i, row in df.iterrows():
    row['sentence'] = re.sub("http\S*\s", "", row['sentence'])
    row['sentence'] = re.sub("((url)*\s*:\s*)*http\S*", "", row['sentence'])
    
    if row['sentence'] == "":
        df=df.drop([i])
        print('#####', i)
        

##### 13
##### 28
##### 41
##### 68
##### 107
##### 113
##### 122
##### 133
##### 143
##### 160
##### 179
##### 195
##### 210
##### 225
##### 233
##### 240
##### 252
##### 263
##### 273
##### 296
##### 319
##### 346
##### 354
##### 376
##### 390
##### 415
##### 424
##### 439
##### 477
##### 526
##### 544
##### 564
##### 571
##### 585
##### 601
##### 610
##### 628
##### 635
##### 685
##### 700
##### 711
##### 723
##### 733
##### 747
##### 753
##### 769
##### 773
##### 812
##### 822
##### 845
##### 854
##### 868
##### 908
##### 919
##### 933
##### 963
##### 978
##### 987
##### 1005
##### 1013
##### 1020
##### 1040
##### 1063
##### 1072
##### 1103
##### 1118
##### 1133
##### 1138
##### 1153
##### 1181
##### 1196
##### 1201
##### 1211
##### 1217
##### 1230
##### 1246
##### 1262
##### 1272
##### 1279
##### 1304
##### 1318
##### 1322
##### 1335
##### 1342
##### 1357
##### 1368
##### 1436
##### 1453
##### 1468
##### 1475
##### 1489
##### 1499
##### 1514
##### 1539
##### 1555
##### 1575
##### 

In [84]:
df1_sents= df['sentence']
df1_y= df['label']

In [85]:
df2 = pd.read_excel('Data/20181126_NewIndianExpress_3Classifier0_Baglan-Ezgi-Balacan-Eylem-Merged_AgreedByAll.xlsx')

In [86]:
df2 = df2[np.logical_not(np.isnan(np.array(df2['label'])))]

In [87]:
df2 = df2[(np.array(df2['label'] == 0))]

In [88]:
nlp = spacy.load('en')

In [89]:
df2_sents = []
df2_labels = []
for i, row in df2.iterrows():
    row['text'] = row['text'].replace('\n', ' ')
    row['text'] = row['text'].replace('&amp;', 'and')
    row['text'] = row['text'].replace('\xa0', ' ')
    row['text'] = row['text'].replace('-', '')
    row['text'] = row['text'].replace("\'", "'")
    row['text'] = re.sub("((url)*\s*:\s*)*http\S*", "", row['text'])
    row['text'] = row['text'].replace('  ', ' ')
    doc = nlp(row['text'])
    for sent in doc.sents:
        df2_sents.append(sent.text) 
        df2_labels.append(0)


In [90]:
type(df2_sents)

list

In [91]:
ordinals = ['1st', '2nd', '3rd', '4th']
for i in range(5,31):
    ordinals.append(str(i) + 'th')
    ordinals.append('0' + str(i) + 'th') 
    

In [92]:
number_stopwords = [str(i) for i in range(10001)] + ['0'+str(i) for i in range(100)] + ['000'] + ordinals

In [93]:
scoring = 'f1_macro'
n_jobs=10


In [94]:
sents = df1_sents.tolist() + df2_sents
y = df1_y.tolist() + df2_labels

# Feature Extraction

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

We can see that best TFIDFVectorizer features are as listed below;
    min_df: 0.0001
    max_df: 0.6
    stop_words: num_stopwords

In [96]:
heidel_folder_path =  'Heidel/heidel.txt'

In [99]:
len(sents)

15997

In [108]:
heidel_folder_paths = {}

In [113]:
for i in range(0,16):
    heidel_folder_paths[i] = 'Heidel/heidel_' + str(i) + '.txt'

In [114]:
heidel_folder_paths

{0: 'Heidel/heidel_0.txt',
 1: 'Heidel/heidel_1.txt',
 2: 'Heidel/heidel_2.txt',
 3: 'Heidel/heidel_3.txt',
 4: 'Heidel/heidel_4.txt',
 5: 'Heidel/heidel_5.txt',
 6: 'Heidel/heidel_6.txt',
 7: 'Heidel/heidel_7.txt',
 8: 'Heidel/heidel_8.txt',
 9: 'Heidel/heidel_9.txt',
 10: 'Heidel/heidel_10.txt',
 11: 'Heidel/heidel_11.txt',
 12: 'Heidel/heidel_12.txt',
 13: 'Heidel/heidel_13.txt',
 14: 'Heidel/heidel_14.txt',
 15: 'Heidel/heidel_15.txt'}

In [107]:
[i for i in range(0,16)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [116]:
for i in range(0,16):
    if i == 15:
        print(heidel_folder_paths[i])
        print(i*1000)
        print(len(sents))
        print()
    else:
        print(heidel_folder_paths[i])
        print(i*1000)
        print(1000+i*1000)
        print()

Heidel/heidel_0.txt
0
1000

Heidel/heidel_1.txt
1000
2000

Heidel/heidel_2.txt
2000
3000

Heidel/heidel_3.txt
3000
4000

Heidel/heidel_4.txt
4000
5000

Heidel/heidel_5.txt
5000
6000

Heidel/heidel_6.txt
6000
7000

Heidel/heidel_7.txt
7000
8000

Heidel/heidel_8.txt
8000
9000

Heidel/heidel_9.txt
9000
10000

Heidel/heidel_10.txt
10000
11000

Heidel/heidel_11.txt
11000
12000

Heidel/heidel_12.txt
12000
13000

Heidel/heidel_13.txt
13000
14000

Heidel/heidel_14.txt
14000
15000

Heidel/heidel_15.txt
15000
15997



In [117]:
for i in range(0,16):
    if i == 15:
        with open(heidel_folder_paths[i], 'w') as txt_file:
            for sentence in sents[i*1000:len(sents)]:
                txt_file.write(sentence + "\n")
    else:
        with open(heidel_folder_paths[i], 'w') as txt_file:
            for sentence in sents[i*1000:1000+i*1000]:
                txt_file.write(sentence + "\n")

# Classifier Training
- With hyper-parameter optimization

with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)



In [75]:
opt_results = {}
opt_results_path = 'Results/opt_results_new_feats.pickle'

In [76]:
from sklearn.svm import SVC

In [None]:
classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('feat_sel', SelectPercentile(score_func=chi2)),
        ('clf', SVC())
        ])

params = {
    'tfidf__max_df':(0.999, 0.8, 0.60),
    'tfidf__min_df':(0.0009, 0.001, 0.003),
    'tfidf__stop_words': (None, number_stopwords),
    'feat_sel__percentile': (10, 90, 100),
    'clf__kernel': ('linear', 'poly', 'rbf', 'sigmoid'), 
    'clf__C': [0.025, 0.25, 0.5, 1, 2, 3],
}

svc_clf = GridSearchCV(classifier, params, cv=5, scoring=scoring, n_jobs=n_jobs)
svc_clf = svc_clf.fit(sents, y)

print('Best Estimator')
print(svc_clf.best_estimator_)
print('Best Score')
print(svc_clf.best_score_)
print('Best Params')
print(svc_clf.best_params_)

In [None]:
y_true, y_pred = y, cross_val_predict(svc_clf.best_estimator_, sents, y, n_jobs=n_jobs, cv=5)

In [None]:
opt_results['SVC'] = {}
opt_results['SVC']['GridSearchCV'] = svc_clf
opt_results['SVC']['classif_report'] = classification_report(y_true, y_pred)

In [None]:
print(classification_report(y_true, y_pred))

In [None]:
with open(opt_results_path, 'wb') as file_:
    pickle.dump(opt_results, file_, protocol=pickle.HIGHEST_PROTOCOL)

In [94]:
rf_clf = opt_results['RandomForest']['GridSearchCV']

In [95]:
best_parameters['steps']

[('tfidf',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=0.8, max_features=None, min_df=0.001,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49'...', '087', '088', '089', '090', '091', '092', '093', '094', '095', '096', '097', '098', '099', '000'],
          strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('feat_sel',
  SelectPercentile(percentile=10, score_func=<function chi2 at 0x2baad0f1e510>)),
 ('clf',
  RandomForestClass

In [96]:
classifier_annot = Pipeline(best_parameters['steps'])
classifier_annot.fit(sents, y)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.001,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [98]:
classifier_annot

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=0.001,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [99]:
classifier_ = classifier_annot
#classifier_ = rf_clf.best_estimator_

In [102]:
rf.feature_importances_

array([0.00076621, 0.00062105, 0.00436315, 0.00163398, 0.00813422,
       0.00034185, 0.00260463, 0.01963313, 0.00294293, 0.00106912,
       0.00327247, 0.00679953, 0.00104059, 0.00208177, 0.00276737,
       0.00243369, 0.00184638, 0.00276646, 0.00129392, 0.00046192,
       0.01172079, 0.0296334 , 0.00946922, 0.01328174, 0.00038002,
       0.00616665, 0.00083558, 0.0010122 , 0.00192614, 0.00485616,
       0.00183375, 0.00053834, 0.01337948, 0.00980543, 0.00170072,
       0.00081354, 0.00517804, 0.00257506, 0.00369815, 0.00192206,
       0.00304843, 0.00101992, 0.00188035, 0.00148576, 0.00234951,
       0.00138303, 0.00181553, 0.00505528, 0.00064562, 0.00082124,
       0.00315405, 0.00598033, 0.00932815, 0.00090597, 0.0064534 ,
       0.00038646, 0.00341692, 0.00171774, 0.00542036, 0.00039004,
       0.00359552, 0.00228106, 0.00245555, 0.00847046, 0.00679797,
       0.00150084, 0.0013536 , 0.00579102, 0.00945547, 0.0009135 ,
       0.00066239, 0.00011416, 0.0010631 , 0.01325041, 0.00012

In [101]:
map(lambda x: round(x, 4), rf.feature_importances_)

<map at 0x2baadfdd7400>

In [105]:
rf = classifier_.get_params()['clf']
vectorizer = classifier_.get_params()['tfidf']
feature_names = vectorizer.get_feature_names()
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.05976619423242239, 'appealed'),
 (0.041789398970313356, 'assam'),
 (0.02963340297339374, '22nd'),
 (0.02790380263523603, 'all'),
 (0.024070875331055466, 'approached'),
 (0.023717870075104175, 'ahead'),
 (0.023159768508580286, 'approved'),
 (0.01963312596156911, '08th'),
 (0.01738233204773876, 'aiadmk'),
 (0.016062056562725257, 'apex'),
 (0.01337948499247388, 'abducted'),
 (0.013281736224749745, '24th'),
 (0.01325040926586392, 'adjournment'),
 (0.012788993631530228, 'authority'),
 (0.01220614198221545, 'are'),
 (0.011720791297551165, '21st'),
 (0.011095640031637597, 'allahabad'),
 (0.010496737656991478, 'article'),
 (0.010059464144808248, 'arrested'),
 (0.010015383582743157, 'babri'),
 (0.009805430723119393, 'abdul'),
 (0.009768830143929956, 'administrative'),
 (0.009478315816240812, 'allotted'),
 (0.00946922170867094, '23rd'),
 (0.00945547095373403, 'address'),
 (0.009328151138260757, 'acquisition'),
 (0.008716398956452041, 'appointed'),
 (0.008470457414195127, 'acts'),
 (0.0082453

In [146]:
def show_most_informative_features(vectorizer, clf, n=300):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        #print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        print(coef_1, fn_1, coef_2, fn_2)

In [129]:
with open(opt_results_path, 'rb') as file_:
    opt_results = pickle.load(file_)

svc_clf = opt_results['SVC']['GridSearchCV']



In [136]:
best_parameters_SVC = opt_results['SVC']['GridSearchCV'].best_estimator_.get_params()

In [138]:
best_parameters_SVC['steps']

[('tfidf',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=0.6, max_features=None, min_df=0.003,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=None, strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('feat_sel',
  SelectPercentile(percentile=100, score_func=<function chi2 at 0x2baad0f1e510>)),
 ('clf', SVC(C=3, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))]

In [139]:
classifier_annot_SVC = Pipeline(best_parameters_SVC['steps'])
classifier_annot_SVC.fit(sents, y)


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.6, max_features=None, min_df=0.003,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=T...r', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [150]:
feature_names = classifier_annot_SVC.get_params()['tfidf'].get_feature_names()
feature_names

['000',
 '01',
 '02',
 '02nd',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '19th',
 '20',
 '200',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '40',
 '45',
 '50',
 'about',
 'according',
 'accused',
 'across',
 'act',
 'action',
 'activists',
 'activities',
 'added',
 'adding',
 'additional',
 'addressing',
 'adjourned',
 'administration',
 'advani',
 'affairs',
 'affected',
 'afghan',
 'afghanistan',
 'after',
 'afternoon',
 'again',
 'against',
 'agency',
 'agitation',
 'ago',
 'agreed',
 'agreement',
 'ahead',
 'ahmed',
 'aiadmk',
 'air',
 'all',
 'allegations',
 'alleged',
 'allegedly',
 'alliance',
 'allowed',
 'along',
 'already',
 'also',
 'am',
 'amid',
 'among',
 'an',
 'and',
 'andhra',
 'anna',
 'announced',
 'another',
 'answer',
 'anti',
 'any',
 'ap',
 'apr',
 'april',
 'are',
 'area',


In [158]:
print(classifier_annot_SVC.get_params()['clf'].coef_)


  (0, 117)	-0.8402562780827214
  (0, 395)	-0.5419500958567038
  (0, 426)	-1.193128781835611
  (0, 495)	-0.5534364325246361
  (0, 540)	0.9681601932880497
  (0, 543)	-0.3200732094044696
  (0, 609)	-0.4369897935547442
  (0, 617)	-0.6646679931300241
  (0, 697)	-5.21351264679646
  (0, 840)	-0.7557923155878967
  (0, 891)	-0.6157461459508529
  (0, 918)	-0.3981021738203747
  (0, 933)	-0.1272065987361799
  (0, 942)	0.18451290113836194
  (0, 949)	-0.17259295634794825
  (0, 43)	-0.18093304339636163
  (0, 52)	0.618592685153807
  (0, 87)	0.07846646212033903
  (0, 119)	-1.2956449305839328
  (0, 155)	0.24838121371882416
  (0, 364)	1.3330358189280833
  (0, 460)	-1.1073076955998609
  (0, 635)	-0.888709688048074
  (0, 884)	0.4289145959290295
  (0, 955)	0.788505685000569
  :	:
  (2, 588)	0.36923460389005697
  (2, 912)	0.36923460389005697
  (2, 968)	0.3690627960813366
  (2, 396)	0.22899456664320028
  (2, 297)	-0.510106978705692
  (2, 853)	0.3065948525572051
  (2, 216)	0.4283617354449483
  (2, 895)	-0.5678

In [None]:
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))

In [147]:
show_most_informative_features(classifier_annot_SVC.get_params()['tfidf'], classifier_annot_SVC.get_params()['clf'])

  (0, 117)	-0.8402562780827214
  (0, 395)	-0.5419500958567038
  (0, 426)	-1.193128781835611
  (0, 495)	-0.5534364325246361
  (0, 540)	0.9681601932880497
  (0, 543)	-0.3200732094044696
  (0, 609)	-0.4369897935547442
  (0, 617)	-0.6646679931300241
  (0, 697)	-5.21351264679646
  (0, 840)	-0.7557923155878967
  (0, 891)	-0.6157461459508529
  (0, 918)	-0.3981021738203747
  (0, 933)	-0.1272065987361799
  (0, 942)	0.18451290113836194
  (0, 949)	-0.17259295634794825
  (0, 43)	-0.18093304339636163
  (0, 52)	0.618592685153807
  (0, 87)	0.07846646212033903
  (0, 119)	-1.2956449305839328
  (0, 155)	0.24838121371882416
  (0, 364)	1.3330358189280833
  (0, 460)	-1.1073076955998609
  (0, 635)	-0.888709688048074
  (0, 884)	0.4289145959290295
  (0, 955)	0.788505685000569
  :	:
  (0, 641)	-0.44548361070642795
  (0, 256)	-1.6544834788693112
  (0, 514)	0.2840366060374118
  (0, 586)	-0.024931187966646062
  (0, 706)	-0.021170782591918957
  (0, 861)	1.356665325610023
  (0, 353)	0.5307827496571951
  (0, 235)	0.