In [1]:
import pandas as pd
import numpy as np

In [2]:
import spacy

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the adjudicated sentence-classification spreadsheet into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_excel('Data/20181001-newindianexpress_sentence_classification_adjudicated_20181218.xlsx')

In [5]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,url,sent_num,sentence,text,label
0,http://www.newindianexpress.com/states/andhra-...,1-1,Anantapur Lawyers to Revive Samaikya Agitation...,Anantapur Lawyers to Revive Samaikya Agitation...,2.0
1,http://www.newindianexpress.com/states/andhra-...,1-2,A meeting of lawyers was held in the town on S...,Anantapur Lawyers to Revive Samaikya Agitation...,0.0
2,http://www.newindianexpress.com/states/andhra-...,1-3,"Speaking to reporters later, advocates’ leader...",Anantapur Lawyers to Revive Samaikya Agitation...,0.0
3,http://www.newindianexpress.com/states/andhra-...,1-4,“Bifurcation is being done without taking into...,Anantapur Lawyers to Revive Samaikya Agitation...,0.0
4,http://www.newindianexpress.com/states/andhra-...,1-5,The Bill did not contain any proposals for dev...,Anantapur Lawyers to Revive Samaikya Agitation...,0.0


In [6]:
# Boolean row mask: True where `label` holds a value, False where it is NaN.
mask = ~np.isnan(np.asarray(df['label']))

In [7]:
# Keep only the rows selected by the boolean mask.
cleandf = df.loc[mask]

In [8]:
# Sample sentence: the row whose index label is 0 (label-based lookup).
sent = cleandf.loc[0, 'sentence']

In [9]:
# Load spaCy's English pipeline and parse the sample sentence with it.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x; spaCy 3
# needs a full package name such as 'en_core_web_sm' — confirm the pinned version.
nlp = spacy.load('en')
doc = nlp(sent)

In [10]:
# Show each token's lemma next to its surface text.
for token in doc:
    print(token.lemma_, token.text)

anantapur Anantapur
lawyer Lawyers
to to
revive Revive
samaikya Samaikya
agitation Agitation
, ,
to to
lay Lay
siege Siege
to to
courts Courts
today Today
27th 27th
january January
2014 2014
09:58 09:58
am AM
renew Renewing
the the
samikyandhra Samikyandhra
agitation agitation
, ,
the the
lawyer lawyers
of of
the the
district district
have have
decide decided
to to
take take
on on
a a
series series
of of
protest protests
start starting
with with
lay laying
siege siege
to to
court courts
on on
monday Monday
. .


## NLP pipeline

In [11]:
# Pipeline object used by the feature-extraction helpers defined below.
# NOTE(review): this repeats the spacy.load('en') call from the earlier cell;
# the second load is redundant when the notebook is run top to bottom.
nlp = spacy.load('en')

In [12]:
# http://www.insightsbot.com/blog/R8fu5/bag-of-words-algorithm-in-python-introduction
def extract_words(sentence):
    """Return the normalised lemmas of ``sentence``.

    The sentence is parsed with the module-level spaCy pipeline ``nlp``.
    Each token's lemma is lower-cased and stripped of surrounding
    whitespace; lemmas that end up empty (whitespace-only tokens) or that
    appear in the ignore set are dropped.

    Args:
        sentence: Text to analyse; passed unchanged to ``nlp``.

    Returns:
        list of str: Normalised lemmas in token order, duplicates kept.
    """
    ignore_words = {'a'}
    words_cleaned = []
    for token in nlp(sentence):
        lemma = token.lemma_.lower().strip()
        # Filter on the normalised form: testing the raw lemma lets cased or
        # padded variants of an ignored word through, and whitespace tokens
        # would otherwise contribute an empty-string "word".
        if lemma and lemma not in ignore_words:
            words_cleaned.append(lemma)
    return words_cleaned

def tokenize_sentences(sentences):
    """Build the sorted vocabulary of distinct words found in ``sentences``.

    Args:
        sentences: Iterable of texts; each one is passed to ``extract_words``.

    Returns:
        list: Every distinct word returned by ``extract_words`` across all
        sentences, in ascending sorted order.
    """
    unique_words = set()
    for text in sentences:
        unique_words.update(extract_words(text))
    return sorted(unique_words)

def bagofwords(sentence, words):
    """Count how often each vocabulary entry occurs in ``sentence``.

    Args:
        sentence: Text to vectorise; tokenised through ``extract_words``.
        words: Sequence of vocabulary entries. Position ``i`` of the result
            counts occurrences of ``words[i]``.

    Returns:
        numpy.ndarray: Float vector of length ``len(words)`` holding the
        frequency counts. Words missing from ``words`` are ignored.
    """
    # Index the vocabulary once so each sentence word costs a dictionary
    # lookup instead of a scan over the whole vocabulary. Positions are kept
    # in a list so a vocabulary with repeated entries is counted at every
    # matching position.
    positions = {}
    for i, word in enumerate(words):
        positions.setdefault(word, []).append(i)

    bag = np.zeros(len(words))
    for sw in extract_words(sentence):
        for i in positions.get(sw, ()):
            bag[i] += 1
    return bag

# Build the vocabulary from every sentence in `df`, then cache it on disk so
# the tokenisation pass does not have to be repeated.
vocab = tokenize_sentences(df['sentence'])

import pickle
with open('vocabulary.pickle', 'wb') as vocab_file:
    vocab_file.write(pickle.dumps(vocab, protocol=pickle.HIGHEST_PROTOCOL))

In [16]:
# Reload the vocabulary from the vocabulary.pickle cache file.
vocab = pd.read_pickle("./vocabulary.pickle")

In [17]:
# Inspect the sentence stored under index label 8 (label-based lookup).
cleandf.loc[8, 'sentence']

'State bar Association vice-president A Rami Reddy said that they will organise a mass fast at the Clock Tower in the town on January 30 and asked the Bar association members and lawyers from the district to join them.'

In [18]:
# Sanity check: bagofwords returns one slot per vocabulary entry, so this
# length equals len(vocab).
len(bagofwords(cleandf['sentence'][8],vocab))

25904

In [19]:
# Renumber the filtered rows from 0; drop=True discards the old index instead
# of keeping it as a column.
cleandf = cleandf.reset_index(drop=True)

In [20]:
# Fit a TF-IDF vectoriser with default settings and transform the sentences.
# Note that it is fitted on `df['sentence']` (all rows), not on `cleandf`.
vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df['sentence'])
# Bare expression so the notebook displays the sparse-matrix summary.
tfidf_vectors

<23453x28405 sparse matrix of type '<class 'numpy.float64'>'
	with 494526 stored elements in Compressed Sparse Row format>

In [60]:
# Entity labels counted by bagofentities; the order fixes the layout of the
# entity-count vector.
ner_tagset = [
    "PERSON", "NORP", "FAC", "ORG", "GPE", "LOC",
    "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE",
    "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY",
    "ORDINAL", "CARDINAL",
]

In [61]:
def extract_entities(sentence):
    """Return the labels of the named entities spaCy finds in ``sentence``.

    Args:
        sentence: Text to analyse; parsed with the module-level ``nlp``.

    Returns:
        list of str: One ``label_`` per entity in ``doc.ents``, in document
        order, duplicates kept.
    """
    # Parse the argument itself. Reading a global `row` here would make the
    # result depend on whatever loop variable happens to be in scope and fail
    # with NameError when none is.
    doc = nlp(sentence)
    return [ent.label_ for ent in doc.ents]

def bagofentities(sentence, ner_tagset):
    """Count how often each entity label occurs in ``sentence``.

    Args:
        sentence: Text to analyse; labels come from ``extract_entities``.
        ner_tagset: Sequence of entity labels. Position ``i`` of the result
            counts the entities whose label equals ``ner_tagset[i]``.

    Returns:
        numpy.ndarray: Float vector of length ``len(ner_tagset)``.
    """
    found_labels = list(extract_entities(sentence))
    # One frequency per tag, in tag-set order; labels outside the tag set
    # contribute nothing.
    return np.array([float(found_labels.count(tag)) for tag in ner_tagset])

Dictionary variant of the feature list (keyed by row index):
# Map each row index of `cleandf` to its feature vector: bag-of-words counts
# followed by entity-label counts.
# The mapping has to exist before the loop assigns into it. It is named
# `feautures_list` because that is the name the pickle-dump cell and the
# `X = list(feautures_list.values())` cell read.
feautures_list = {}
for i, row in cleandf.iterrows():
    feautures_list[i] = np.concatenate((
        bagofwords(row['sentence'], vocab),
        bagofentities(row['sentence'], ner_tagset),
    ))

In [91]:
# Start an empty list for collecting per-sentence feature vectors.
feauture_list = []

In [None]:
# Append one feature vector per row of `cleandf`: bag-of-words counts
# followed by entity-label counts. The row index is not needed here.
for _, row in cleandf.iterrows():
    feauture_list.append(np.concatenate((
        bagofwords(row['sentence'], vocab),
        bagofentities(row['sentence'], ner_tagset),
    )))

In [64]:
# Cache the computed features on disk.
with open('feauture_list.pickle', 'wb') as handle:
    pickle.dump(feautures_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload under the same name that was dumped and that the later
# `X = list(feautures_list.values())` cell reads, so the cache can stand in
# for the computation.
# NOTE: unpickling can execute arbitrary code; only load files you created.
feautures_list = pd.read_pickle("./feauture_list.pickle")

## Classifier Comparison

In [95]:
# necessary imports,
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [96]:
# All classifiers
# Candidate classifiers, each written next to its display name so the two
# lists cannot drift out of step. zip(*pairs) transposes the (name, model)
# pairs into the parallel `names` and `classifiers` lists.
names, classifiers = map(list, zip(*[
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]))

In [97]:
# Feature vectors: the values stored in the `feautures_list` mapping.
X = list(feautures_list.values())
# Targets: the `label` column of `cleandf` as a NumPy array.
y = np.array(cleandf['label'])

In [90]:
# 5-fold cross-validation of a linear-kernel SVM (C=1) on X and y.
clf = SVC(kernel='linear', C=1)
scores = cross_val_score(clf, X, y, cv=5)

In [93]:
# Display the per-fold scores.
scores

array([0.85440383, 0.85131894, 0.84463107, 0.85962807, 0.87334934])

In [98]:
# Open the results log in write mode (truncates any existing file).
# NOTE(review): the handle is not closed anywhere in the visible notebook;
# close it once all results are written.
result_file = open("scikit_learn_results.txt","w") 


In [99]:
# Evaluate the first two classifiers with cross-validated predictions and log
# weighted precision / recall / F1 plus accuracy to `result_file`.
for name, classifier in zip(names[:2], classifiers[:2]):
    y_true, y_pred = y, cross_val_predict(classifier, X, y)
    # file.write() takes a single string; print(..., file=...) accepts several
    # values, converts them to text and ends each record with a newline.
    print('\n---', name, '---', file=result_file)
    print('Precision:', precision_score(y_true, y_pred, average='weighted'), file=result_file)
    print('Recall:', recall_score(y_true, y_pred, average='weighted'), file=result_file)
    print('F1-score:', f1_score(y_true, y_pred, average='weighted'), file=result_file)
    print('Accuracy:', accuracy_score(y_true, y_pred), file=result_file)
    # Flush after every classifier so finished results reach the disk even if
    # a later, slower fit is interrupted.
    result_file.flush()



KeyboardInterrupt: 