In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics,naive_bayes

nlp = spacy.load("en_core_web_sm")

### Reading the training data and validation data

In [2]:
train_data_file = 'train-data-prepared.json'
val_data_file = 'val-data-prepared.json'

df_train = pd.read_json(train_data_file)
df_val = pd.read_json(val_data_file)
# display(df_train)

### Function: Pre-process the text(lower casing, lead/train space stripping,remove punctuations)

In [3]:
def PreprocessData(df):
    df['clean_text'] = df['text'].str.lower()
    df['clean_text'] = df['clean_text'].str.strip()
    df['clean_text'] = df['clean_text'].str.replace('[^\w\s]','')
    return df


### Function: Linguistic Feature Extraction

In [4]:
def FeatureExtract(df):
    
    df['token_count'] = df['clean_text'].apply(lambda x: len(nlp(x)))

    md_exist_list = [] # modal verb
    verb_count_list = [] # verb
    prp_exist_list = [] # personal pronouns
    vbp_exist_list = [] # verb non 3rd person
    adv_exist_list = [] # adverb

    for text in df['clean_text']:
        doc = nlp(text)
        md_exist = 0
        verb_count = 0
        prp_exist = 0
        vbp_exist = 0
        adv_exist = 0

        for token in doc:
            if token.tag_ == 'MD':
                md_exist = 1

            if token.pos_ == 'VERB':
                verb_count = 1

            if token.tag_ == 'PRP':
                prp_exist = 1

            if token.tag_ == 'VBP':
                vbp_exist = 1

            if token.pos_ == 'ADV':
                adv_exist = 1

        md_exist_list.append(md_exist)
        verb_count_list.append(verb_count)
        prp_exist_list.append(prp_exist)
        vbp_exist_list.append(vbp_exist)
        adv_exist_list.append(adv_exist)

    df['modal_exist'] = md_exist_list
    df['verb_count'] = verb_count_list
    df['prp_exist'] = prp_exist_list
    df['vbp_exist'] = vbp_exist_list
    df['adv_exist'] = adv_exist_list
    return df

### Preprocess and feature extraction: Train and Val set

In [5]:
df_train = PreprocessData(df_train)
df_train = FeatureExtract(df_train)

df_val = PreprocessData(df_val)
df_val = FeatureExtract(df_val)
display(df_train)
# print("next \n")
display(df_val)

Unnamed: 0,id,text,label,clean_text,token_count,modal_exist,verb_count,prp_exist,vbp_exist,adv_exist
0,1363ac6f16e232a1d8e9343d975ebe10,Since communism has been relegated to just a h...,0,since communism has been relegated to just a h...,11,0,1,0,0,1
1,6b925ce5eeb8b690b35972abafcb7c60,Can you counter that?,0,can you counter that,4,1,1,1,0,0
2,99977c6e63734add1c1b600be79b3342,Censorship does not eliminate the censored ind...,0,censorship does not eliminate the censored ind...,7,0,1,0,0,1
3,7e34d9e198bc9e12f0868793c68d32f0,"Without the extra population from abortions, h...",0,without the extra population from abortions ha...,38,0,1,0,1,1
4,629d09668c3339dba831f6c81a307b0e,I can't stand it,1,i cant stand it,5,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
2614,452684ca088e4ab1c58d89e1a28e1ef7,"it's ""true"" or not and that ""truth"" is availab...",0,its true or not and that truth is available to...,15,0,1,1,0,1
2615,b5fa34bde09f97ab565ec1ae433d1797,And these slogans don't even denote any sense ...,0,and these slogans dont even denote any sense o...,22,0,1,0,1,1
2616,f676671baa396678dcb3471ea67e70ed,&gt;whole-bodyWhile,0,gtwholebodywhile,1,0,0,0,0,0
2617,a4e0fa2814bb40ae76750ea1597084de,that the majority of them are affected negativ...,1,that the majority of them are affected negativ...,12,0,1,1,1,1


Unnamed: 0,id,text,label,clean_text,token_count,modal_exist,verb_count,prp_exist,vbp_exist,adv_exist
0,aecfe3235c24d43c6f95ce59a16841c0,changing the US laws would not change internat...,1,changing the us laws would not change internat...,9,1,1,1,0,1
1,02c28fbc7ae79544ed9ae2e232627dd5,There will always be evil people that bring mi...,1,there will always be evil people that bring mi...,12,1,1,0,1,1
2,059e6da466032dd7de62d60443b62d02,Remember reasonable people actually voted Hitl...,0,remember reasonable people actually voted hitl...,8,0,1,0,0,1
3,6435772802042af0ee18a1b80ce727a7,", but did",0,but did,3,0,1,0,0,0
4,65595ce73b97078ef46c9d1dcb905a5c,", which",0,which,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
344,9614a90a4ab411f3765ec50befa9cef5,Land before Time has about 15 other movies in ...,1,land before time has about 15 other movies in ...,19,0,1,1,1,1
345,b49f77aa9914dbb7ae444b06472714c6,Look around at classes.,0,look around at classes,4,0,1,0,0,1
346,0cb29c202cfb23cebdfed2a62ccd2ba8,"With absolutely anything, there will always be...",0,with absolutely anything there will always be ...,16,1,1,0,1,1
347,75b3e187dda4c59b0380bc7d1056dc20,Perhaps my family (and quite a few other famil...,0,perhaps my family and quite a few other famili...,17,0,1,0,1,1


In [6]:
# this code is for analying the features. Uncomment the second line to test different features

# display(df_train)
# df_train.loc[(df_train['adv_exist'] != 0) & (df_train['label'] == 1)]

In [7]:
# this part contains junk code for testing

text = "him personally believe had would feel death penalty should be abolished plays"
doc1 = nlp(text)
md_count = 0
for token in doc1:
    print(token.pos_, token.tag_)
    if token.tag_ == 'MD':
        md_count = md_count + 1

print(md_count)

spacy.explain('VBG')

# junk code ends here

PRON PRP
ADV RB
VERB VBP
VERB VBD
VERB MD
VERB VB
NOUN NN
NOUN NN
VERB MD
VERB VB
VERB VBN
NOUN NNS
2


'verb, gerund or present participle'

### Feature Extraction using Bag of Words(uni and bigrams): Train and Val set

In [8]:
vectorizer = CountVectorizer(ngram_range=(1,2))
feature_matrix_bow = vectorizer.fit_transform(df_train['clean_text'])


df_features_bow = pd.DataFrame(feature_matrix_bow.toarray())
# df_features_text = df_train[["token_count","modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features_text = df_train[["modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features = pd.concat([df_features_bow,df_features_text],axis = 1)

x_train = df_features.values
y_train = df_train['label'].values

# display(df_features)
# print(feature_matrix_bow.shape)
print(x_train.shape)
print(y_train.shape)

(2619, 28029)
(2619,)


In [9]:
feature_matrix_bow_val = vectorizer.transform(df_val['clean_text'])

df_features_bow_val = pd.DataFrame(feature_matrix_bow_val.toarray())
# df_features_text_val = df_val[["token_count","modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features_text_val = df_val[["modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features_val = pd.concat([df_features_bow_val,df_features_text_val],axis = 1)

x_val = df_features_val.values
y_val = df_val['label'].values
print(x_val.shape)
print(y_val.shape)

(349, 28029)
(349,)


### Classifer training and predict: Logisitc Regression

In [10]:
classifier = LogisticRegression(random_state=0, max_iter = 1000)
classifier.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
predictions = classifier.predict(x_val)

In [18]:
metrics.f1_score(y_val,predictions)

0.4666666666666666

In [13]:
print(predictions)

[0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0
 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1
 1 1 0 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1
 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0
 1 0 0 0 1 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1
 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0
 1 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0]


In [14]:
print(y_val)

[1 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1
 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1
 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 0
 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0
 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1
 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 1 1
 1 0 1 0 1 0 1 0 0 1 1 1 0 0 0 0]
