In [33]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
import re
from collections import Counter

### Reading the training data and pre-processing the text (lower-casing, stripping leading/trailing whitespace, removing punctuation)

In [59]:
# Path to the prepared training set (JSON). It is relative, so it is resolved
# against the kernel's current working directory.
train_data_file = 'train-data-prepared.json'
# Parse the JSON file into a DataFrame; later cells read its 'text' and 'label' columns.
df_train = pd.read_json(train_data_file)
# display(df_train)

In [60]:
# Build the normalised `clean_text` column from the raw `text` column:
#   1. lower-case,
#   2. strip leading/trailing whitespace,
#   3. delete every character that is neither a word character nor whitespace.
# The pattern is a raw string so Python does not warn about the `\w` / `\s`
# escapes, and `regex=True` is explicit because pandas >= 2.0 treats the pattern
# as a literal string by default, which would silently leave punctuation in place.
df_train['clean_text'] = (
    df_train['text']
    .str.lower()
    .str.strip()
    .str.replace(r'[^\w\s]', '', regex=True)
)
display(df_train)

Unnamed: 0,id,text,label,clean_text
0,1363ac6f16e232a1d8e9343d975ebe10,Since communism has been relegated to just a h...,0,since communism has been relegated to just a h...
1,6b925ce5eeb8b690b35972abafcb7c60,Can you counter that?,0,can you counter that
2,99977c6e63734add1c1b600be79b3342,Censorship does not eliminate the censored ind...,0,censorship does not eliminate the censored ind...
3,7e34d9e198bc9e12f0868793c68d32f0,"Without the extra population from abortions, h...",0,without the extra population from abortions ha...
4,629d09668c3339dba831f6c81a307b0e,I can't stand it,1,i cant stand it
...,...,...,...,...
2614,452684ca088e4ab1c58d89e1a28e1ef7,"it's ""true"" or not and that ""truth"" is availab...",0,its true or not and that truth is available to...
2615,b5fa34bde09f97ab565ec1ae433d1797,And these slogans don't even denote any sense ...,0,and these slogans dont even denote any sense o...
2616,f676671baa396678dcb3471ea67e70ed,&gt;whole-bodyWhile,0,gtwholebodywhile
2617,a4e0fa2814bb40ae76750ea1597084de,that the majority of them are affected negativ...,1,that the majority of them are affected negativ...


In [61]:
# Load spaCy's small English pipeline; the feature-extraction cell below reads
# token.tag_ and token.pos_ from the documents it produces.
# NOTE(review): presumably requires the `en_core_web_sm` model package to be
# installed in the kernel's environment — confirm before a fresh run.
nlp = spacy.load("en_core_web_sm")

### Feature Extraction

In [62]:
# Extract token-level features from `clean_text` and add them as DF columns.
#
# Each text is parsed exactly once: `nlp.pipe` streams the texts through the
# pipeline in batches, and the token count is taken from the same Doc that the
# tag/POS features are read from (previously every text was parsed twice).
#
# Columns added to df_train:
#   token_count  - number of tokens in the parsed text
#   modal_exist  - 1 if any token has fine-grained tag 'MD', else 0
#   verb_count   - number of tokens whose coarse POS is 'VERB'
#   prp_exist    - 1 if any token has fine-grained tag 'PRP', else 0
#   vbp_exist    - 1 if any token has fine-grained tag 'VBP', else 0
#   adv_exist    - 1 if any token has coarse POS 'ADV', else 0

token_count_list = []
md_exist_list = [] # modal verb
verb_count_list = [] # verb
prp_exist_list = [] # personal pronouns
vbp_exist_list = [] # verb non 3rd person
adv_exist_list = [] # adverb

for doc in nlp.pipe(df_train['clean_text']):
    # Collect the fine-grained tags as a set (membership tests only) and the
    # coarse POS labels as a list (VERB occurrences must be counted).
    fine_tags = {token.tag_ for token in doc}
    coarse_pos = [token.pos_ for token in doc]

    token_count_list.append(len(doc))
    md_exist_list.append(int('MD' in fine_tags))
    verb_count_list.append(coarse_pos.count('VERB'))
    prp_exist_list.append(int('PRP' in fine_tags))
    vbp_exist_list.append(int('VBP' in fine_tags))
    adv_exist_list.append(int('ADV' in coarse_pos))

# Lists are assigned positionally, so the DataFrame index does not matter here.
df_train['token_count'] = token_count_list
df_train['modal_exist'] = md_exist_list
df_train['verb_count'] = verb_count_list
df_train['prp_exist'] = prp_exist_list
df_train['vbp_exist'] = vbp_exist_list
df_train['adv_exist'] = adv_exist_list

In [66]:
# Inspect the engineered features. To analyse one feature against the label,
# uncomment the filter on the last line and swap in the column of interest
# (as written it selects rows with label 1 whose adv_exist flag is non-zero).

display(df_train)
# df_train.loc[(df_train['adv_exist'] != 0) & (df_train['label'] == 1)]

Unnamed: 0,id,text,label,clean_text,token_count,modal_exist,verb_count,prp_exist,vbp_exist,adv_exist
4,629d09668c3339dba831f6c81a307b0e,I can't stand it,1,i cant stand it,5,1,1,1,0,1
5,cd07a9b7ff61a13ca450f37f61416885,This is much more important than it's faithful...,1,this is much more important than its faithfuln...,12,0,1,0,0,1
8,68244b723d41f2fe4a6d475e7dee585e,You don't understand the signals you're sendin...,1,you dont understand the signals youre sending ...,28,0,7,1,1,1
11,bf43ec7553984de758fa1e26896607ae,you've altered it in such a way that I now fee...,1,youve altered it in such a way that i now feel...,27,0,4,1,1,1
14,3f1c5eb463a9a0ec84aee55db345802f,I wonder how you've make such good friends and...,1,i wonder how youve make such good friends and ...,21,0,5,1,1,1
...,...,...,...,...,...,...,...,...,...,...
2603,b459091930d06b8d9b114f225f0f10af,they are extremely overhyped,1,they are extremely overhyped,4,0,2,1,1,1
2604,958120c6a5f8da832e2789668ac12657,(2) Attaining fame doesn't matter and doesn't ...,1,2 attaining fame doesnt matter and doesnt trul...,13,0,5,1,0,1
2611,18070b68e7c3dacd10b8f1f528b772c1,if someone is generally in a better state iden...,1,if someone is generally in a better state iden...,30,1,7,1,0,1
2612,3a6af158aa2f8e4b652b37e966180941,when the alternative is Adventure Time or My L...,1,when the alternative is adventure time or my l...,28,0,5,1,1,1


In [51]:
# Scratch cell (junk code for testing): check how spaCy tags a hand-written sentence.

text = "him personally believe had would feel death penalty should be abolished"
doc1 = nlp(text)

# Show the coarse POS and fine-grained tag of every token, one per line.
for token in doc1:
    print(token.pos_, token.tag_)

# Number of tokens tagged as modal verbs ('MD').
md_count = sum(1 for tok in doc1 if tok.tag_ == 'MD')
print(md_count)

# Last expression of the cell: the description of the 'VBP' tag.
spacy.explain('VBP')

# end of scratch cell

PRON PRP
ADV RB
VERB VBP
VERB VBD
VERB MD
VERB VB
NOUN NN
NOUN NN
VERB MD
VERB VB
VERB VBN
2


'verb, non-3rd person singular present'