In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics,naive_bayes
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

nlp = spacy.load("en_core_web_sm")

### Reading the training data and validation data

In [2]:
train_data_file = 'train-data-prepared.json'
val_data_file = 'val-data-prepared.json'

df_train = pd.read_json(train_data_file)
df_val = pd.read_json(val_data_file)
# display(df_train)

### Function: Pre-process the text(lower casing, lead/train space stripping,remove punctuations)

In [3]:

def PreprocessData(df):
    df['clean_text'] = df['text'].str.lower()
    df['clean_text'] = df['clean_text'].str.strip()
    df['clean_text'] = df['clean_text'].str.replace('[^?\w\s]','')
    return df


### Function: Linguistic Feature Extraction

In [4]:
def FeatureExtract(df):
    
    df['token_count'] = df['clean_text'].apply(lambda x: len(nlp(x)))

    md_exist_list = [] # modal verb
    verb_count_list = [] # verb
    prp_exist_list = [] # personal pronouns
    vbp_exist_list = [] # verb non 3rd person
    adv_exist_list = [] # adverb
    conj_exist_list = [] #conjunction/preposition
    adj_exist_list = [] #adjective
    disc_exist_list = [] # discourse markers
    question_list = [] # question text

    for text in df['clean_text']:
        doc = nlp(text)
        md_exist = 0
        verb_count = 0
        prp_exist = 0
        vbp_exist = 0
        adv_exist = 0
        conj_exist = 0
        adj_exist = 0
        disc_exist = 0
        question = 0

        for token in doc:
            if token.tag_ == 'MD':
                md_exist = 1

            if token.pos_ == 'VERB':
                verb_count = 1

            if token.tag_ == 'PRP':
                prp_exist = 1

            if token.tag_ == 'VBP':
                vbp_exist = 1

            if token.pos_ == 'ADV':
                adv_exist = 1
            
            if token.dep_ == 'prep':
                conj_exist = 1
            
            if token.pos_ == 'ADJ':
                adj_exist = 1
            
            if token.tag_ == 'IN' and token.pos_!='ADP':
                disc_exist = 1
            
            if token.text == '?':
                question = 1

        md_exist_list.append(md_exist)
        verb_count_list.append(verb_count)
        prp_exist_list.append(prp_exist)
        vbp_exist_list.append(vbp_exist)
        adv_exist_list.append(adv_exist)
        conj_exist_list.append(conj_exist)
        adj_exist_list.append(adj_exist)
        disc_exist_list.append(disc_exist)
        question_list.append(question)

    df['modal_exist'] = md_exist_list
    df['verb_count'] = verb_count_list
    df['prp_exist'] = prp_exist_list
    df['vbp_exist'] = vbp_exist_list
    df['adv_exist'] = adv_exist_list
    df['conj_exist'] = conj_exist_list
    df['adj_exist'] = adj_exist_list
    df['disc_exist'] = disc_exist_list
    df['question'] = question_list
    return df


### Preprocess and feature extraction: Train and Val set

In [5]:
df_train = PreprocessData(df_train)
df_train = FeatureExtract(df_train)

df_val = PreprocessData(df_val)
df_val = FeatureExtract(df_val)
# display(df_train)
# print("next \n")
# display(df_val)

In [6]:
df_train.head(20)

Unnamed: 0,id,text,label,clean_text,token_count,modal_exist,verb_count,prp_exist,vbp_exist,adv_exist,conj_exist,adj_exist,disc_exist,question
0,1363ac6f16e232a1d8e9343d975ebe10,Since communism has been relegated to just a h...,0,since communism has been relegated to just a h...,11,0,1,0,0,1,1,0,0,0
1,6b925ce5eeb8b690b35972abafcb7c60,Can you counter that?,0,can you counter that?,5,1,1,1,0,0,0,0,0,1
2,99977c6e63734add1c1b600be79b3342,Censorship does not eliminate the censored ind...,0,censorship does not eliminate the censored ind...,7,0,1,0,0,1,0,0,0,0
3,7e34d9e198bc9e12f0868793c68d32f0,"Without the extra population from abortions, h...",0,without the extra population from abortions ha...,38,0,1,0,1,1,1,1,0,0
4,629d09668c3339dba831f6c81a307b0e,I can't stand it,1,i cant stand it,5,1,1,1,0,1,0,0,0,0
5,cd07a9b7ff61a13ca450f37f61416885,This is much more important than it's faithful...,1,this is much more important than its faithfuln...,12,0,1,0,0,1,1,1,0,0
6,99be8d453162bd56e93c932640600d8f,Every thought stands of its own individual mer...,0,every thought stands of its own individual merits,8,0,1,0,0,0,1,1,0,0
7,38763fbb689db60dcbb8adbc97fa1903,you don't have to necessarily think you're bet...,0,you dont have to necessarily think youre bette...,18,0,1,1,1,1,1,1,0,0
8,68244b723d41f2fe4a6d475e7dee585e,You don't understand the signals you're sendin...,1,you dont understand the signals youre sending ...,28,0,1,1,1,1,1,1,0,0
9,0867cb40fc364af10304cb962558d75e,neighbors went out but no one had the head to ...,0,neighbors went out but no one had the head to ...,47,0,1,1,1,1,1,0,0,0


### Feature Extraction using Bag of Words(uni and bigrams): Train and Val set

In [7]:
vectorizer = CountVectorizer(ngram_range=(1,2))
feature_matrix_bow = vectorizer.fit_transform(df_train['clean_text'])


df_features_bow = pd.DataFrame(feature_matrix_bow.toarray())
# df_features_text = df_train[["token_count","modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features_text = df_train[["modal_exist","verb_count","prp_exist","vbp_exist","adv_exist","conj_exist","adj_exist",
                             "disc_exist"]]
df_features = pd.concat([df_features_bow,df_features_text],axis = 1)

x_train = df_features.values
y_train = df_train['label'].values

# display(df_features)
# print(feature_matrix_bow.shape)
print(x_train.shape)
print(y_train.shape)

(2619, 28021)
(2619,)


In [8]:
# Transforming the Val/Test dataset with the vectorizer object

feature_matrix_bow_val = vectorizer.transform(df_val['clean_text'])

df_features_bow_val = pd.DataFrame(feature_matrix_bow_val.toarray())
# df_features_text_val = df_val[["token_count","modal_exist","verb_count","prp_exist","vbp_exist","adv_exist"]]
df_features_text_val = df_val[["modal_exist","verb_count","prp_exist","vbp_exist","adv_exist","conj_exist","adj_exist",
                              "disc_exist"]]
df_features_val = pd.concat([df_features_bow_val,df_features_text_val],axis = 1)

x_val = df_features_val.values
y_val = df_val['label'].values
print(x_val.shape)
print(y_val.shape)

(349, 28021)
(349,)


### Classifer training and predict: Logisitc Regression

In [9]:
# c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': c_space, 'penalty':['l2']}

# classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
# classifier_CV = GridSearchCV(classifier, param_grid, cv=5)
# classifier_CV.fit(x_train,y_train)

## Gridsearch (you dont have to run this)

In [10]:
# parameters = {'C':[1.0, 10.0, 100.0],'gamma':[0.1, 0.01]}
# classifier = SVC()

# classifier_CV=GridSearchCV(classifier, parameters, cv=3)
# classifier_CV.fit(x_train, y_train)

In [11]:
# classifier_CV.best_params_

In [12]:
# predictions = classifier_CV.predict(x_val)

In [13]:
# metrics.f1_score(y_val,predictions)

## Actual training with the best hyperparameters combination

In [14]:
classifier = SVC(C=10.0, gamma=0.01)
classifier.fit(x_train,y_train)

SVC(C=10.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
predictions = classifier.predict(x_val)

In [16]:
metrics.f1_score(y_val,predictions)

0.5042735042735041

In [17]:
# Extracting IDs -> Generating a dictionary -> Creating a JSON output file with predictions

val_data_id = df_val['id'].values
pred_val = dict(zip(val_data_id, predictions))

with open('pred_out.json', 'w') as fp:
    json.dump(pred_val,fp,default=str)