In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler

# Load the `en_core_web_sm` spaCy pipeline once at module level; FeatureExtract
# calls this shared `nlp` object on every document.
nlp = spacy.load("en_core_web_sm")

### Reading the training data and validation data

In [2]:
# Relative paths of the prepared training and validation splits.
train_data_file, val_data_file = 'train-data-prepared.json', 'val-data-prepared.json'

# Read each split into its own DataFrame.
df_train = pd.read_json(train_data_file)
df_val = pd.read_json(val_data_file)

### Function: Pre-process the text (lowercasing, leading/trailing whitespace stripping, URL removal, punctuation removal)

In [3]:

def PreprocessData(df):
    """Add a normalised ``clean_text`` column derived from ``df['text']``.

    Steps, in order:
      1. lowercase the text,
      2. strip leading/trailing whitespace,
      3. delete URLs (``http`` followed by any run of non-whitespace),
      4. delete every character that is neither a word character nor whitespace.

    Both substitutions pass ``regex=True`` explicitly. ``Series.str.replace``
    treats the pattern as a literal string by default in pandas >= 2.0, which
    would silently leave URLs and punctuation untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``text``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``clean_text`` added
        (or overwritten).
    """
    cleaned = df['text'].str.lower().str.strip()
    # URLs are removed before punctuation so that "://", "." and "?" inside a
    # link do not get stripped first and leave the rest of the link behind.
    cleaned = cleaned.str.replace(r"http\S+", '', regex=True)
    cleaned = cleaned.str.replace(r"[^\w\s]", '', regex=True)
    df['clean_text'] = cleaned
    return df


### Function: Linguistic Feature Extraction

In [4]:
def FeatureExtract(df):
    """Append eight binary linguistic-presence columns to ``df``.

    Every entry of ``df['clean_text']`` is parsed with the module-level spaCy
    pipeline ``nlp``. For each output column the value is 1 when at least one
    token of the parsed text satisfies the column's test, otherwise 0:

      modal_exist  tag_ == 'MD'                      (modal verb)
      verb_exist   pos_ == 'VERB'                    (verb)
      prp_exist    tag_ == 'PRP'                     (personal pronoun)
      vbp_exist    tag_ == 'VBP'                     (non-3rd-person verb)
      adv_exist    pos_ == 'ADV'                     (adverb)
      conj_exist   dep_ == 'prep'                    (conjunction/preposition)
      adj_exist    pos_ == 'ADJ'                     (adjective)
      disc_exist   tag_ == 'IN' and pos_ != 'ADP'    (discourse marker)

    The frame is modified in place and also returned.
    """
    # (output column, per-token predicate); the order fixes the column order.
    token_tests = (
        ('modal_exist', lambda tok: tok.tag_ == 'MD'),
        ('verb_exist', lambda tok: tok.pos_ == 'VERB'),
        ('prp_exist', lambda tok: tok.tag_ == 'PRP'),
        ('vbp_exist', lambda tok: tok.tag_ == 'VBP'),
        ('adv_exist', lambda tok: tok.pos_ == 'ADV'),
        ('conj_exist', lambda tok: tok.dep_ == 'prep'),
        ('adj_exist', lambda tok: tok.pos_ == 'ADJ'),
        ('disc_exist', lambda tok: tok.tag_ == 'IN' and tok.pos_ != 'ADP'),
    )

    flags_by_column = {column: [] for column, _ in token_tests}

    for text in df['clean_text']:
        # Parse once, then evaluate every predicate against the same tokens.
        tokens = list(nlp(text))
        for column, is_match in token_tests:
            present = any(is_match(tok) for tok in tokens)
            flags_by_column[column].append(1 if present else 0)

    for column, _ in token_tests:
        df[column] = flags_by_column[column]
    return df


In [5]:
# Inspect the raw training frame (id, text, label) before any preprocessing.
display(df_train)



Unnamed: 0,id,text,label
0,1363ac6f16e232a1d8e9343d975ebe10,Since communism has been relegated to just a h...,0
1,6b925ce5eeb8b690b35972abafcb7c60,Can you counter that?,0
2,99977c6e63734add1c1b600be79b3342,Censorship does not eliminate the censored ind...,0
3,7e34d9e198bc9e12f0868793c68d32f0,"Without the extra population from abortions, h...",0
4,629d09668c3339dba831f6c81a307b0e,I can't stand it,1
...,...,...,...
2614,452684ca088e4ab1c58d89e1a28e1ef7,"it's ""true"" or not and that ""truth"" is availab...",0
2615,b5fa34bde09f97ab565ec1ae433d1797,And these slogans don't even denote any sense ...,0
2616,f676671baa396678dcb3471ea67e70ed,&gt;whole-bodyWhile,0
2617,a4e0fa2814bb40ae76750ea1597084de,that the majority of them are affected negativ...,1


### Preprocess and feature extraction: Train and Val set

In [6]:
# Run both stages on each split: text cleaning first, then the linguistic
# flags, which are computed from the resulting `clean_text` column.
df_train = FeatureExtract(PreprocessData(df_train))
df_val = FeatureExtract(PreprocessData(df_val))

### Feature Extraction using Bag of Words (unigrams and bigrams): Train and Val set

In [7]:
# Fit the unigram + bigram vocabulary on the training text only; the same
# fitted `vectorizer` is reused to encode the validation text.
vectorizer = CountVectorizer(ngram_range=(1,2))
feature_matrix_bow = vectorizer.fit_transform(df_train['clean_text'])

# Dense bag-of-words counts, one row per training document (fresh 0..n-1 index).
df_features_bow = pd.DataFrame(feature_matrix_bow.toarray())

# pd.concat(axis=1) aligns rows by index label, not by position. Resetting the
# index here guarantees the linguistic flags line up with the BoW rows even if
# df_train does not carry a default 0..n-1 index; otherwise the join would
# silently pad with NaN and add rows.
df_features_text = df_train[[
    "modal_exist", "verb_exist", "prp_exist", "vbp_exist",
    "adv_exist", "conj_exist", "adj_exist", "disc_exist",
]].reset_index(drop=True)
df_features = pd.concat([df_features_bow, df_features_text], axis=1)

x_train = df_features.values
y_train = df_train['label'].values

# Cheap invariant: one feature row per label.
assert x_train.shape[0] == y_train.shape[0], "train features and labels differ in row count"

In [8]:
# Encode the validation text with the vectorizer fitted on the training set
# (transform only, so the vocabulary and column order match x_train).
feature_matrix_bow_val = vectorizer.transform(df_val['clean_text'])

# Dense bag-of-words counts, one row per validation document (fresh 0..n-1 index).
df_features_bow_val = pd.DataFrame(feature_matrix_bow_val.toarray())

# pd.concat(axis=1) aligns rows by index label, not by position. Resetting the
# index here guarantees the linguistic flags line up with the BoW rows even if
# df_val does not carry a default 0..n-1 index; otherwise the join would
# silently pad with NaN and add rows.
df_features_text_val = df_val[[
    "modal_exist", "verb_exist", "prp_exist", "vbp_exist",
    "adv_exist", "conj_exist", "adj_exist", "disc_exist",
]].reset_index(drop=True)
df_features_val = pd.concat([df_features_bow_val, df_features_text_val], axis=1)

x_val = df_features_val.values
y_val = df_val['label'].values

# Cheap invariant: one feature row per label.
assert x_val.shape[0] == y_val.shape[0], "validation features and labels differ in row count"

### Actual training with the best hyperparameter combination

In [9]:
# Train an SVC (C=10.0, gamma=0.01) on the combined bag-of-words + linguistic
# training features.
# NOTE(review): the saved output of this cell is a LogisticRegression repr, so
# it predates the switch to SVC below — re-run the cell to refresh it.
classifier = SVC(C=10.0, gamma=0.01)
# Alternative model, kept for reference:
# classifier = LogisticRegression(C = 20.0, penalty = 'l2',max_iter=1000)
classifier.fit(x_train,y_train)

LogisticRegression(C=20.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Predict labels for the validation features and print the F1 score against
# the validation labels.
predictions = classifier.predict(x_val)
print(metrics.f1_score(y_val,predictions))

0.49781659388646293


In [11]:
# Map every validation id to its predicted label and persist the mapping as JSON.
val_data_id = df_val['id'].values
pred_val = {sample_id: label for sample_id, label in zip(val_data_id, predictions)}

# default=str is the fallback for values the json module cannot encode itself.
# NOTE(review): if `predictions` holds numpy integers, they will be written as
# strings ("0"/"1") rather than JSON numbers — confirm the consumer expects that.
with open('pred_out.json', 'w') as fp:
    json.dump(pred_val, fp, default=str)