In [4]:
import pandas as pd
import numpy as np
import re
import spacy
import nltk
import ssl
from sklearn import svm
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import normalize

nlp = spacy.load('en_core_web_sm')
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('wordnet')
nltk.download('stopwords')
lst_stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package wordnet to /Users/rahul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', text.lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    text = " ".join(lst_text)
    return text

In [6]:
def fetch_all_neg_words():
    list_neg_words = []
    with open("neg_key_words.txt","r") as file:
        for line in file:
            list_neg_words.append(line.strip())
    return list_neg_words

In [7]:
def get_number_of_neg_words(doc):
    list_neg_words = fetch_all_neg_words()
    sum = 0
    for token in doc:
        if str(token) in list_neg_words:
            sum += 1
    return sum

In [8]:
#Used to read training data
def read_data(filename):
    train_df = pd.read_json(filename)
    y_train = train_df['label']
    body_train = []
    contro_list = []
    ups_list = []
    viol_list = []
    main_list = []
    for i in range(0, len(train_df)):
        id = train_df['id'][i]
        size_of_posts_per_thread = len(train_df['preceding_posts'][i])
        post_text = []
        sum_contro = 0
        sum_ups = 0
        sum_viol = 0
        for j in range(0, size_of_posts_per_thread): 
            text_body = train_df['preceding_posts'][i][j]['body']
            sum_contro += train_df['preceding_posts'][i][j]['controversiality']
            sum_ups += train_df['preceding_posts'][i][j]['ups']
            sum_viol += train_df['preceding_posts'][i][j]['violated_rule']
            post_text.append(text_body)
        body_text = ' '.join(post_text)
        preprocessed_body_text = utils_preprocess_text(body_text,lst_stopwords=lst_stopwords)
        doc = nlp(preprocessed_body_text)
        negative_keywords_count = get_number_of_neg_words(doc)
        main_list.append({
            "id":id,
            "body":preprocessed_body_text, 
            "controversiality":sum_contro, 
            "ups":sum_ups, 
            "violated_rule":sum_viol,
            "negative_keywords_count":negative_keywords_count,
            "vector":doc.vector
        })
    X_df = pd.DataFrame(main_list)
    return X_df, y_train

In [9]:
x_train, y_train = read_data("train-data-prepared.json")
x_val, y_val = read_data("val-data-prepared.json")

In [10]:
x_train

Unnamed: 0,id,body,controversiality,ups,violated_rule,negative_keywords_count,vector
0,t1_dggp3q9,right woman nonsexual creature would never use...,0,1,0,2,"[0.6291083, 0.0771986, 0.19754855, -0.05052166..."
1,t1_dk3zd9h,making prostitution legal make much difficult ...,0,11,0,1,"[0.45894313, -0.06847714, 0.2195477, -0.068728..."
2,t1_d86bsqs,httpsenwikipediaorgwikifederalist_no_68 specif...,0,3,0,12,"[0.7297775, 0.21284659, 0.10354026, -0.1035865..."
3,t1_cpzy2ya,real property logical absolute necessity regis...,0,3,0,31,"[0.49827892, 0.1120066, 0.12605643, -0.0839523..."
4,t1_d92nfmh,really defending multinationals greedy try nes...,0,3,0,8,"[0.34527984, -0.051554915, 0.11279477, -0.0491..."
...,...,...,...,...,...,...,...
1931,t1_cpet2nu,totally right completely flubbed sure totally ...,0,2,0,51,"[0.38978824, 0.03186239, 0.12352137, -0.056145..."
1932,t1_ck91k4x,think argument fine principle number provide f...,0,3,5,42,"[0.41911715, 0.121182114, 0.15129055, -0.08296..."
1933,t1_ch7503g,im stuck debate month cant think right moral p...,0,2,0,29,"[0.26443037, -0.025680082, 0.18942213, -0.0466..."
1934,t1_denmvjy,even saying wasnt 100 false still doesnt make ...,0,8,0,6,"[0.3212733, 0.100675665, 0.14381512, -0.024241..."


In [11]:
def generate_X(X_df):
    X = []
    for i in range(0, len(X_df)):
        controversiality = X_df["controversiality"][i]
        ups = X_df["ups"][i].item()
        violated_rule = X_df["violated_rule"][i].item()
        negative_keywords_count = X_df["negative_keywords_count"][i].item()
        vector = X_df["vector"][i].tolist()
        vector.append(controversiality)
        vector.append(ups)
        vector.append(violated_rule)
        vector.append(negative_keywords_count)
        X.append(vector)
    return np.array(X)

X_train = generate_X(x_train)
X_test = generate_X(x_val)

y = y_train.to_numpy().T
y_test = y_val.to_numpy().T

In [12]:
estimator = svm.SVC(kernel='rbf', C=100000)
y_pred = estimator.fit(X_train,y).predict(X_test)
f1_score(y_pred=y_pred, y_true=y_test)

0.6204081632653061

In [13]:
estimator = svm.SVC(kernel='linear')
y_pred = estimator.fit(X_train,y).predict(X_test)
f1_score(y_pred=y_pred, y_true=y_test)

0.5669291338582677