In [51]:
import pandas as pd
import re
import spacy
import nltk
import ssl
from sklearn.linear_model import Perceptron
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.preprocessing import normalize

nlp = spacy.load('en_core_web_sm')
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('wordnet')
nltk.download('stopwords')
lst_stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package wordnet to /Users/rahul/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [63]:
def fetch_all_neg_words():
    list_neg_words = []
    with open("neg_key_words.txt","r") as file:
        for line in file:
            list_neg_words.append(line.strip())
    return list_neg_words

In [64]:
#Used to read training data
def read_data(filename):
    train_df = pd.read_json(filename)
    y_train = train_df['label']
    body_train = []
    contro_list = []
    ups_list = []
    viol_list = []
    main_list = []
    for i in range(0, 2):#len(train_df)):
        id = train_df['id'][i]
        size_of_posts_per_thread = len(train_df['preceding_posts'][i])
        # print(size_of_posts_per_thread)
        post_text = []
        sum_contro = 0
        sum_ups = 0
        sum_viol = 0
        for j in range(0, size_of_posts_per_thread): 
            text_body = train_df['preceding_posts'][i][j]['body']
            sum_contro += train_df['preceding_posts'][i][j]['controversiality']
            sum_ups += train_df['preceding_posts'][i][j]['ups']
            sum_viol += train_df['preceding_posts'][i][j]['violated_rule']
            post_text.append(text_body)
        body_text = ' '.join(post_text)
        preprocessed_body_text = utils_preprocess_text(body_text,lst_stopwords=lst_stopwords)
        doc = nlp(preprocessed_body_text)
        main_list.append({
            "id":id,
            "body":preprocessed_body_text, 
            "controversiality":sum_contro, 
            "ups":sum_ups, 
            "violated_rule":sum_viol,
            "vector":doc.vector
        })
    X_df = pd.DataFrame(main_list)
    return X_df, y_train

In [53]:
x_train, y_train = read_data("train-data-prepared.json")
x_val, y_val = read_data("val-data-prepared.json")

In [54]:
x_train

Unnamed: 0,id,body,controversiality,ups,violated_rule,vector
0,t1_dggp3q9,right woman nonsexual creature would never use...,0,1,0,"[0.6291083, 0.0771986, 0.19754855, -0.05052166..."
1,t1_dk3zd9h,making prostitution legal make much difficult ...,0,11,0,"[0.45894313, -0.06847714, 0.2195477, -0.068728..."


In [14]:
y_train

0       1
1       0
2       1
3       0
4       1
       ..
1931    0
1932    1
1933    0
1934    1
1935    0
Name: label, Length: 1936, dtype: int64

In [3]:
val_df = read_val_data()
val_df

Unnamed: 0,id,preceding_posts,final_post,label
0,t1_dipwvtv,"[{'archived': False, 'author_name': 'mattman11...","{'archived': False, 'author_name': 'Blood_and_...",1
1,t1_dctegi4,"[{'archived': True, 'author_name': 'betweentwo...","{'archived': True, 'author_name': 'betweentwol...",0
2,t1_d4vri90,"[{'archived': True, 'author_name': 'Sheexthro'...","{'archived': True, 'author_name': 'amus', 'bod...",1
3,t1_d2v90lz,"[{'archived': True, 'author_name': 'cdb03b', '...","{'archived': True, 'author_name': 'cdb03b', 'b...",0
4,t1_dd1k4g6,"[{'archived': True, 'author_name': 'Leumashy',...","{'archived': True, 'author_name': 'Redwing4114...",1
...,...,...,...,...
253,t1_cgmqm3y,"[{'archived': True, 'author_name': 'telegraphi...","{'archived': True, 'author_name': 'telegraphis...",0
254,t1_cvoj5re,"[{'archived': True, 'author_name': 'gtfooh1011...","{'archived': True, 'author_name': 'Osricthebas...",1
255,t1_cpcigu7,"[{'archived': True, 'author_name': 'Ananasboat...","{'archived': True, 'author_name': 'Ananasboat'...",0
256,t1_cnu1fi5,"[{'archived': True, 'author_name': 'Lagkiller'...","{'archived': True, 'author_name': 'anonoman925...",1


In [4]:
preprocess_train_data(train_df)

              id  label
0     t1_dggp3q9      1
1     t1_dk3zd9h      0
2     t1_d86bsqs      1
3     t1_cpzy2ya      0
4     t1_d92nfmh      1
...          ...    ...
1931  t1_cpet2nu      0
1932  t1_ck91k4x      1
1933  t1_ch7503g      0
1934  t1_denmvjy      1
1935  t1_crtmi2e      0

[1936 rows x 2 columns]
1936
