# Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
import nltk
import ssl
from sklearn import svm
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer

# Load the small English spaCy pipeline once; get_pos_vector and read_data both use this `nlp` object.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): this replaces the default HTTPS context factory with the *unverified* one, so TLS
# certificate verification is disabled for code relying on that default. Presumably a workaround so
# the nltk.download calls below succeed in this environment — confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('wordnet')
nltk.download('stopwords')
# English stop-word list from NLTK (not referenced by any other code visible in this notebook).
lst_stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Negative-keyword count and part-of-speech count vector

In [2]:
def fetch_all_neg_words(filename="neg_key_words.txt"):
    """Load the negative-keyword list from a text file (one keyword per line).

    Args:
        filename: Path of the keyword file. Defaults to ``"neg_key_words.txt"``
            (resolved against the current working directory), which is the
            file the notebook has always read.

    Returns:
        list[str]: Keywords in file order, each stripped of surrounding
        whitespace. Blank lines are skipped so that an empty string never
        ends up being treated as a keyword.
    """
    with open(filename, "r") as file:
        stripped_lines = (line.strip() for line in file)
        return [word for word in stripped_lines if word]

In [3]:
def get_number_of_neg_words(doc, neg_words=None):
    """Count how many tokens of ``doc`` are negative keywords.

    Args:
        doc: Iterable of tokens (e.g. a spaCy ``Doc``). Each token is compared
            through its ``str()`` form, so matching is exact and case-sensitive.
        neg_words: Optional iterable of negative keywords. When omitted, the
            list is loaded with ``fetch_all_neg_words()`` (one file read per
            call); pass a pre-loaded list to avoid re-reading the file for
            every document.

    Returns:
        int: Number of tokens (counting repeats) found in the keyword list.
    """
    if neg_words is None:
        neg_words = fetch_all_neg_words()
    # A set gives O(1) membership tests instead of scanning the list per token.
    neg_word_set = set(neg_words)
    return sum(1 for token in doc if str(token) in neg_word_set)

In [4]:
def get_pos_vector(text):
    """Get the Parts-of-Speech feature vector for ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    coarse-grained tag (``token.pos_``) of every token is counted.

    Args:
        text: Raw text to analyse.

    Returns:
        list[int]: One count per tag, always 18 entries, in this fixed order:
        ADJ, SPACE, ADV, INTJ, SYM, VERB, SCONJ, PART, X, PUNCT, AUX, ADP,
        NUM, PRON, NOUN, DET, CCONJ, PROPN.
    """
    pos_tags = ('ADJ', 'SPACE', 'ADV', 'INTJ', 'SYM', 'VERB', 'SCONJ', 'PART', 'X',
                'PUNCT', 'AUX', 'ADP', 'NUM', 'PRON', 'NOUN', 'DET', 'CCONJ',
                'PROPN')
    pos_counts = dict.fromkeys(pos_tags, 0)
    for token in nlp(text):
        pos = token.pos_
        # Tags outside the fixed list are ignored on purpose: adding a new key
        # would make this sample's vector longer than the others and produce a
        # ragged feature matrix downstream.
        if pos in pos_counts:
            pos_counts[pos] += 1
    return [pos_counts[tag] for tag in pos_tags]

# Split data into features and label

In [5]:
# Reads one prepared JSON split; used for both the training and the validation file
def read_data(filename):
    """Read a prepared JSON split and build one feature row per thread.

    The file is loaded with ``pd.read_json`` and must provide the columns
    ``id``, ``label`` and ``preceding_posts``. Every entry of
    ``preceding_posts`` is a sequence of posts, each with the keys ``body``,
    ``controversiality``, ``ups`` and ``violated_rule``.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        tuple: ``(X_df, y_train)`` where ``X_df`` is a DataFrame with the
        columns ``id``, ``body`` (all post bodies joined by a space),
        ``controversiality`` / ``ups`` / ``violated_rule`` (sums over the
        thread's posts), ``negative_keywords_count``, ``pos_vector`` and
        ``vector`` (``doc.vector`` of the joined body), and ``y_train`` is the
        ``label`` column of the loaded frame.
    """
    train_df = pd.read_json(filename)
    y_train = train_df['label']
    main_list = []
    # Iterate the two columns positionally instead of via train_df[col][i]:
    # that avoids repeated chained label lookups and does not depend on the
    # frame having a 0..n-1 index.
    for thread_id, posts in zip(train_df['id'], train_df['preceding_posts']):
        body_text = ' '.join(post['body'] for post in posts)
        doc = nlp(body_text)
        main_list.append({
            "id": thread_id,
            "body": body_text,
            "controversiality": sum(post['controversiality'] for post in posts),
            "ups": sum(post['ups'] for post in posts),
            "violated_rule": sum(post['violated_rule'] for post in posts),
            "negative_keywords_count": get_number_of_neg_words(doc),
            "pos_vector": get_pos_vector(body_text),
            "vector": doc.vector
        })
    X_df = pd.DataFrame(main_list)
    return X_df, y_train

# Read Files

In [6]:
# Build the per-thread feature frames and label series for both splits.
# Each call parses every thread with spaCy inside read_data.
x_train, y_train = read_data("train-data-prepared.json")
x_val, y_val = read_data("val-data-prepared.json")

# Build NumPy feature matrices from the feature DataFrames

In [9]:
def generate_X(X_df):
    """Flatten the per-thread feature frame into a 2-D NumPy feature matrix.

    Each output row is laid out as::

        [*vector, controversiality, ups, violated_rule,
         negative_keywords_count, *pos_vector]

    Args:
        X_df: DataFrame with the columns ``vector`` (1-D array-like),
            ``pos_vector`` (sequence of counts), ``controversiality``,
            ``ups``, ``violated_rule`` and ``negative_keywords_count``.

    Returns:
        numpy.ndarray: One row per row of ``X_df``, in the frame's row order.
    """
    scalar_columns = ["controversiality", "ups", "violated_rule", "negative_keywords_count"]
    rows = []
    # zip() walks the columns positionally, so this works for any index
    # (X_df[col][i] is a label lookup and fails on e.g. a filtered frame) and
    # does not require the scalar cells to be NumPy scalars with .item().
    columns = zip(X_df["vector"], X_df["pos_vector"], *(X_df[name] for name in scalar_columns))
    for embedding, pos_vector, *scalars in columns:
        row = np.asarray(embedding).tolist()
        row.extend(scalars)
        row.extend(pos_vector)
        rows.append(row)
    return np.array(rows)

In [10]:
# Feature matrices: X_train from the training split, X_test from the *validation* split.
X_train = generate_X(x_train)
X_test = generate_X(x_val)

# Label arrays. The label objects are Series, so to_numpy() is 1-D and .T leaves it unchanged.
y = y_train.to_numpy().T
y_test = y_val.to_numpy().T

# Train model and predict

In [11]:
# RBF-kernel SVM fitted on the training matrix and evaluated on the validation matrix.
# NOTE(review): C=10000 is hard-coded; presumably picked via the commented-out grid search
# at the end of the notebook — confirm.
estimator = svm.SVC(kernel='rbf', C=10000)
y_pred = estimator.fit(X_train,y).predict(X_test)
# Last expression of the cell: the F1 score on the validation labels is shown as the cell output.
f1_score(y_pred=y_pred, y_true=y_test)

0.6789667896678968

# Writing predictions to file

In [12]:
def write_file(predictions, id_df, output_path=r'output.json'):
    """Write the predictions to a JSON file as an ``{id: label}`` mapping.

    Args:
        predictions: Array-like of predicted labels, one per id.
        id_df: Array-like (typically the ``id`` column) with the ids, in the
            same order as ``predictions``.
        output_path: Destination file. Defaults to ``output.json`` in the
            current working directory.

    Raises:
        ValueError: If the number of ids and predictions differ.
    """
    ids = np.asarray(id_df).ravel()
    labels = np.asarray(predictions).ravel()
    if len(ids) != len(labels):
        raise ValueError(
            "Got %d ids but %d predictions" % (len(ids), len(labels))
        )
    # Pair ids and labels by position. Concatenating the pandas objects with
    # axis=1 aligns on the index instead, which yields NaN-filled rows whenever
    # the ids do not carry a default 0..n-1 index.
    result = pd.Series(labels, index=pd.Index(ids, name='id'), name='label')
    result.to_json(output_path)

In [13]:
write_file(y_pred, x_val['id'])

# Cross-validation (disabled: grid search over the SVM's C parameter, kept for reference)

In [14]:
# def _cross_validate(estimator, X, y, X_test, y_test):
#     parameter_space_svm = {
#         'C':[10**i for i in range(0,8)]
#     }
#     from sklearn.model_selection import GridSearchCV
#     cv = KFold(n_splits=5)
#     clf = GridSearchCV(estimator, parameter_space_svm, scoring='f1_macro', cv=cv)
#     clf.fit(X,y)
        
#     print("Best parameters set found on development set:")
#     print(clf.best_params_)
#     y_pred = clf.predict(X_test)
#     return f1_score(y_pred=y_pred, y_true=y_test)

In [15]:
#estimator = svm.SVC(kernel='rbf')
#print(_cross_validate(estimator, X=X_train, y=y, X_test=X_test, y_test=y_test))