# Homework 4 - Task 1

#### Peter Grantcharov (pdg2116), Po-Chieh Liu (pl2441)

In [1]:
# import package
import numpy as np
import pandas as pd

#sklearn
from sklearn.preprocessing import LabelEncoder

In [2]:
# import training data
df_train = pd.read_csv('reddit_200k_train.csv', 
                       encoding = 'latin-1',
                       usecols = ['body', 'REMOVED'])
# import testing data
df_test = pd.read_csv('reddit_200k_test.csv', 
                      encoding = 'latin-1', 
                      usecols = ['body', 'REMOVED'])

In [3]:
# convert data into numpy vector
y_train = df_train['REMOVED']
y_test = df_test['REMOVED']

# convert boolean to 0, 1 label
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [4]:
# extract body
X = df_train['body'].values
X_t = df_test['body'].values

In [5]:
# spacy
import spacy
nlp = spacy.load('en_core_web_lg')

# we dont need ner step in pipe
_ = nlp.remove_pipe('ner')

In [6]:
def clean_component(X):
    # convert everything into spacy nlp
    doc = nlp(X[0])
    flag = None
    count = 1
    for tok in doc:
        # remove all stopwords, punctuation and symbol
        if (tok.is_stop is False and tok.pos_ != "PUNCT" and tok.pos_ != "SYM"):
            if flag is None:
                flag = True
                vec = tok.vector
            else:
                # stack all vectors together, and return mean values in the end
                vec = np.vstack((vec, tok.vector))
    if flag is None:
        # if all tokens are in stopwords, just use the raw vector
        return doc.vector
    return np.mean(vec, axis = 0)

In [7]:
# convert the text to token, then convert token to vector, then average the vectors
X_train = np.apply_along_axis(clean_component, axis = 1, arr = X.reshape(-1,1))
X_test = np.apply_along_axis(clean_component, axis = 1, arr = X_t.reshape(-1,1))


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline

In [15]:
# pipeline
lr_pipe = make_pipeline(LogisticRegression(class_weight= 'balanced',
                                           solver = 'lbfgs'))

# tune parameter 
param_grid = {"logisticregression__C": np.logspace(-2 , 2, 10)}

# gridsearch
gs = GridSearchCV(lr_pipe, 
                  param_grid, 
                  cv = 3,
                  return_train_score=True,
                  scoring='f1',
                  verbose=2,
                  n_jobs=5)
#fit
_ = gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:  1.6min finished


In [16]:
print(gs.best_params_)
print(gs.best_score_)

{'logisticregression__C': 0.21544346900318834}
0.6013630764011264


In [17]:
gs_train_f1 = f1_score(y_train.reshape(-1,1),
                       gs.predict(X_train),
                       average='weighted')
gs_test_f1 = f1_score(y_test.reshape(-1,1),
                      gs.predict(X_test),
                      average='weighted')
print(gs_train_f1)
print(gs_test_f1)

0.668330743861623
0.667726646203207


## Summary
As we can see, our score in this task was quite good, although still slightly lower than the score that was achieved with the bag of words approach found in task 1. 
When considering the model composed in task 1.3, this word embedding was quite a bit lower, although incorporating the word embedding in conjunction with those derived features will likely be more successful.