In [51]:
import spacy
import pandas as pd
import numpy as np
import en_core_web_sm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn.metrics import f1_score, precision_score, recall_score, make_scorer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load the 'en_core_web_sm' spaCy pipeline once at module level; the feature
# helpers defined below call this shared `nlp` object on each text and read
# the resulting tokens' `pos_` / `tag_` attributes.
nlp = spacy.load('en_core_web_sm')




In [52]:
def get_pos_vector(text):
    """Count coarse part-of-speech labels in ``text`` as a fixed-length vector.

    The text is run through the module-level spaCy pipeline ``nlp`` and each
    token's ``pos_`` label is tallied.

    Args:
        text: String to analyse.

    Returns:
        list[int]: 18 counts, one per label, always in this order:
        ADJ, SPACE, ADV, INTJ, SYM, VERB, SCONJ, PART, X, PUNCT, AUX, ADP,
        NUM, PRON, NOUN, DET, CCONJ, PROPN. Tokens whose ``pos_`` is not one
        of these labels are not counted.
    """
    # Order matters: it fixes the column layout of every feature vector.
    pos_labels = (
        'ADJ', 'SPACE', 'ADV', 'INTJ', 'SYM', 'VERB', 'SCONJ', 'PART', 'X',
        'PUNCT', 'AUX', 'ADP', 'NUM', 'PRON', 'NOUN', 'DET', 'CCONJ', 'PROPN',
    )
    counts = dict.fromkeys(pos_labels, 0)
    for token in nlp(text):
        pos = token.pos_
        # Skip labels outside the fixed vocabulary. Inserting them as new keys
        # would lengthen the vector for this text only, so rows built from
        # different texts would no longer line up column-for-column (and
        # stacking them into one matrix would fail).
        if pos in counts:
            counts[pos] += 1
    return [counts[label] for label in pos_labels]


def get_tag_vector(text):
    """Count fine-grained tags in ``text`` as a fixed-length vector.

    The text is run through the module-level spaCy pipeline ``nlp`` and each
    token's ``tag_`` label is tallied.

    Args:
        text: String to analyse.

    Returns:
        list[int]: 50 counts, one per entry of ``tag_labels`` below and in
        that same order. Tokens whose ``tag_`` is not one of these labels are
        not counted.
    """
    # Order matters: it fixes the column layout of every feature vector.
    tag_labels = (
        'VBP', 'RBS', 'VBZ', 'WRB', 'VB', 'NNS', 'WDT', 'UH', '-RRB-', 'AFX',
        'CC', 'WP', 'VBN', 'IN', 'PRP$', 'XX', 'WP$', 'RBR', 'PDT', 'HYPH',
        'POS', '$', 'NNPS', 'MD', '.', 'VBD', 'JJR', 'NFP', ',', 'JJS',
        'DT', '_SP', 'VBG', 'FW', 'RP', 'SYM', 'LS', 'CD', 'RB', 'EX',
        '``', 'PRP', "''", ':', 'TO', 'JJ', 'ADD', '-LRB-', 'NN', 'NNP',
    )
    counts = dict.fromkeys(tag_labels, 0)
    for token in nlp(text):
        tag = token.tag_
        # Skip labels outside the fixed vocabulary. Inserting them as new keys
        # would lengthen the vector for this text only, so rows built from
        # different texts would no longer line up column-for-column (and
        # stacking them into one matrix would fail).
        if tag in counts:
            counts[tag] += 1
    return [counts[label] for label in tag_labels]

def get_pos_tag_vector(text):
    """Build the combined feature vector for ``text``.

    Args:
        text: String to analyse.

    Returns:
        list: the result of ``get_pos_vector(text)`` followed by the result
        of ``get_tag_vector(text)``, as one flat list.
    """
    # Each helper is handed the raw text, so each one runs the pipeline itself.
    return get_pos_vector(text) + get_tag_vector(text)

def get_set(text_list):
    """Collect the distinct fine-grained tags seen across ``text_list``.

    Every text is run through the module-level spaCy pipeline ``nlp`` and the
    ``tag_`` of each token is gathered. The resulting set is printed and then
    returned.

    Args:
        text_list: Iterable of strings to analyse.

    Returns:
        set: every ``tag_`` value that occurred at least once.
    """
    unique_tags = {token.tag_ for text in text_list for token in nlp(text)}
    print(unique_tags)
    return unique_tags

In [53]:
# Random forest
print("Random Forest")


def _load_split(json_path):
    """Read one prepared JSON split and derive its POS/tag features.

    Returns a 4-tuple: the loaded frame (with a ``feature_vector`` column
    added), the list of raw texts, the per-text count vectors, and the
    stacked feature matrix after L1 normalisation of each row.
    """
    frame = pd.read_json(json_path)
    texts = frame['text'].to_list()
    vectors = [get_pos_tag_vector(text) for text in texts]
    frame['feature_vector'] = vectors
    # One row per text; L1 row normalisation rescales each row of counts so
    # its absolute values sum to 1.
    matrix = normalize(np.stack(vectors, axis=0), axis=1, norm='l1')
    return frame, texts, vectors, matrix


# Training data
df_train, text_list, vectors_list, X_train = _load_split('train-data-prepared.json')
y_train = df_train['label']

# Validation data (kept under the *_test names used by the rest of the cell)
df_test, text_list, vectors_list_test, X_test = _load_split('val-data-prepared.json')
y_test = df_test['label']
print("y_test {0}".format(y_test))

# Standardise each feature with statistics fitted on the training split only,
# then apply the same transform to the validation split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# NOTE(review): the printed labels look like 0/1 classes, yet a regressor is
# fitted and its continuous output is thresholded below. A classifier such as
# RandomForestClassifier may be the more conventional choice; confirm before
# changing, since it would alter the reported metrics.
regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Scores strictly above 0.50 become class 1; everything else becomes class 0.
y1_pred = [1 if score > 0.50 else 0 for score in y_pred]

print("y_pred {0}".format(y1_pred))

print(confusion_matrix(y_test, y1_pred))
print(classification_report(y_test, y1_pred))
print(accuracy_score(y_test, y1_pred))

Random Forest
y_test 0      1
1      1
2      0
3      0
4      0
      ..
344    1
345    0
346    0
347    0
348    0
Name: label, Length: 349, dtype: int64
y_pred [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,