### XGBoost

In [21]:
import numpy as np
import pandas as pd

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from sklearn.metrics import classification_report

In [22]:
data = pd.read_csv('../../../data/Task2.csv', index_col='sentence_id')
data = data.rename(columns={'id':'Sentence #'})
data = data.dropna(axis=0)

In [23]:
sentences = []
labels = list(data['type']) 
typs = data['type'].values

x = data.apply(lambda l: sentences.append(l['sentence'].split(' ')),axis=1)
words = []
for sent in sentences:
    for wrd in sent:
        words.append(wrd)
words = list(set(words))
tags = list(set(labels))

n_words = len(words)
n_tags = len(tags)
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 30
X = [[word2idx[w] for w in s] for s in sentences]    
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=0)
y = [tag2idx[tg] for tg in labels]
# y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
y = np.array([to_categorical(i, num_classes=n_tags) for i in y])
y = [[y[i],typs[i]] for i in range(len(y))]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = np.array([lab[0] for lab in y_train])
typ_test = [lab[1] for lab in y_test]
y_test = np.array([lab[0] for lab  in y_test])

In [24]:
clf = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27
)

In [25]:
clf.fit(X_train,y_train)
pred = clf.predict(X_test)

In [26]:
inc_test = []
exc_test = []
inc_pred = []
exc_pred = []
for i,t in enumerate(typ_test):
    if t == 'I':
        inc_test.append(y_test[i])
        inc_pred.append(pred[i])
    else:
        exc_test.append(y_test[i])
        exc_pred.append(pred[i])
print("For inclusions")
report = classification_report(inc_test,inc_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
display(df)
print("For exclusions")
report = classification_report(exc_test,exc_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
display(df)

For inclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.628205,0.771654,234.0
1,0.0,0.0,0.0,0.0
micro avg,0.615063,0.628205,0.621564,234.0
macro avg,0.5,0.314103,0.385827,234.0
weighted avg,1.0,0.628205,0.771654,234.0
samples avg,0.600427,0.628205,0.609687,234.0


For exclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.577093,0.731844,227.0
micro avg,0.59276,0.577093,0.584821,227.0
macro avg,0.5,0.288546,0.365922,227.0
weighted avg,1.0,0.577093,0.731844,227.0
samples avg,0.570485,0.577093,0.572687,227.0


### SVM

In [27]:
def loadGloveModel(File):
    with open(File, 'r', encoding='utf-8') as f:
        gloveModel = {}
        for line in f:
            splitLines = line.split()
            word = splitLines[0]
            wordEmbedding = np.array([float(val) for val in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel), "words loaded!")
    return gloveModel

vec_model = loadGloveModel('glove/glove.6B.300d.txt')

400000 words loaded!


In [28]:
tag2idx = {t: i for i, t in enumerate(tags)}
max_len = 30
emb_dim = len(vec_model['the'])
X = []
y = []

for sent in sentences:
    vec = np.zeros((30,emb_dim))
    for i, word in enumerate(sent):
        if word in vec_model:
            vec[i,:] = vec_model[word]
    X.append(vec)
X = np.array(X)
y = np.array([tag2idx[tg] for tg in labels])
y = [[y[i],typs[i]] for i in range(len(y))]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = np.array([lab[0] for lab in y_train])
typ_test = [lab[1] for lab in y_test]
y_test = np.array([lab[0] for lab  in y_test])
X_train = X_train.reshape((X_train.shape[0],max_len*emb_dim))
X_test = X_test.reshape((X_test.shape[0],max_len*emb_dim))

In [30]:
from sklearn.svm import SVC

In [31]:
def full_results(y_test,pred):
    
    report = classification_report(y_test,pred.flatten('F'),output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    
    inc_test = []
    exc_test = []
    inc_pred = []
    exc_pred = []
    for i,t in enumerate(typ_test):
        if t == 'I':
            inc_test.append(y_test[i])
            inc_pred.append(pred[i])
        else:
            exc_test.append(y_test[i])
            exc_pred.append(pred[i])
    print("For inclusions")
    report = classification_report(inc_test,inc_pred,output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    print("For exclusions")
    report = classification_report(exc_test,exc_pred,output_dict=True)
    df = pd.DataFrame(report).transpose()
    display(df)
    return

In [35]:
clf = SVC(kernel='poly')
clf.fit(X_train,y_train)
pred = clf.predict(X_test)
full_results(y_test,pred)

Unnamed: 0,precision,recall,f1-score,support
0,0.64,0.827586,0.721805,232.0
1,0.751553,0.528384,0.620513,229.0
accuracy,0.678959,0.678959,0.678959,0.678959
macro avg,0.695776,0.677985,0.671159,461.0
weighted avg,0.695413,0.678959,0.671488,461.0


For inclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,1.0,0.827586,0.90566,232.0
1,0.0,0.0,0.0,0.0
accuracy,0.827586,0.827586,0.827586,0.827586
macro avg,0.5,0.413793,0.45283,232.0
weighted avg,1.0,0.827586,0.90566,232.0


For exclusions


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.528384,0.691429,229.0
accuracy,0.528384,0.528384,0.528384,0.528384
macro avg,0.5,0.264192,0.345714,229.0
weighted avg,1.0,0.528384,0.691429,229.0
