In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import numpy as np

In [2]:
# Read the tab-separated training and dev feature tables, then show the dev frame.
train_df, dev_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ('./data/training_features_B.tsv', './data/dev_features_B.tsv')
)
print(dev_df)

       Unnamed: 0       story  sent_index  token_index   token-2     token-1  \
0               0  wisteria01           0            0         X           X   
1               1  wisteria01           0            1         X          1.   
2               2  wisteria01           0            2        1.         The   
3               3  wisteria01           0            3       The    Singular   
4               4  wisteria01           0            4  Singular  Experience   
...           ...         ...         ...          ...       ...         ...   
13562       13562  wisteria02         439            9       was        very   
13563       13563  wisteria02         439           10      very    orthodox   
13564       13564  wisteria02         439           11  orthodox          in   
13565       13565  wisteria02         439           12        in         his   
13566       13566  wisteria02         439           13       his      ritual   

            token     token+1     token

In [3]:
# Drop the story / sentence-index / token-index bookkeeping columns from both splits.
# `columns=` already targets the column axis, so no separate axis argument is needed.
train_df, dev_df = (
    frame.drop(columns=["story", "sent_index", "token_index"])
    for frame in (train_df, dev_df)
)

In [4]:
# Fill every missing cell in both splits with the string 'X', then show the training frame.
train_df, dev_df = (frame.fillna('X') for frame in (train_df, dev_df))
print(train_df)

       Unnamed: 0  token-2   token-1     token   token+1   token+2  pos  \
0               0        X         X   Chapter        1.       Mr.   NN   
1               1        X   Chapter        1.       Mr.  Sherlock   CD   
2               2  Chapter        1.       Mr.  Sherlock    Holmes  NNP   
3               3       1.       Mr.  Sherlock    Holmes       Mr.  NNP   
4               4      Mr.  Sherlock    Holmes       Mr.  Sherlock  NNP   
...           ...      ...       ...       ...       ...       ...  ...   
65446       65446      the    russet    slopes        of       the  NNS   
65447       65447   russet    slopes        of       the      moor   IN   
65448       65448   slopes        of       the      moor         .   DT   
65449       65449       of       the      moor         .         X   NN   
65450       65450      the      moor         .         X         X    .   

          chunk     lemma  matchesNeg  hasPrefix  hasSuffix  hasPrefixAntonym  \
0            NP   

In [5]:
# Turn the selected feature columns of each split into one dict per row
# (the record format later passed to the DictVectorizer). The column list is
# written once so the training and dev instances always carry the same keys.
train_instances, dev_instances = (
    frame[[
        "token", "token-2", "token-1", "token+1", "token+2",
        "pos", "chunk", "lemma",
        "matchesNeg", "hasPrefix", "hasSuffix",
        "hasPrefixAntonym", "hasSuffixAntonym", "matchesMulticue",
    ]].to_dict('records')
    for frame in (train_df, dev_df)
)

In [6]:
# Vectorizer that maps each feature dict to a numeric row. It is fitted on the
# training instances and reused for the dev instances so both share one feature space.
vec = DictVectorizer()

In [7]:
# Learn the feature vocabulary from the training instances and encode them in one step.
X_train = vec.fit_transform(train_instances)

In [8]:
# Inspection only: the dense copy is displayed and discarded; training uses X_train itself.
# NOTE(review): toarray() allocates the full rows x features matrix, which is presumably
# large for this vocabulary — confirm it fits in memory, or inspect X_train.shape instead.
X_train.toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Gold labels (the "bio" column) of each split as plain Python lists.
Y_train = train_df["bio"].tolist()
Y_dev = dev_df["bio"].tolist()

In [10]:
# Base linear SVM handed to the grid search, with the iteration cap set to 10000.
# NOTE(review): no random_state is passed, so repeated runs may differ slightly —
# confirm whether a fixed seed is wanted for reproducibility.
classifier = LinearSVC(max_iter=10000)

In [11]:
# Hyper-parameter grid explored by the search: regularisation strength,
# loss function and stopping tolerance of the linear SVM.
parameters = {
    'C': (0.01, 0.1, 1.0),
    'loss': ('hinge', 'squared_hinge'),
    'tol': (0.0001, 0.001, 0.01, 0.1),
}

In [12]:
### The GridSearchCV setup was inspired by the sklearn documentation and a lecture by Ilia Markov.
# 5-fold cross-validated search over `parameters`, ranking candidates by macro-averaged F1.
grid = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='f1_macro',
    cv=5,
)

In [13]:
# Run the cross-validated search over every combination in the parameter grid
# on the vectorised training data and its gold labels.
grid.fit(X_train, Y_train)

GridSearchCV(cv=5, estimator=LinearSVC(max_iter=10000),
             param_grid={'C': (0.01, 0.1, 1.0),
                         'loss': ('hinge', 'squared_hinge'),
                         'tol': (0.0001, 0.001, 0.01, 0.1)},
             scoring='f1_macro')

In [14]:
# Report the winning parameter combination, then rebind `classifier` to the
# best estimator found by the search so later cells predict with the tuned model.
print("Best parameters", grid.best_params_)
classifier = grid.best_estimator_

Best parameters {'C': 1.0, 'loss': 'hinge', 'tol': 0.0001}


In [15]:
# Encode the dev instances with the vectorizer fitted on the training set
# (transform, not fit_transform, so the columns line up with X_train).
X_dev = vec.transform(dev_instances)

In [16]:
# Predict a label for every row of the vectorised dev set with the tuned classifier.
predictions = classifier.predict(X_dev)

In [17]:
# Attach the predictions to the dev frame so gold and predicted labels sit side by side.
dev_df['SVM'] = predictions

# Tokens whose gold label is B-NEG and that were also predicted as B-NEG.
correct_values = dev_df.loc[(dev_df["bio"] == 'B-NEG') & (dev_df['SVM'] == 'B-NEG'), 'token']
correct_values.to_csv('correct_values.csv')

# Every row where the predicted label disagrees with the gold label.
non_correct_values = dev_df.loc[dev_df["bio"] != dev_df['SVM'], ['token', 'bio', 'SVM']]
non_correct_values.to_csv('non_correct_values.csv')

In [18]:
# Names of the feature columns used in this run; the list is only printed next to
# the report (it is not passed to classification_report). Despite the variable
# name these are input features, not class labels. The last entry must match the
# real column name "matchesMulticue" used when the instances were built.
target_names = [
    "token", "token-2", "token-1", "token+1", "token+2",
    "pos", "chunk", "lemma",
    "matchesNeg", "hasPrefix", "hasSuffix",
    "hasPrefixAntonym", "hasSuffixAntonym", "matchesMulticue",
]
# Precision / recall / F1 / support on the dev set, gold "bio" vs. predicted "SVM",
# as a DataFrame with one row per class or average.
report = pd.DataFrame(
    classification_report(y_true=dev_df['bio'], y_pred=dev_df['SVM'], output_dict=True)
).transpose()

In [19]:
# Show the metrics table, a blank separator line, then the feature set behind it.
print(report, end="\n\n")
print("Features:", target_names)

              precision    recall  f1-score       support
B-NEG          0.945455  0.886364  0.914956    176.000000
I-NEG          1.000000  0.666667  0.800000      3.000000
O              0.998433  0.999328  0.998880  13388.000000
accuracy       0.997789  0.997789  0.997789      0.997789
macro avg      0.981296  0.850786  0.904612  13567.000000
weighted avg   0.997746  0.997789  0.997747  13567.000000

Features: ['token', 'token-2', 'token-1', 'token+1', 'token+2', 'pos', 'chunk', 'lemma', 'matchesNeg', 'hasPrefix', 'hasSuffix', 'hasPrefixAntonym', 'hasSuffixAntonym', 'matchesMutlicue']
