In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from utils import Loader, FeatureExtractor, ID_TO_LABEL, f_scorer

In [2]:
loader_train = Loader().load_dataset('./data/train/train_stances.csv', './data/train/train_bodies.csv')

In [3]:
loader_train.headlines[0], loader_train.id_to_body[int(loader_train.headlines[0]['Body ID'])]

({'Body ID': 712,
  'Headline': "Police find mass graves with at least '15 bodies' near Mexico town where 43 students disappeared after police clash",
  'Stance': 'unrelated'},
 'Danny Boyle is directing the untitled film\n\nSeth Rogen is being eyed to play Apple co-founder Steve Wozniak in Sony’s Steve Jobs biopic.\n\nDanny Boyle is directing the untitled film, based on Walter Isaacson\'s book and adapted by Aaron Sorkin, which is one of the most anticipated biopics in recent years.\n\nNegotiations have not yet begun, and it’s not even clear if Rogen has an official offer, but the producers — Scott Rudin, Guymon Casady and Mark Gordon — have set their sights on the talent and are in talks.\n\nOf course, this may all be for naught as Christian Bale, the actor who is to play Jobs, is still in the midst of closing his deal. Sources say that dealmaking process is in a sensitive stage.\n\nInsiders say Boyle will is flying to Los Angeles to meet with actress to play one of the female leads,

In [4]:
loader_test = Loader().load_dataset('./data/test/test_stances.csv', './data/test/test_bodies.csv')

In [5]:
fe = FeatureExtractor(5005)
train_features, train_stances, valid_features, valid_stances = fe.get_train_fit_vect(loader_train, loader_test)

In [6]:
train_features[0].sum()

9.0255298052456059

In [7]:
valid_features[0].sum()

10.2558829302017

In [8]:
test_features, test_stances = fe.transform(loader_test, True)

In [9]:
test_features[0].sum()

12.139425247670783

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
from sklearn.metrics import classification_report,confusion_matrix
valid_stances_y = [ID_TO_LABEL[x] for x in valid_stances]
test_stances_y = [ID_TO_LABEL[x] for x in test_stances]

In [12]:
from sklearn.metrics import f1_score

In [13]:
def score(X, y):
    pred = model.predict(X)
    pred_l = [ID_TO_LABEL[x] for x in pred]
    
    print(classification_report(y_pred=pred_l, y_true=y))
    print(confusion_matrix(y_pred=pred_l, y_true=y))
    print('score:{}'.format(f_scorer(y_pred=pred_l, y_true=y, labels=True)))
    return f1_score(y_pred=pred_l, y_true=y, average='weighted')

In [14]:
C = [0.01,0.1,1,10,100]
best_params = {
    'score':-1
}

for c in C:
    model = LogisticRegression(C=c, class_weight='balanced').fit(train_features,train_stances)
    sc = score(X=valid_features, y=valid_stances_y)
    if best_params['score'] < sc:
        best_params['score'] = sc
        best_params['C'] = c
        
print("**********BEST FOUND**********")
print(best_params)

             precision    recall  f1-score   support

      agree       0.74      0.39      0.51       770
   disagree       0.29      0.45      0.35       154
    discuss       0.81      0.72      0.76      1700
  unrelated       0.94      1.00      0.97      7371

avg / total       0.89      0.89      0.89      9995

[[ 301   89  227  153]
 [  30   69   35   20]
 [  76   80 1225  319]
 [   1    0   32 7338]]
score:3563.75
             precision    recall  f1-score   support

      agree       0.75      0.56      0.65       770
   disagree       0.37      0.62      0.46       154
    discuss       0.85      0.81      0.83      1700
  unrelated       0.97      0.99      0.98      7371

avg / total       0.93      0.92      0.92      9995

[[ 435   99  181   55]
 [  33   95   19    7]
 [ 109   64 1385  142]
 [   1    1   46 7323]]
score:3872.0
             precision    recall  f1-score   support

      agree       0.82      0.75      0.78       770
   disagree       0.44      0.67      

In [15]:
model = LogisticRegression(C=best_params['C']).fit(train_features + valid_features, train_stances + valid_stances)

In [16]:
score(test_features, test_stances_y)

             precision    recall  f1-score   support

      agree       0.24      0.43      0.31      1903
   disagree       0.33      0.00      0.01       697
    discuss       0.56      0.49      0.52      4464
  unrelated       0.97      0.95      0.96     18349

avg / total       0.82      0.81      0.81     25413

[[  825     4   924   150]
 [  300     3   278   116]
 [ 1974     2  2189   299]
 [  358     0   532 17459]]
score:8252.25


0.8081271503156634

In [17]:
from sklearn.svm import LinearSVC

In [18]:
C = [0.01,0.1,1,10,100]
best_params = {
    'score':-1
}

for c in C:
    model = LinearSVC(C=c, class_weight='balanced').fit(train_features,train_stances)
    sc = score(X=valid_features, y=valid_stances_y)
    if best_params['score'] < sc:
        best_params['score'] = sc
        best_params['C'] = c
        
print("**********BEST FOUND**********")
print(best_params)

             precision    recall  f1-score   support

      agree       0.76      0.56      0.65       770
   disagree       0.35      0.61      0.45       154
    discuss       0.85      0.83      0.84      1700
  unrelated       0.98      0.99      0.98      7371

avg / total       0.93      0.93      0.92      9995

[[ 432  103  184   51]
 [  34   94   19    7]
 [ 101   66 1404  129]
 [   1    2   45 7323]]
score:3887.5
             precision    recall  f1-score   support

      agree       0.82      0.75      0.78       770
   disagree       0.45      0.69      0.54       154
    discuss       0.91      0.90      0.90      1700
  unrelated       0.99      0.99      0.99      7371

avg / total       0.96      0.95      0.95      9995

[[ 580   88   86   16]
 [  36  106   10    2]
 [  90   36 1522   52]
 [   2    8   48 7313]]
score:4122.75
             precision    recall  f1-score   support

      agree       0.87      0.85      0.86       770
   disagree       0.51      0.67      

In [19]:
model = LinearSVC(C=best_params['C']).fit(train_features + valid_features, train_stances + valid_stances)

In [20]:
score(test_features, test_stances_y)

             precision    recall  f1-score   support

      agree       0.22      0.45      0.29      1903
   disagree       0.15      0.01      0.01       697
    discuss       0.49      0.49      0.49      4464
  unrelated       0.97      0.90      0.93     18349

avg / total       0.81      0.77      0.78     25413

[[  860    13   907   123]
 [  316     5   274   102]
 [ 2019    11  2179   255]
 [  768     4  1053 16524]]
score:8060.0


0.78352451131568934