In [1]:
import json
import pandas as pd

In [4]:
# Read the essay corpus from disk and wrap the parsed JSON in a DataFrame.
# Later cells read the columns `id`, `confirmation_bias`, `claims`,
# `major_claim`, `premises` and `paragraphs` from `data_df`.
path = "data/essay-corpus.json"
with open(path, mode='r', encoding='latin-1') as corpus_file:
    data = json.loads(corpus_file.read())

data_df = pd.DataFrame(data)

In [5]:
# One row per claim: carry the essay label and id along, explode the `claims`
# list, then split each claim record into its `span` and `text` fields.
claims_df = (
    data_df[['confirmation_bias', 'claims']]
    .assign(text_id=data_df['id'])
    .explode('claims')
)
claims_df = claims_df.assign(
    span=claims_df['claims'].map(lambda claim: claim['span']),
    claims=claims_df['claims'].map(lambda claim: claim['text']),
)

In [6]:
# One row per major claim: keep the essay id (also duplicated as `text_id`) and
# label, explode `major_claim`, then split each record into `span` and `text`.
majclaims_df = (
    data_df[['id', 'confirmation_bias', 'major_claim']]
    .assign(text_id=data_df['id'])
    .explode('major_claim')
)
majclaims_df = majclaims_df.assign(
    span=majclaims_df['major_claim'].map(lambda mc: mc['span']),
    major_claim=majclaims_df['major_claim'].map(lambda mc: mc['text']),
)

In [7]:
# One row per premise: keep the essay id (also duplicated as `text_id`) and
# label, explode `premises`, then split each record into `span` and `text`.
premises_df = (
    data_df[['id', 'confirmation_bias', 'premises']]
    .assign(text_id=data_df['id'])
    .explode('premises')
)
premises_df = premises_df.assign(
    span=premises_df['premises'].map(lambda premise: premise['span']),
    premises=premises_df['premises'].map(lambda premise: premise['text']),
)

In [8]:
# One row per paragraph: explode `paragraphs`, then pull the `sufficient` flag
# and the paragraph `text` out of each record.
para_df = data_df[['id', 'confirmation_bias', 'paragraphs']].explode('paragraphs')
para_df = para_df.assign(
    sufficient=para_df['paragraphs'].map(lambda para: para['sufficient']),
    paragraphs=para_df['paragraphs'].map(lambda para: para['text']),
)

In [9]:
# Sentiment setup: fetch the VADER lexicon, then build the shared analyzer
# instance that `assign_scores` uses.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

def assign_scores(x):
    """Return the VADER polarity scores of a text as ``[neg, neu, pos, compound]``.

    Parameters
    ----------
    x : str
        Text to score with the module-level ``analyzer``.

    Returns
    -------
    list
        Negative, neutral, positive and compound scores, in that order, so
        they line up with the ``['neg', 'neu', 'pos', 'comp']`` columns the
        callers unpack them into.
    """
    res = analyzer.polarity_scores(x)
    # Pick the values by key instead of relying on the dict's iteration order.
    return [res['neg'], res['neu'], res['pos'], res['compound']]

# Score every premise, major claim and claim, keeping the raw list in `scores`
# and spreading its four values into the `neg`/`neu`/`pos`/`comp` columns.
for frame, text_col in (
    (premises_df, "premises"),
    (majclaims_df, "major_claim"),
    (claims_df, "claims"),
):
    frame["scores"] = frame[text_col].apply(assign_scores)
    frame[['neg', 'neu', 'pos', 'comp']] = pd.DataFrame(
        frame["scores"].tolist(), index=frame.index
    )

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Akshita\AppData\Roaming\nltk_data...


# Premise sentiment split by the essay's `confirmation_bias` label.
# `avg_neg` / `avg_neg_1` hold the full `describe()` summary of the negative
# score; the other variables are plain means over the respective subset.
premises_df_true = premises_df[premises_df['confirmation_bias']==True]
avg_neg = premises_df_true['neg'].describe()
avg_neu = premises_df_true['neu'].sum() / premises_df_true.shape[0]
avg_pos = premises_df_true['pos'].sum() / premises_df_true.shape[0]
avg_comp = premises_df_true['comp'].sum() / premises_df_true.shape[0]

premises_df_false = premises_df[premises_df['confirmation_bias']==False]
avg_neg_1 = premises_df_false['neg'].describe()
# Normalise by the size of the False subset itself; dividing these sums by the
# True subset's row count would not yield a mean.
avg_neu_1 = premises_df_false['neu'].sum() / premises_df_false.shape[0]
avg_pos_1 = premises_df_false['pos'].sum() / premises_df_false.shape[0]
avg_comp_1 = premises_df_false['comp'].sum() / premises_df_false.shape[0]

%matplotlib inline

import matplotlib.pyplot as plt
# Compound vs. negative score of every premise, coloured by the essay label.
fig, ax = plt.subplots()
ax.scatter(premises_df_false['comp'], premises_df_false['neg'], color = 'red',
           label = 'confirmation_bias == False')
ax.scatter(premises_df_true['comp'], premises_df_true['neg'], color = 'blue',
           label = 'confirmation_bias == True')
# Label the figure so it can be read on its own.
ax.set_xlabel('compound score (comp)')
ax.set_ylabel('negative score (neg)')
ax.set_title('Premise sentiment by confirmation-bias label')
ax.legend()
plt.show()

In [10]:
# Per-essay totals of the four sentiment scores, one frame per component type,
# each indexed by `text_id`.
score_cols = ['neg', 'neu', 'pos', 'comp']
claims = claims_df.groupby('text_id')[score_cols].sum()
maj_claims = majclaims_df.groupby('text_id')[score_cols].sum()
premises = premises_df.groupby('text_id')[score_cols].sum()

In [11]:
# Join the per-essay claim, major-claim and premise totals, then attach the label.
# The default merge suffixes apply: claim columns end in `_x`, major-claim
# columns end in `_y`, and the premise columns keep their plain names.
df1 = claims.merge(maj_claims, how='inner', on='text_id')
df2 = df1.merge(premises, how='inner', on='text_id')
essay_labels = data_df[['id', 'confirmation_bias']]
final_df = df2.merge(essay_labels, how='inner', left_on='text_id', right_on='id')

In [12]:
# Average each score over the three component types (plain, `_x` and `_y`
# columns). The plain column is overwritten in place, so re-running this cell
# without rebuilding `final_df` averages an already-averaged value.
for score in ('neg', 'pos', 'neu', 'comp'):
    total = final_df[score] + final_df[score + '_x'] + final_df[score + '_y']
    final_df[score] = total / 3

In [None]:
#final_df = final_df.drop(['neg_x','neg_y', 'pos_x', 'pos_y', 'neu_x', 'neu_y', 'comp_x', 'comp_y'], 1)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features: the per-essay averaged sentiment scores; target: the confirmation-bias label.
x = final_df[['neg','pos','neu','comp']]
y = final_df['confirmation_bias']
# NOTE(review): the split is not stratified, so the class ratio in train and
# test may differ from the ratio in the full data.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state=109)

from sklearn import svm, metrics
# Baseline: linear-kernel SVM with default regularisation.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# print("F1-score",metrics.f1_score(y_test, y_pred))
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# print("Precision:",metrics.precision_score(y_test, y_pred))
# print("Recall:",metrics.recall_score(y_test, y_pred))

# zero_division=0 reports 0.00 for a class that is never predicted without
# emitting an UndefinedMetricWarning for each averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        49
        True       0.60      1.00      0.75        72

    accuracy                           0.60       121
   macro avg       0.30      0.50      0.37       121
weighted avg       0.35      0.60      0.44       121



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Pending:
1. Hyper-parameter Tuning
2. Cross Validation
3. Using proper test/train data
4. Documentation

In [14]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated F1 of `clf` on the full feature matrix.
scores = cross_val_score(estimator=clf, X=x, y=y, scoring="f1", cv=10)
print(scores)

[0.75757576 0.7761194  0.76923077 0.76923077 0.76923077 0.76923077
 0.76923077 0.76923077 0.76923077 0.76923077]


Hyper-parameter Tuning - Grid Search

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
 
# Search space: five values of C crossed with three kernels, gamma fixed to 'scale'.
param_grid = dict(
    C=[50, 10, 1.0, 0.1, 0.01],
    gamma=['scale'],
    kernel=['poly', 'rbf', 'sigmoid'],
)

# 10-fold stratified cross-validation, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    refit=True,
    error_score=0,
    verbose=3,
)

grid_result = grid_search.fit(X_train, y_train)


Fitting 30 folds for each of 15 candidates, totalling 450 fits


In [29]:
# Report the winning configuration, then the mean (std) test score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

Best: 0.643021 using {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.634688 (0.019106) with: {'C': 50, 'gamma': 'scale', 'kernel': 'poly'}
0.643021 (0.027007) with: {'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}
0.501642 (0.100132) with: {'C': 50, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.637069 (0.011966) with: {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}
0.643021 (0.023650) with: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.504064 (0.099083) with: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.637069 (0.011966) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}
0.637069 (0.011966) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
0.511248 (0.104078) with: {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.637069 (0.011966) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}
0.637069 (0.011966) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
0.637069 (0.011966) with: {'C': 0.1, 'gamma': 'scale', 'kernel': 'sigmoid'}
0.637069 (0.011966) with: {'C': 0.01, 'gamma': 'scale', 

In [30]:
# Show the parameter combination the grid search selected as best.
print(grid_result.best_params_)

{'C': 50, 'gamma': 'scale', 'kernel': 'rbf'}


In [31]:
# Show the estimator refit with the best parameters (the search was built with refit=True).
print(grid_result.best_estimator_)

SVC(C=50)


In [32]:
# Evaluate the refit best estimator from the grid search on the held-out test split.
grid_predictions = grid_result.predict(X_test)

print(classification_report(y_true=y_test, y_pred=grid_predictions))

              precision    recall  f1-score   support

       False       0.00      0.00      0.00        49
        True       0.59      0.97      0.73        72

    accuracy                           0.58       121
   macro avg       0.29      0.49      0.37       121
weighted avg       0.35      0.58      0.44       121

