In [328]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so the notebook still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [329]:
# Load the pickled inputs. `train_X` is indexed by column name below
# (e.g. 'postText', 'targetTitle'); `train_Y` supplies the 'truthClass' and
# 'truthMedian' targets used later in the notebook.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train_X = pd.read_pickle("./train.pkl")
train_Y = pd.read_pickle("./b.pkl")

In [330]:
train_X['postText'].head()

0    [UK’s response to modern slavery leaving victi...
1                                       [this is good]
2    [The "forgotten" Trump roast: Relive his bruta...
3               [Meet the happiest #dog in the world!]
4    [Tokyo's subway is shut down amid fears over a...
Name: postText, dtype: object

In [331]:
# Load the pretrained 'bert-base-uncased' model and its matching fast tokenizer.
# NOTE(review): only `tokenizer` is used in the visible cells; `bert` does not
# appear to be called anywhere below — confirm whether it is still needed.
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [332]:
train_X['targetTitle'].head()

0    ‘Inexcusable’ failures in UK’s response to mod...
1    Donald Trump Appoints Pro-Life Advocate as Ass...
2    The ‘forgotten’ Trump roast: Relive his brutal...
3    Meet The Happiest Dog In The World, Maru The H...
4    Tokyo's subway is shut down amid fears over an...
Name: targetTitle, dtype: object

In [333]:
import re

In [334]:
# Shared Porter stemmer instance; preprocess() uses it when its flag is set.
ps = PorterStemmer()

In [335]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess(text, flag=False):
    """Clean a piece of text, optionally removing stop words and stemming.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()`` first.
    flag : bool, default False
        When truthy, English stop words are dropped and the remaining words
        are stemmed with the notebook-level ``ps`` stemmer.

    Returns
    -------
    str
        The text with every character other than ASCII letters, digits and
        spaces removed and, if ``flag`` is set, stop-word-filtered and stemmed
        words re-joined with single spaces.
    """
    text = str(text)
    # Keep only ASCII letters, digits and spaces.
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)
    if not flag:
        return text
    # Look the stop words up in a cached set: the previous version re-read the
    # whole NLTK stop-word list for every single word of every text.
    stop_words = _english_stop_words()
    return ' '.join(ps.stem(word) for word in text.split(' ') if word not in stop_words)

In [336]:
# Sanity check: clean the first element of the first post text with the
# stop-word removal / stemming branch of preprocess() enabled.
preprocess(train_X['postText'][0][0], True)

'uk respons modern slaveri leav victim destitut abus go free'

In [337]:
# Switch forwarded to preprocess(); False skips stop-word removal and stemming.
flag = False

# Clean every text field. For 'postText' and 'targetParagraphs' only the first
# element of each entry is cleaned; the other fields are cleaned as a whole.
postText = [preprocess(entry[0], flag) for entry in train_X['postText']]
targetParagraphs = [preprocess(entry[0], flag) for entry in train_X['targetParagraphs']]
targetTitle = [preprocess(entry, flag) for entry in train_X['targetTitle']]
targetDescription = [preprocess(entry, flag) for entry in train_X['targetDescription']]
targetKeywords = [preprocess(entry, flag) for entry in train_X['targetKeywords']]

In [362]:
postText[0]

'UKs response to modern slavery leaving victims destitute while abusers go free'

In [363]:
# Fixed token length passed as `max_length` to every tokenizer call below
# (sequences are padded or truncated to this size).
max_seq_len = 100
max_seq_len


100

In [372]:
# Tokenize the cleaned post texts, padding/truncating each one to
# max_seq_len tokens; token-type ids are not requested.
postText_train = tokenizer.batch_encode_plus(
    postText,
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [373]:
# Tokenize the cleaned target paragraphs with the same settings as the other
# fields (pad/truncate to max_seq_len, no token-type ids).
targetParagraphs_train = tokenizer.batch_encode_plus(
    targetParagraphs,
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [374]:
# Tokenize the cleaned target titles with the same settings as the other
# fields (pad/truncate to max_seq_len, no token-type ids).
targetTitle_train = tokenizer.batch_encode_plus(
    targetTitle,
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [375]:
# Tokenize the cleaned target descriptions with the same settings as the other
# fields (pad/truncate to max_seq_len, no token-type ids).
targetDescription_train = tokenizer.batch_encode_plus(
    targetDescription,
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [376]:
# Tokenize the cleaned target keywords with the same settings as the other
# fields (pad/truncate to max_seq_len, no token-type ids).
targetKeywords_train = tokenizer.batch_encode_plus(
    targetKeywords,
    max_length = max_seq_len,
    padding='max_length',
    truncation=True,
    return_token_type_ids=False
)

In [345]:
def cosine(u, v):
    """Return the cosine similarity of the vectors ``u`` and ``v``.

    The value is the dot product of the two vectors divided by the product
    of their Euclidean norms.
    """
    inner = np.dot(u, v)
    scale = np.linalg.norm(u) * np.linalg.norm(v)
    return inner / scale

In [377]:
def _input_ids(encoded):
    """Return the token-id lists held by a tokenizer result.

    A plain list is returned unchanged, so re-running this cell after the
    names below were already replaced by their token ids does not fail
    (previously a second run indexed a list with 'input_ids' and raised).
    """
    return encoded if isinstance(encoded, list) else encoded['input_ids']


# Keep only the token ids of each encoded field; the variable names are
# reused because later cells expect them to hold the id lists.
postText_train = _input_ids(postText_train)
targetParagraphs_train = _input_ids(targetParagraphs_train)
targetTitle_train = _input_ids(targetTitle_train)
targetDescription_train = _input_ids(targetDescription_train)
targetKeywords_train = _input_ids(targetKeywords_train)

In [371]:
# Check the length of the first entry of the encoded post texts.
len(postText_train[0])

100

In [348]:
from scipy import spatial


def embed_cosine(col1, col2):
    """Compute the row-wise cosine similarity between two columns of vectors.

    Parameters
    ----------
    col1, col2 : pandas.Series or sequence
        Equal-length collections whose i-th elements are numeric vectors.
        Pandas objects are read positionally through ``.iloc``; plain
        lists, tuples or arrays are indexed directly.

    Returns
    -------
    list of float
        ``1 - scipy.spatial.distance.cosine(col1[i], col2[i])`` for every
        position ``i``, in order.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of rows (extra rows
        in ``col2`` used to be ignored silently).
    """
    if len(col1) != len(col2):
        raise ValueError(
            'embed_cosine expects inputs of equal length, got %d and %d'
            % (len(col1), len(col2))
        )
    # Use positional access when it exists (pandas), otherwise index directly.
    rows1 = getattr(col1, 'iloc', col1)
    rows2 = getattr(col2, 'iloc', col2)
    return [
        1 - spatial.distance.cosine(rows1[i], rows2[i])
        for i in range(len(col1))
    ]
    

In [378]:
# Gather the token-id lists of the five text fields into one frame,
# one column per field, then preview the first rows.
df = pd.DataFrame(dict(
    postText=postText_train,
    targetParagraphs=targetParagraphs_train,
    targetTitle=targetTitle_train,
    targetDescription=targetDescription_train,
    targetKeywords=targetKeywords_train,
))
df.head()

Unnamed: 0,postText,targetParagraphs,targetTitle,targetDescription,targetKeywords
0,"[101, 2866, 2015, 3433, 2000, 2715, 8864, 2975...","[101, 5190, 1997, 2715, 8864, 5694, 4033, 4140...","[101, 1999, 10288, 7874, 3085, 15428, 1999, 28...","[101, 1999, 10288, 7874, 3085, 15428, 1999, 19...","[101, 2715, 8864, 2533, 2005, 2147, 1998, 2202..."
1,"[101, 2023, 2003, 2204, 102, 0, 0, 0, 0, 0, 0,...","[101, 2343, 6221, 8398, 2038, 2805, 1996, 2157...","[101, 6221, 8398, 16823, 2015, 4013, 15509, 81...","[101, 2343, 6221, 8398, 2038, 2805, 4013, 1550...","[101, 4841, 2142, 2005, 2166, 2852, 11084, 181..."
2,"[101, 1996, 6404, 8398, 25043, 2128, 3669, 372...","[101, 2043, 1996, 2860, 16584, 2063, 2160, 113...","[101, 1996, 6404, 8398, 25043, 2128, 3669, 372...","[101, 2343, 8398, 2180, 2102, 2022, 2012, 2023...","[101, 8398, 1059, 16257, 2094, 1059, 16257, 20..."
3,"[101, 3113, 1996, 5292, 9397, 10458, 3899, 199...","[101, 23677, 2003, 2763, 2019, 2104, 9153, 185...","[101, 3113, 1996, 5292, 9397, 10458, 3899, 199...","[101, 1996, 3720, 2003, 2055, 26280, 1037, 187...","[101, 26280, 18758, 6077, 25462, 2015, 6519, 7..."
4,"[101, 5522, 2015, 10798, 2003, 3844, 2091, 134...","[101, 2028, 1997, 5522, 2015, 2350, 10798, 201...","[101, 5522, 2015, 10798, 2003, 3844, 2091, 134...","[101, 1996, 5741, 8636, 2029, 6354, 2702, 2781...","[101, 5522, 6342, 2497, 14035, 6979, 24475, 26..."


In [360]:
# Check the length of the first entry of the encoded post texts.
# NOTE(review): this cell carries an older execution count than the
# tokenization cells above, so its saved output appears to come from an
# earlier run — re-run or remove it.
len(postText_train[0])

35

In [379]:
# Feature frame: one cosine-similarity column per pair of text fields.
# Each entry below is (output column, left field, right field); the column
# names and their order are exactly the ones downstream cells rely on.
# NOTE(review): the `df` columns hold padded tokenizer ids rather than
# embeddings, so these similarities compare raw vocabulary indices — confirm
# that this is intended.
df_cos = pd.DataFrame({
    column: embed_cosine(df[left], df[right])
    for column, left, right in (
        ('postText_Paragraph_Similarity', 'postText', 'targetParagraphs'),
        ('postText_Title_Similarity', 'postText', 'targetTitle'),
        ('postText_Description_Similarity', 'postText', 'targetDescription'),
        ('postText_keyword_Similarity', 'postText', 'targetKeywords'),
        ('Paragraph_Title_Similarity', 'targetParagraphs', 'targetTitle'),
        ('Paragraph_Description_Similarity', 'targetParagraphs', 'targetDescription'),
        ('Paragraph_targetKeywords_Similarity', 'targetParagraphs', 'targetKeywords'),
        ('targetTitle_targetDescription_Similarity', 'targetTitle', 'targetDescription'),
        ('targetTitle_targetKeywords_Similarity', 'targetTitle', 'targetKeywords'),
        ('targetDescription_targetKeywords_Similarity', 'targetDescription', 'targetKeywords'),
    )
})

In [380]:
df_cos.head()

Unnamed: 0,postText_Paragraph_Similarity,postText_Title_Similarity,postText_Description_Similarity,postText_keyword_Similarity,Paragraph_Title_Similarity,Paragraph_Description_Similarity,Paragraph_targetKeywords_Similarity,targetTitle_targetDescription_Similarity,targetTitle_targetKeywords_Similarity,targetDescription_targetKeywords_Similarity
0,0.168075,0.2536,0.187662,0.366641,0.276058,0.30227,0.177571,0.657753,0.295508,0.218742
1,0.235074,0.640069,0.287657,0.116352,0.440888,0.697394,0.68251,0.503993,0.43864,0.558571
2,0.275484,1.0,0.507935,0.201544,0.275484,0.204334,0.475822,0.507935,0.201544,0.318245
3,0.204898,0.370119,0.43155,0.636495,0.384408,0.216762,0.31037,0.451149,0.234416,0.183145
4,0.503692,1.0,0.485661,0.62591,0.503692,0.619083,0.547978,0.485661,0.62591,0.326155


## For classification

In [381]:
import xgboost as xgb
# !pip install xgboost

In [382]:
# Binary classification target: encode the string labels as integers
# ('no-clickbait' -> 0, 'clickbait' -> 1).
# replace() is called without inplace=True so a new Series is produced: the
# column inside `train_Y` is never modified through `y`, and re-running the
# cell always starts from the same source data.
y = train_Y['truthClass'].replace({'no-clickbait': 0, 'clickbait': 1})

In [383]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Stratified 70/30 shuffle split with a fixed seed. Five splits are generated
# but only the last one is used for X_train/X_test/y_train/y_test — the same
# result a loop that overwrites these names on every iteration ends with.
# NOTE(review): if a single split is all that is needed, n_splits could be
# reduced, but that would presumably select a different split — confirm first.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
train_index, test_index = list(sss.split(df_cos, y))[-1]
X_train, X_test = df_cos.iloc[train_index], df_cos.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [384]:
# Gradient-boosted tree classifier with a binary logistic objective, fitted on
# the training split of the similarity features.
# NOTE(review): the saved classification report shows recall 0.00 for class 1,
# i.e. the model predicts almost only class 0 — consider class weighting or
# better features.
xgbmodel = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=50,
    max_depth=5,
    learning_rate=0.1,
    random_state=1,
)
xgbmodel.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=50, n_jobs=12, num_parallel_tree=1,
              objective='binary:logistic', random_state=1, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [385]:
# Predict labels for the held-out split with the fitted classifier.
ypred = xgbmodel.predict(X_test)

## Results with the removal/cleaning step in preprocessing

In [386]:
# Per-class precision / recall / F1 of the predictions on the held-out split.
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      4434
           1       0.33      0.00      0.00      1428

    accuracy                           0.76      5862
   macro avg       0.54      0.50      0.43      5862
weighted avg       0.65      0.76      0.65      5862



## Results without the removal/cleaning step

In [266]:
# Per-class precision / recall / F1 of the predictions on the held-out split.
# NOTE(review): this cell has an older execution count than the report above,
# so its saved output comes from a different run — keep only one of the two.
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       0.76      1.00      0.86      4434
           1       0.33      0.00      0.00      1428

    accuracy                           0.76      5862
   macro avg       0.54      0.50      0.43      5862
weighted avg       0.65      0.76      0.65      5862



In [267]:
# Compare the classifier's score on the training split and the held-out split
# (the saved test value matches the accuracy shown in the report above).
print("train score:", xgbmodel.score(X_train, y_train))
print("test score:", xgbmodel.score(X_test, y_test))

train score: 0.7574583211465341
test score: 0.7560559535994541


In [272]:
import pickle

# Persist the fitted XGBoost classifier.
# - `pickle` is imported here because no earlier cell imports it.
# - The file used to be 'svr.sav', the same path the SVR pipeline is pickled
#   to in the regression section, so the two models overwrote each other; the
#   classifier now gets its own file.
# - The `with` block guarantees the file handle is flushed and closed.
filename = 'xgb.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(xgbmodel, model_file)

## For regression

In [239]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

In [None]:
# NOTE(review): head() is not the last expression of the cell, so it displays
# nothing here.
train_Y.head()
# Regression target: rebinds `y` (previously the class labels) to the
# 'truthMedian' column.
y = train_Y['truthMedian']

In [244]:
# Stratified 70/30 shuffle split with a fixed seed, now against the regression
# target in `y`. Five splits are generated but only the last one is used for
# X_train/X_test/y_train/y_test — the same result a loop that overwrites these
# names on every iteration ends with.
# NOTE(review): stratification presumably treats each distinct target value as
# a class; confirm that this is appropriate for the regression target.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=1)
train_index, test_index = list(sss.split(df_cos, y))[-1]
X_train, X_test = df_cos.iloc[train_index], df_cos.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [245]:
# Regression model: standardize the similarity features, then fit a support
# vector regressor (C=1.0, epsilon=0.2) on the training split.
regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
regr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svr',
                 SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2,
                     gamma='scale', kernel='rbf', max_iter=-1, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [246]:
# Predict the regression target for the held-out split.
y_pred = regr.predict(X_test)

In [253]:
# Compare the pipeline's score on the training split and the held-out split
# (the saved test value matches the R2 score reported further down).
# NOTE(review): both saved scores are close to zero, i.e. the model explains
# almost none of the variance.
print("train score:", regr.score(X_train, y_train))
print("test score:", regr.score(X_test, y_test))

train score: 0.004032122775924796
test score: -0.010303779058122409


In [250]:
import sklearn.metrics as skm
def normalized_mean_squared_error(truth, predictions):
    """Return the MSE of ``predictions`` relative to a constant-mean baseline.

    The baseline predicts ``mean(truth)`` for every sample; the result is the
    mean squared error of ``predictions`` divided by the mean squared error of
    that baseline.
    """
    mean_baseline = np.full(len(truth), np.mean(truth))
    baseline_mse = skm.mean_squared_error(truth, mean_baseline)
    model_mse = skm.mean_squared_error(truth, predictions)
    return model_mse / baseline_mse

# Lookup table from a human-readable metric name to the callable computing it;
# the reporting loop below calls each one as metric(y_test, y_pred).
regression_measures = {'Explained variance': skm.explained_variance_score,
                       'Mean absolute error': skm.mean_absolute_error,
                       'Mean squared error': skm.mean_squared_error,
                       'Median absolute error': skm.median_absolute_error,
                       'R2 score': skm.r2_score,
                       'Normalized mean squared error': normalized_mean_squared_error}

In [251]:
def write_result(key, value):
    """Print ``key`` and ``value`` as ``"<key>: <value>"``.

    ``value`` is rounded to three decimal places before it is printed.
    """
    rounded = round(value, ndigits=3)
    print(': '.join((key, str(rounded))))

In [252]:
# Report every regression metric on the held-out split, in alphabetical
# order of the metric name.
for name in sorted(regression_measures):
    write_result(name, regression_measures[name](y_test, y_pred))

Explained variance: -0.0
Mean absolute error: 0.346
Mean squared error: 0.186
Median absolute error: 0.201
Normalized mean squared error: 1.01
R2 score: -0.01


In [271]:
import pickle

# Persist the fitted SVR pipeline. The `with` block guarantees the file handle
# is flushed and closed, which the bare open() call did not.
filename = 'svr.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regr, model_file)

## LIME explanations

In [255]:
# pip install lime

Collecting limeNote: you may need to restart the kernel to use updated packages.
  Using cached lime-0.2.0.1-py3-none-any.whl

Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [257]:
# LIME text explainer labelled with the two target classes; the list order
# matches the label encoding used above (index 0 = 'no-clickbait',
# index 1 = 'clickbait').
from lime.lime_text import LimeTextExplainer
class_names = ['no-clickbait', 'clickbait']
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
# NOTE(review): this cell cannot run as written and has no execution count.
# - `c` and `newsgroups_test` are not defined anywhere in the visible notebook;
#   the cell appears to be carried over from a different text-classification
#   example (see the 'christian' label in the message below).
# - `X_train` is a DataFrame of numeric similarity features at this point, so
#   `X_train[idx]` is a column lookup by label, not a text document.
# TODO: pass the raw text of sample `idx` and a predict function that maps
# text to class probabilities of the model actually trained here — confirm
# the intended design before rewriting.
idx = 83
exp = explainer.explain_instance(X_train[idx], c.predict_proba, num_features=6)
print('Document id: %d' % idx)
print('Probability(christian) =', c.predict_proba([newsgroups_test.data[idx]])[0,1])
print('True class: %s' % class_names[newsgroups_test.y[idx]])

In [269]:
X_train

Unnamed: 0,postText_Paragraph_Similarity,postText_Title_Similarity,postText_Description_Similarity,postText_keyword_Similarity,Paragraph_Title_Similarity,Paragraph_Description_Similarity,Paragraph_targetKeywords_Similarity,targetTitle_targetDescription_Similarity,targetTitle_targetKeywords_Similarity,targetDescription_targetKeywords_Similarity
12143,0.615266,1.000000,0.557225,0.281022,0.615266,0.415135,0.580317,0.557225,0.281022,0.159853
11731,0.232824,0.153995,0.558206,0.377146,0.240438,0.264545,0.575264,0.295915,0.221162,0.321712
3489,0.117497,1.000000,0.373235,0.055293,0.117497,0.147437,0.023730,0.373235,0.055293,0.046211
6676,0.236771,0.313269,0.313269,0.430332,0.183795,0.183795,0.323081,1.000000,0.368015,0.368015
16864,0.149397,0.206328,0.206328,0.408917,0.483936,0.483936,0.402922,1.000000,0.546224,0.546224
...,...,...,...,...,...,...,...,...,...,...
8147,0.309691,1.000000,0.309691,0.059518,0.309691,1.000000,0.346146,0.309691,0.059518,0.346146
18832,0.444726,0.590924,0.812540,0.192351,0.322054,0.534720,0.113796,0.440333,0.218890,0.113772
18884,0.209918,1.000000,0.418189,0.097832,0.209918,0.538340,0.580317,0.418189,0.097832,0.102819
4046,0.256056,0.726650,0.290753,0.444153,0.250910,0.300422,0.189003,0.358773,0.297957,0.244233
