In [1]:
import pandas as pd
import numpy as np

In [2]:
# is_fake_title.csv
# is_sus.csv
# is_toxic.csv
# trfl.csv


dataset = 'is_sus'
news = pd.read_csv(f"data/{dataset}.csv")
news.head()

Unnamed: 0.1,Unnamed: 0,text,is_fake
0,0,Get fucking real dude.,1
1,1,She is as dirty as they come and that crook ...,1
2,2,why did you fuck it up. I could do it all day...,1
3,3,Dude they dont finish enclosing the fucking s...,1
4,4,WTF are you talking about Men? No men thats n...,1


In [3]:
# category
target = 'is_fake'
news = news[['text',target]].dropna()
news.head()

Unnamed: 0,text,is_fake
0,Get fucking real dude.,1
1,She is as dirty as they come and that crook ...,1
2,why did you fuck it up. I could do it all day...,1
3,Dude they dont finish enclosing the fucking s...,1
4,WTF are you talking about Men? No men thats n...,1


In [4]:
news.describe(include='all')

Unnamed: 0,text,is_fake
count,20001,20001.0
unique,14637,
top,#NAME?,
freq,41,
mean,,0.39108
std,,0.488005
min,,0.0
25%,,0.0
50%,,0.0
75%,,1.0


# Defining the tokenizer function

In [9]:
import tensorflow as tf
import transformers
import tqdm
from keras.preprocessing import sequence

#creating a function
def func_tokenizer(tokenizer_name, docs):
    features = []
    for doc in tqdm.tqdm(docs, desc = 'converting documents to features'):
        tokens = tokenizer_name.tokenize(doc)
        ids = tokenizer_name.convert_tokens_to_ids(tokens)
        features.append(ids)
    return features
print("The function is created successfully")

The function is created successfully
The function is created successfully


In [10]:
from sklearn.model_selection import train_test_split
X, y = news['text'], news[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# All hail Roberta from hugging face library of transformers 🤗

In [9]:
#Initialize bert tokenizer
roberta_tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base-openai-detector')

In [10]:
X_train[5777:6000]

527      Yeah! id do her like my nigga did them white c...
11742    dood  seriously  why was 70's cinema so fuckin...
11354    Damn  I'm sitting here checking the score to m...
14444                            well i know who this is..
19668                          I want my mom to _________.
                               ...                        
19297                                                7 1/2
12777    that sucks dude.  so it was just your hard drive?
5279     Ok is it gay day today? or are those some cute...
11624    I always forget about Yelp  but it's pretty da...
9434                                           Damn right!
Name: text, Length: 223, dtype: object

In [11]:
roberta_train_features = func_tokenizer(roberta_tokenizer, X_train)
roberta_test_features = func_tokenizer(roberta_tokenizer, X_test)

converting documents to features: 100%|██████████| 16000/16000 [00:02<00:00, 5552.60it/s]
converting documents to features: 100%|██████████| 4001/4001 [00:00<00:00, 5775.24it/s]


In [12]:
roberta_trg = sequence.pad_sequences(roberta_train_features, maxlen = 500)
roberta_test = sequence.pad_sequences(roberta_test_features, maxlen = 500)


array([[    0,     0,     0, ...,  2721,  5582,     4],
       [    0,     0,     0, ...,   127,  3795,  1592],
       [    0,     0,     0, ...,   328,  4832, 38203],
       ...,
       [    0,     0,     0, ..., 26363,  2099,     4],
       [    0,     0,     0, ...,   995, 20042,   569],
       [    0,     0,     0, ...,   581,   847,  2185]], dtype=int32)

# XGBoost

In [80]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)
from xgboost import XGBClassifier


In [81]:
xgb = XGBClassifier(n_estimators = 1000, learning_rate = 0.15, max_depth = 9,
                    eval_metric = 'auc', use_label_encoder=False,objective = 'binary:logistic')


In [82]:
xgb.fit(roberta_trg, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.15, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=9, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [83]:
from sklearn.metrics import accuracy_score, roc_auc_score

xgb_pred = xgb.predict(roberta_test)
xgb_score = accuracy_score(y_test, xgb_pred)
xgb_roc = roc_auc_score(y_test, xgb_pred)
print("The accuracy of XGBOOST is: %0.2f" %xgb_score)
print("The roc_auc score of XGBOOST is: %0.2f" %xgb_roc)

The accuracy of XGBOOST is: 0.95
The roc_auc score of XGBOOST is: 0.95


In [84]:
from sklearn.metrics import classification_report
cr = classification_report(y_test, xgb_pred, output_dict=True)
print(cr)
report_1 = pd.DataFrame(cr).transpose().head()
report_1.to_csv(f'report/{dataset}_report_xgb.csv')

{'0': {'precision': 0.9661876584953508, 'recall': 0.943069306930693, 'f1-score': 0.9544885177453026, 'support': 2424}, '1': {'precision': 0.9155963302752294, 'recall': 0.9492707672796449, 'f1-score': 0.9321295143212952, 'support': 1577}, 'accuracy': 0.9455136215946014, 'macro avg': {'precision': 0.94089199438529, 'recall': 0.9461700371051689, 'f1-score': 0.9433090160332989, 'support': 4001}, 'weighted avg': {'precision': 0.9462470125060652, 'recall': 0.9455136215946014, 'f1-score': 0.9456756838538606, 'support': 4001}}


In [85]:
xgb.save_model(f'model/{dataset}_xgb.bin')

# CatBoost

In [86]:
from catboost import CatBoostClassifier
cb = CatBoostClassifier(eval_metric = 'Accuracy', iterations = 2000, learning_rate = 0.2)

In [87]:
cb.fit(roberta_trg, y_train, verbose = 0)
cb_pred = cb.predict(roberta_test)
cb_score = accuracy_score(y_test, cb_pred)
cb_roc = roc_auc_score(y_test, cb_pred)
print("The accuracy of CatBoost is: %0.2f" %cb_score)
print("The roc_auc score of CatBoost is: %0.2f" %cb_roc)

The accuracy of CatBoost is: 0.93
The roc_auc score of CatBoost is: 0.94


In [88]:
from sklearn.metrics import classification_report
cr2 = classification_report(y_test, cb_pred, output_dict=True)
print(cr2)
report_2 = pd.DataFrame(cr2).transpose().head()
report_2.to_csv(f'report/{dataset}_report_cb.csv')

{'0': {'precision': 0.9695387293298521, 'recall': 0.9191419141914191, 'f1-score': 0.9436679373146971, 'support': 2424}, '1': {'precision': 0.8849089841456254, 'recall': 0.9556119213696893, 'f1-score': 0.9189024390243902, 'support': 1577}, 'accuracy': 0.9335166208447888, 'macro avg': {'precision': 0.9272238567377387, 'recall': 0.9373769177805542, 'f1-score': 0.9312851881695436, 'support': 4001}, 'weighted avg': {'precision': 0.9361817915254218, 'recall': 0.9335166208447888, 'f1-score': 0.933906579953084, 'support': 4001}}


In [89]:
cb.save_model(f'model/{dataset}_cb.bin')