In [1]:
from pycaret.classification import *

In [2]:
import pandas as pd

real_news = pd.read_csv('real_data.csv')
fake_news = pd.read_csv('fake_data.csv')

Preprocessing

In [3]:
real_news['label'] = 0
fake_news['label'] = 1

In [4]:
news = pd.concat([real_news, fake_news], ignore_index=True)

In [5]:
news.shape

(44898, 778)

In [6]:
news.head()

Unnamed: 0,NOUN,VERB,ADJ,ADV,PRON,PER,ORG,LOC,token_count,emb_0,...,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767,label
0,167,104,61,27,38,9,42,15,882,-0.344786,...,0.96233,1.493958,0.028553,-0.561609,0.039556,0.247863,0.784136,1.055186,-0.011686,0
1,151,103,38,14,25,12,26,11,711,-0.472813,...,0.951301,1.549707,-0.046531,-0.323878,0.079172,0.105633,1.009901,1.334306,0.098103,0
2,89,60,29,6,38,16,24,18,519,-0.352055,...,0.897749,1.369763,0.117088,-0.410743,0.112445,0.203302,1.009904,1.014667,0.049866,0
3,74,47,26,11,18,16,16,30,432,-0.482031,...,1.108111,1.561908,-0.089562,-0.531489,0.287689,0.18643,0.743534,1.143231,0.109753,0
4,227,83,67,33,40,11,64,23,990,-0.395008,...,0.979328,1.472806,0.005544,-0.485526,0.037237,0.079488,0.954526,1.236557,0.041941,0


In [7]:
news.dropna(inplace=True)

In [8]:
duplicates = news[news.duplicated(keep=False)]

In [9]:
duplicates['label'].value_counts()

label
1    5166
0     237
Name: count, dtype: int64

In [10]:
data = news.drop_duplicates(keep=False)

In [11]:
data.shape

(39495, 778)

Balance Dataset

In [12]:
from sklearn.utils import resample

In [13]:
target0 = data[data['label']==0]
target1 = data[data['label']==1]

In [14]:
print(target0.shape, target1.shape)

(21180, 778) (18315, 778)


In [15]:
target0 = resample(target0, replace=True, n_samples=len(target1), random_state=42)

In [16]:
print(target0.shape, target1.shape)

(18315, 778) (18315, 778)


In [17]:
balanced_data = pd.concat([target0, target1])
balanced_data['label'].value_counts()

label
0    18315
1    18315
Name: count, dtype: int64

Normalizer

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()

In [19]:
X = balanced_data.drop('label', axis=1)
y = balanced_data['label']

In [20]:
print(X.shape, y.shape)

(36630, 777) (36630,)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

In [22]:
X_train_scaled = normalizer.fit_transform(X_train)
X_test_scaled = normalizer.transform(X_test)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

Train Model

In [26]:
train_data = pd.concat([X_train_scaled, y_train], axis=1)
test_data = pd.concat([X_test_scaled, y_test], axis=1)

In [27]:
print(train_data.shape, test_data.shape)

(29304, 778) (7326, 778)


In [28]:
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [29]:
train_data.head()

Unnamed: 0,NOUN,VERB,ADJ,ADV,PRON,PER,ORG,LOC,token_count,emb_0,...,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767,label
0,0.20384,0.12544,0.03136,0.03136,0.09408,0.0,0.01568,0.01568,0.925118,0.012406,...,-0.017232,-0.017918,-0.003073,0.001308,0.001871,-0.009364,-0.004818,-0.011782,-0.003567,1
1,0.15937,0.113836,0.056918,0.034151,0.018973,0.045534,0.072096,0.037945,0.971401,0.003312,...,-0.006045,-0.00573,-0.000492,0.00054,-0.000797,-0.002718,-0.004061,-0.00382,0.00032,1
2,0.133885,0.164313,0.033471,0.024343,0.106499,0.027386,0.027386,0.039557,0.967622,-0.002166,...,0.002536,0.004577,-0.001113,-0.002376,0.003084,-0.001353,0.001788,0.004076,-0.00033,1
3,0.175322,0.112879,0.033623,0.02882,0.02882,0.045632,0.052837,0.031222,0.972679,-0.000748,...,0.00252,0.003927,-0.000256,-0.001537,0.000592,-8e-05,0.001324,0.002279,0.000457,0
4,0.230981,0.11676,0.027921,0.01523,0.055842,0.005077,0.025383,0.027921,0.961999,-0.001562,...,0.002528,0.003674,-0.000211,-0.001029,0.000718,0.000335,0.002034,0.003893,0.000451,0


In [34]:
experiment = ClassificationExperiment()
experiment.setup(
    data=train_data,
    target='label',
    session_id=123,
    preprocess=False,
    normalize=False,
    transformation=False,
    feature_selection=False,
    remove_outliers=False,
    fix_imbalance=False,
    index=False
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,label
2,Target type,Binary
3,Original data shape,"(29304, 778)"
4,Transformed data shape,"(29304, 778)"
5,Transformed train set shape,"(20512, 778)"
6,Transformed test set shape,"(8792, 778)"
7,Numeric features,777


<pycaret.classification.oop.ClassificationExperiment at 0x269cd2072e0>

In [35]:
best_model = experiment.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9972,0.9998,0.9969,0.9975,0.9972,0.9943,0.9943,2.215
gbc,Gradient Boosting Classifier,0.9969,0.9996,0.9969,0.997,0.9969,0.9939,0.9939,44.386
rf,Random Forest Classifier,0.9966,0.9995,0.9965,0.9967,0.9966,0.9932,0.9932,2.512
ada,Ada Boost Classifier,0.9963,0.9997,0.996,0.9966,0.9963,0.9926,0.9926,8.025
et,Extra Trees Classifier,0.9962,0.9996,0.9961,0.9964,0.9962,0.9925,0.9925,0.541
dt,Decision Tree Classifier,0.994,0.994,0.9941,0.9938,0.994,0.9879,0.9879,1.836
knn,K Neighbors Classifier,0.9932,0.9973,0.9951,0.9913,0.9932,0.9863,0.9864,0.628
svm,SVM - Linear Kernel,0.9904,0.9987,0.9962,0.9849,0.9905,0.9809,0.981,0.078
lr,Logistic Regression,0.9903,0.9986,0.9938,0.987,0.9904,0.9807,0.9807,0.201
qda,Quadratic Discriminant Analysis,0.9879,0.9979,0.9845,0.9912,0.9878,0.9757,0.9758,0.913


In [39]:
experiment.finalize_model(best_model)

In [40]:
experiment.save_model(best_model, 'lightgbm_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('placeholder', None),
                 ('trained_model',
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20, min_child_weight=0.001,
                                 min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                                 num_leaves=31, objective=None, random_state=123,
                                 reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
                                 subsample_for_bin=200000, subsample_freq=0))],
          verbose=False),
 'lightgbm_model.pkl')