## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

## Load preprocessed dataset

In [2]:
data = pd.read_csv("PreprocessedRealFake.csv")
data = data[(~data["text"].isna())]

## Split data into test and train

In [3]:
# Split the data
X_train,X_test,y_train,y_test = train_test_split(data['text'], data.target, test_size=0.3, random_state=42)

## Support Vector Machines (SVM)

In [4]:
sv = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', svm.SVC(kernel='linear',
                                 C=1.0,
                                 max_iter=100))])

sv_model = sv.fit(X_train, y_train)

sv_pred = sv_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, sv_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, sv_pred, labels=["real", "fake"]))

Accuracy:  73.33%
Classification Report:

               precision    recall  f1-score   support

        real       0.74      0.73      0.74     16006
        fake       0.73      0.74      0.73     15489

    accuracy                           0.73     31495
   macro avg       0.73      0.73      0.73     31495
weighted avg       0.73      0.73      0.73     31495



## Naive Bayes

In [5]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', MultinomialNB(alpha=1.0,
                                       fit_prior=True,
                                       class_prior=None))])

nb_model = nb.fit(X_train, y_train)

nb_pred = nb_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, nb_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, nb_pred, labels=["real", "fake"]))

Accuracy:  86.66%
Classification Report:

               precision    recall  f1-score   support

        real       0.91      0.81      0.86     16006
        fake       0.83      0.92      0.87     15489

    accuracy                           0.87     31495
   macro avg       0.87      0.87      0.87     31495
weighted avg       0.87      0.87      0.87     31495



## Logistic Regression

In [6]:
lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', LogisticRegression(penalty='l2',
                                            C=1,
                                            solver='lbfgs',
                                            max_iter=1000,
                                            class_weight='balanced'))])

lr_model = lr.fit(X_train, y_train)

lr_pred = lr_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, lr_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, lr_pred, labels=["real", "fake"]))

Accuracy:  95.0%
Classification Report:

               precision    recall  f1-score   support

        real       0.94      0.96      0.95     16006
        fake       0.96      0.94      0.95     15489

    accuracy                           0.95     31495
   macro avg       0.95      0.95      0.95     31495
weighted avg       0.95      0.95      0.95     31495



## K-Nearest Neighbors (KNN)

In [7]:
kn = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('model', KNeighborsClassifier(n_neighbors=5))])

kn_model = kn.fit(X_train, y_train)

kn_pred = kn_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, kn_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, kn_pred, labels=["real", "fake"]))

Accuracy:  80.35%
Classification Report:

               precision    recall  f1-score   support

        real       0.89      0.70      0.78     16006
        fake       0.75      0.91      0.82     15489

    accuracy                           0.80     31495
   macro avg       0.82      0.81      0.80     31495
weighted avg       0.82      0.80      0.80     31495



## Decision Tree

In [8]:
dtc = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', DecisionTreeClassifier(criterion= 'gini',
                                           max_depth = 100, 
                                           splitter='best', 
                                           random_state=42))])
dtc_model = dtc.fit(X_train, y_train)

dtc_pred = dtc_model.predict(X_test)

print("Accuracy: ", round(accuracy_score(y_test, dtc_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, dtc_pred, labels=["real", "fake"]))

Accuracy:  93.98%
Classification Report:

               precision    recall  f1-score   support

        real       0.93      0.96      0.94     16006
        fake       0.95      0.92      0.94     15489

    accuracy                           0.94     31495
   macro avg       0.94      0.94      0.94     31495
weighted avg       0.94      0.94      0.94     31495



## Random Forest

In [9]:
rfc = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', RandomForestClassifier(n_estimators=100,
                                                  criterion="gini"))])

rfc_model = rfc.fit(X_train, y_train)
rfc_pred = rfc_model.predict(X_test)
print("accuracy:", round(accuracy_score(y_test, rfc_pred)*100,2), "\b%")
print("Classification Report:\n\n", classification_report(y_test, rfc_pred, labels=["real", "fake"]))

accuracy: 96.06%
Classification Report:

               precision    recall  f1-score   support

        real       0.96      0.97      0.96     16006
        fake       0.96      0.96      0.96     15489

    accuracy                           0.96     31495
   macro avg       0.96      0.96      0.96     31495
weighted avg       0.96      0.96      0.96     31495



## Store the model names and their accuracy in a dataframe

In [16]:
res = []
res = pd.DataFrame(res)
res["models"] = ["SVM", "Naive Bayes", "Logistic Regression", "KNN", "Decision Tree", "Random Forest"]
res["accuracy"] = [round(accuracy_score(y_test, sv_pred)*100,2), round(accuracy_score(y_test, nb_pred)*100,2), round(accuracy_score(y_test, lr_pred)*100,2), round(accuracy_score(y_test, kn_pred)*100,2), round(accuracy_score(y_test, dtc_pred)*100,2), round(accuracy_score(y_test, rfc_pred)*100,2)]
res = res.sort_values("accuracy", ascending=False).reset_index(drop=True)
res.to_csv('model_results.csv', index=False)
res

Unnamed: 0,models,accuracy
0,Random Forest,96.06
1,Logistic Regression,95.0
2,Decision Tree,93.98
3,Naive Bayes,86.66
4,KNN,80.35
5,SVM,73.33


## Save the traied models locally

In [15]:
import joblib

joblib.dump(sv_model, 'Trained Models/svm_model.pkl')
joblib.dump(nb_model, 'Trained Models/nb_model.pkl')
joblib.dump(lr_model, 'Trained Models/lr_model.pkl')
joblib.dump(kn_model, 'Trained Models/kn_model.pkl')
joblib.dump(dtc_model, 'Trained Models/dtc_model.pkl')
joblib.dump(rfc_model, 'Trained Models/rfc_model.pkl')

['Trained Models/rfc_model.pkl']