In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("Liar_Dataset.csv")
df.head()

Unnamed: 0,[ID].json,label,statement,subject(s),speaker,speaker's job title,state info,party affiliation,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,11972.json,TRUE,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview
1,11685.json,FALSE,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference
2,11096.json,FALSE,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video


# Text Cleaning

###Making statement text in lower case

In [4]:
df['statement']=df['statement'].str.lower()
df['statement'].tail()

12782    for the first time in more than a decade, impo...
12783    says donald trump has bankrupted his companies...
12784    john mccain and george bush have "absolutely n...
12785    a new poll shows 62 percent support the presid...
12786    no one claims the report vindicating new jerse...
Name: statement, dtype: object

### Cleaning and removing Stop words of english

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Rami
[nltk_data]     Alrwais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"a, about, above, after, again, against, ain, all, am, an, and, any, are, aren, aren't, as, at, be, because, been, before, being, below, between, both, but, by, can, couldn, couldn't, d, did, didn, didn't, do, does, doesn, doesn't, doing, don, don't, down, during, each, few, for, from, further, had, hadn, hadn't, has, hasn, hasn't, have, haven, haven't, having, he, he'd, he'll, her, here, hers, herself, he's, him, himself, his, how, i, i'd, if, i'll, i'm, in, into, is, isn, isn't, it, it'd, it'll, it's, its, itself, i've, just, ll, m, ma, me, mightn, mightn't, more, most, mustn, mustn't, my, myself, needn, needn't, no, nor, not, now, o, of, off, on, once, only, or, other, our, ours, ourselves, out, over, own, re, s, same, shan, shan't, she, she'd, she'll, she's, should, shouldn, shouldn't, should've, so, some, such, t, than, that, that'll, the, their, theirs, them, themselves, then, there, these, they, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, 

In [8]:
stopwords_list = stopwords.words('english')

Cleaning and removing the above stop words list from the statement of news

In [9]:
STOPWORDS = set(stopwords.words('english'))
def cleaning_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
df["statement"] = df["statement"].apply(lambda text: cleaning_stopwords(text))
df["statement"].head()

0    building wall u.s.-mexico border take literall...
1           wisconsin pace double number layoffs year.
2             says john mccain done nothing help vets.
3    suzanne bonamici supports plan cut choice medi...
4    asked reporter whether hes center criminal sch...
Name: statement, dtype: object

### Cleaning and removing punctuations

In [10]:
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [11]:
df["statement"] = df["statement"].apply(lambda x: cleaning_punctuations(x))
df["statement"].tail()

12782    first time decade imports accounted less half ...
12783    says donald trump bankrupted companies once tw...
12784    john mccain george bush absolutely plan univer...
12785    new poll shows 62 percent support presidents p...
12786    one claims report vindicating new jersey gov c...
Name: statement, dtype: object

### Cleaning and removing repeating characters

In [12]:
import re

In [13]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

In [14]:
df["statement"] = df["statement"].apply(lambda x: cleaning_repeating_char(x))
df["statement"].tail()

12782    first time decade imports acounted les half th...
12783    says donald trump bankrupted companies once tw...
12784    john mcain george bush absolutely plan univers...
12785    new pol shows 62 percent suport presidents pla...
12786    one claims report vindicating new jersey gov c...
Name: statement, dtype: object

### Cleaning and removing email

In [15]:
def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

In [16]:
df["statement"] = df["statement"].apply(lambda x: cleaning_email(x))
df["statement"].tail()

12782    first time decade imports acounted les half th...
12783    says donald trump bankrupted companies once tw...
12784    john mcain george bush absolutely plan univers...
12785    new pol shows 62 percent suport presidents pla...
12786    one claims report vindicating new jersey gov c...
Name: statement, dtype: object

### Getting tokenization of news statement text

In [17]:
from nltk.tokenize import RegexpTokenizer

In [18]:
tokenizer = RegexpTokenizer(r'\w+')
df["statement"] = df["statement"].apply(tokenizer.tokenize)
df["statement"].head()

0    [building, wal, usmexico, border, take, litera...
1      [wisconsin, pace, double, number, layofs, year]
2       [says, john, mcain, done, nothing, help, vets]
3    [suzane, bonamici, suports, plan, cut, choice,...
4    [asked, reporter, whether, hes, center, crimin...
Name: statement, dtype: object

### Applying Stemming

In [19]:
st = nltk.PorterStemmer()
def stemming_on_text(data):
    text = [st.stem(word) for word in data]
    return data

df["statement"] = df["statement"].apply(lambda x: stemming_on_text(x))
df["statement"].head()

0    [building, wal, usmexico, border, take, litera...
1      [wisconsin, pace, double, number, layofs, year]
2       [says, john, mcain, done, nothing, help, vets]
3    [suzane, bonamici, suports, plan, cut, choice,...
4    [asked, reporter, whether, hes, center, crimin...
Name: statement, dtype: object

### Applying Lemmatizer

In [20]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Rami
[nltk_data]     Alrwais\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
lm = nltk.WordNetLemmatizer()
def lemmatizer_on_text(data):
    text = [lm.lemmatize(word) for word in data]
    return data

df["statement"] = df["statement"].apply(lambda x: lemmatizer_on_text(x))
df["statement"].head()

0    [building, wal, usmexico, border, take, litera...
1      [wisconsin, pace, double, number, layofs, year]
2       [says, john, mcain, done, nothing, help, vets]
3    [suzane, bonamici, suports, plan, cut, choice,...
4    [asked, reporter, whether, hes, center, crimin...
Name: statement, dtype: object

### Features extraction from the "Statement of the news"

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(3, 3),
    max_features =5000)

Get_Vec= word_vectorizer.fit_transform(df['statement'].astype('str'))
Get_Vec= Get_Vec.toarray()

vocab1 = word_vectorizer.get_feature_names_out()
Features_vect=pd.DataFrame(np.round(Get_Vec, 1), columns=vocab1)
Features_vect.head()

Unnamed: 0,0 0 0,0 0 1,0 0 campaign,0 0 coments,0 0 confrontation,0 0 facebok,0 0 news,0 0 pres,0 0 spech,0 0 television,...,zin antiamerican marxist,zip code entire,zipo zero colective,zo own without,zombie apocalypse plan,zombies curent us,zone city atlanta,zone live within,zone saved lives,zone texas state
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Drop Statement Colunm

### Encodings of label, speaker, state info

In [24]:
x = pd.Categorical(df['label'])
df['label']=x.codes


In [25]:
import matplotlib
import seaborn as sns

In [26]:
import matplotlib.pyplot as plt
import seaborn as sns

### K fold cross validation and splitting the data five times

In [27]:
from sklearn.model_selection import KFold

In [28]:
kf = KFold(n_splits=5)
i=0
for train, test in kf.split(df):
    i=i+1
    print("KFold Split ",i )
    print("%s %s" % (train, test))
    print(' \n')

KFold Split  1
[ 2558  2559  2560 ... 12784 12785 12786] [   0    1    2 ... 2555 2556 2557]
 

KFold Split  2
[    0     1     2 ... 12784 12785 12786] [2558 2559 2560 ... 5113 5114 5115]
 

KFold Split  3
[    0     1     2 ... 12784 12785 12786] [5116 5117 5118 ... 7670 7671 7672]
 

KFold Split  4
[    0     1     2 ... 12784 12785 12786] [ 7673  7674  7675 ... 10227 10228 10229]
 

KFold Split  5
[    0     1     2 ... 10227 10228 10229] [10230 10231 10232 ... 12784 12785 12786]
 



# Model Training

# Random Forest

In [None]:
from sklearn import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score

ModuleNotFoundError: No module named 'cuml'

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

X = Features_vect.values
y = df['label'].values

kf = KFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

model_accuracies = {name: [] for name in models}

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    print(f"\n📂 Fold {fold}")

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        model_accuracies[name].append(acc)
        print(f"{name} Accuracy: {acc:.4f}")

print("\n📊 Average Accuracy per Model:")
for name, accs in model_accuracies.items():
    print(f"{name}: {np.mean(accs):.4f}")




📂 Fold 1
Logistic Regression Accuracy: 0.2170
KNN Accuracy: 0.1654

📂 Fold 2
Logistic Regression Accuracy: 0.2248
KNN Accuracy: 0.2017

📂 Fold 3
Logistic Regression Accuracy: 0.2221
KNN Accuracy: 0.1971

📂 Fold 4
Logistic Regression Accuracy: 0.2210
KNN Accuracy: 0.1819

📂 Fold 5
Logistic Regression Accuracy: 0.1971
KNN Accuracy: 0.1709

📊 Average Accuracy per Model:
Logistic Regression: 0.2164
KNN: 0.1834


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

X = Features_vect.values          # features
y = df['label'].values                    # your target column

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold = 0
accuracies = []

for train_index, test_index in kf.split(X):
    fold += 1
    print("KFold Split:", fold)

    # Split data
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create and train model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Accuracy
    acc = accuracy_score(y_test, y_pred)
    accuracies.append(acc)
    print("Accuracy on Fold", fold, ":", acc)
    print()

print("Mean Accuracy across folds:", np.mean(accuracies))


KFold Split: 1


KeyboardInterrupt: 