In [1]:
import pandas as pd
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# CSV of news articles with a Fake/Real label, fetched straight from GitHub.
DATA_URL = r"https://raw.githubusercontent.com/sriku2412/Personal-projects/main/fake_and_real_news_model/fake_and_real_news.csv"
data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [33]:
# (rows, columns) of the loaded frame
data.shape

(9900, 2)

In [34]:
# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Resources used by the cells below: the stopword list and the Punkt tokenizer.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# fetching both keeps tokenization working across NLTK versions.
nltk.download('punkt_tab')

# Encode the string classes as integers. The mapping has its own name instead
# of `dict`, which would shadow the builtin for every later cell.
label_map = {'Fake': 0, 'Real': 1}
data['label'] = data['label'].map(label_map)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srika\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\srika\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# Tokenize each article into a list of word/punctuation tokens.
# (This cell only tokenizes; stopwords are filtered out in `preprocessing`.)
from nltk.tokenize import word_tokenize
tokenized_text = data['Text'].apply(word_tokenize)
data['Text'] = tokenized_text

In [36]:
def preprocessing(text):
    """Normalise one tokenised document.

    Parameters
    ----------
    text : iterable of str
        The tokens of a single document.

    Returns
    -------
    list of str
        The tokens, in their original order, after lower-casing, dropping
        every token that is not purely alphabetic (punctuation, numbers,
        mixed tokens), dropping English stopwords, and reducing what is left
        to its English Snowball stem.
    """
    # Loop-invariant work is done once per call: the stopword list is loaded
    # a single time (as a set, for constant-time membership tests) and one
    # stemmer instance is reused for every token.
    english_stopwords = set(stopwords.words('english'))
    stemmer = SnowballStemmer("english")

    # converting to lower case
    lowered = (word.lower() for word in text)
    # removing punctuations / non-alphabetic tokens and stopwords
    kept = (
        word for word in lowered
        if word.isalpha() and word not in english_stopwords
    )
    # stemming (Snowball is a stemmer, not a lemmatizer)
    return [stemmer.stem(word) for word in kept]

# Replace every token list with its cleaned, stemmed counterpart.
cleaned_tokens = data['Text'].apply(preprocessing)
data['Text'] = cleaned_tokens
data.head()

Unnamed: 0,Text,label
0,"[top, trump, surrog, brutal, stab, back, pathe...",0
1,"[conserv, leader, optimist, common, ground, he...",1
2,"[trump, propos, tax, overhaul, stir, concern, ...",1
3,"[court, forc, ohio, allow, million, illeg, pur...",0
4,"[democrat, say, trump, agre, work, immigr, bil...",1


In [37]:
# Collapse each token list back into a single space-separated string.
data['Text'] = data['Text'].apply(' '.join)
data.head()

Unnamed: 0,Text,label
0,top trump surrog brutal stab back pathet video...,0
1,conserv leader optimist common ground healthca...,1
2,trump propos tax overhaul stir concern deficit...,1
3,court forc ohio allow million illeg purg voter...,0
4,democrat say trump agre work immigr bill wall ...,1


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights from the cleaned articles.
tfidf = TfidfVectorizer()
corpus = data['Text']
tfidf.fit(corpus)

In [39]:
# Dense TF-IDF matrix as a DataFrame, one column per vocabulary term.
tfidf_matrix = tfidf.transform(data['Text'])
data_tf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
)

Unnamed: 0,aaa,aaaaackkk,aaaarrgh,aaccord,aaf,aai,aaja,aalbert,aaldef,aaliyah,...,zyklon,zypri,zyri,zz,zzbluecomet,zztain,zzzzaaaacccchhh,zzzzzzzzzzzzz,émigré,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [52]:
# Make sure `data_tf` has no 'label' column before the target column is
# attached to it. On a fresh kernel `data_tf` only holds TF-IDF vocabulary
# columns, so a 'label' column exists only if the token "label" is in the
# vocabulary; errors='ignore' keeps this cell from raising KeyError when it is
# not. Assigning the result (instead of inplace=True) keeps the cell safe to
# re-run.
data_tf = data_tf.drop(columns=['label'], errors='ignore')
data_tf.head(1)

Unnamed: 0,aaa,aaaaackkk,aaaarrgh,aaccord,aaf,aai,aaja,aalbert,aaldef,aaliyah,...,zuppello,zyklon,zypri,zyri,zz,zzbluecomet,zztain,zzzzaaaacccchhh,zzzzzzzzzzzzz,émigré
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Attach the target column to the TF-IDF features (column-wise concat).
label_column = data['label']
data = pd.concat([data_tf, label_column], axis="columns")
data.head(1)

Unnamed: 0,aaa,aaaaackkk,aaaarrgh,aaccord,aaf,aai,aaja,aalbert,aaldef,aaliyah,...,zyklon,zypri,zyri,zz,zzbluecomet,zztain,zzzzaaaacccchhh,zzzzzzzzzzzzz,émigré,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fake


In [59]:
from sklearn.model_selection import train_test_split
features = data.drop(columns=['label'])
target = data['label']

# Hold out 20% of the rows for testing, then 20% of the remainder for
# validation; both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [61]:
# naive bayes
from sklearn.naive_bayes import MultinomialNB
# Baseline multinomial Naive Bayes with default hyperparameters.
nb = MultinomialNB().fit(X_train, y_train)
# Mean accuracy of the baseline on the test split.
nb.score(X_test, y_test)

0.9555555555555556

In [62]:
# cross validation
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated scores of the baseline on the training split.
cross_val_score(estimator=nb, X=X_train, y=y_train, cv=5)

array([0.95820189, 0.95974743, 0.95264404, 0.95895817, 0.95264404])

In [65]:
# hyperparameters tuning 
from sklearn.model_selection import GridSearchCV
# Candidate values for smoothing strength and class-prior handling.
grid = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
    'fit_prior': [True, False],
    'class_prior': [None, [0.5, 0.5]],
}

# Exhaustive 5-fold search over every combination in `grid`.
model = GridSearchCV(estimator=nb, param_grid=grid, cv=5)
model.fit(X_train, y_train)
# Score of the fitted search object on the validation split.
model.score(X_val, y_val)

0.9558080808080808

In [66]:
# Hyperparameter combination selected by the grid search
model.best_params_

{'alpha': 1, 'class_prior': None, 'fit_prior': True}

In [67]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predictions of the tuned model on the test split.
y_pred = model.predict(X_test)
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[922,  51],
       [ 37, 970]], dtype=int64)

In [70]:
# Per-class precision, recall and F1 for the test-split predictions.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       973
           1       0.95      0.96      0.96      1007

    accuracy                           0.96      1980
   macro avg       0.96      0.96      0.96      1980
weighted avg       0.96      0.96      0.96      1980



In [69]:
# Fraction of test-split predictions that match the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9555555555555556