In [66]:
!pip install opendatasets --quiet

In [67]:
import opendatasets as od

In [68]:
od.download('https://www.kaggle.com/datasets/bhavikjikadara/fake-news-detection')

Skipping, found downloaded files in ".\fake-news-detection" (use force=True to force download)


In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.model_selection import train_test_split

In [70]:
df = pd.read_csv('fake-news-detection/true.csv')
dff = pd.read_csv('fake-news-detection/fake.csv')

In [71]:
df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [72]:
df.shape

(21417, 4)

In [73]:
df.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [75]:
df['date'].dtype

dtype('O')

In [76]:
df['date'] = pd.to_datetime(df['date'], errors = 'coerce')

In [77]:
dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [78]:
dff.describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [79]:
dff['subject'].value_counts()

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: count, dtype: int64

In [80]:
df['subject'].value_counts()

subject
politicsNews    11272
worldnews       10145
Name: count, dtype: int64

In [81]:
df.isna().value_counts()

title  text   subject  date 
False  False  False    False    21417
Name: count, dtype: int64

In [82]:
dff.isna().value_counts()

title  text   subject  date 
False  False  False    False    23481
Name: count, dtype: int64

In [83]:
df.shape

(21417, 4)

In [84]:
dff.shape

(23481, 4)

In [85]:
df.drop(['date','subject'] , inplace = True ,axis = 1)
dff.drop(['date','subject'] , inplace = True , axis =1)

In [86]:
df['label'] = 1
dff['label'] = 0

In [87]:
def clean_text(text):

    if isinstance(text,str):
        text = text.lower()
        text = text.translate(str.maketrans('', '', string.punctuation))
        return text
    return text

In [88]:
df['text'] = df['text'].apply(clean_text)
dff['text'] = dff['text'].apply(clean_text)

In [89]:
df['text'].head()

0    washington reuters  the head of a conservative...
1    washington reuters  transgender people will be...
2    washington reuters  the special counsel invest...
3    washington reuters  trump campaign adviser geo...
4    seattlewashington reuters  president donald tr...
Name: text, dtype: object

In [90]:
dff['text'].head()

0    donald trump just couldn t wish all americans ...
1    house intelligence committee chairman devin nu...
2    on friday it was revealed that former milwauke...
3    on christmas day donald trump announced that h...
4    pope francis used his annual christmas day mes...
Name: text, dtype: object

In [91]:
help(str.translate)

Help on method_descriptor:

translate(self, table, /) unbound builtins.str method
    Replace each character in the string using the given translation table.

      table
        Translation table, which must be a mapping of Unicode ordinals to
        Unicode ordinals, strings, or None.

    The table must implement lookup/indexing via __getitem__, for instance a
    dictionary or list.  If this operation raises LookupError, the character is
    left untouched.  Characters mapped to None are deleted.



In [92]:
df_merged = pd.concat([df,dff],axis =0)
df_merged = df_merged.reset_index(drop = True)

In [93]:
df_merged.shape

(44898, 3)

In [94]:
df_merged['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [95]:
df_merged.columns.value_counts()

title    1
text     1
label    1
Name: count, dtype: int64

In [96]:
X = df_merged['text']
y = df_merged['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3 , random_state = 42, stratify = y)

In [97]:
print(y_train.value_counts())
print(y_test.value_counts())

label
0    16436
1    14992
Name: count, dtype: int64
label
0    7045
1    6425
Name: count, dtype: int64


***TF-IDF***

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [99]:
#df_merged['text'] =  
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [100]:
vectorizer.get_feature_names_out()

array(['00', '000', '0000', ..., 'zzzzaaaacccchhh', 'zzzzzzzz', 'émigré'],
      dtype=object)

Its important to have the same number of columns in the tranformed test and train datasets

In [101]:
X_train_tfidf.shape

(31428, 188376)

In [102]:
X_test_tfidf.shape

(13470, 188376)

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score
model = LogisticRegression()
model.fit(X_train_tfidf,y_train)
y_pred = model.predict(X_test_tfidf)

print('F1 score : ', f1_score(y_pred,y_test))
print('Precision score : ', precision_score(y_pred,y_test))
print('Recall score : ', recall_score(y_pred,y_test))

F1 score :  0.9857996430511368
Precision score :  0.9886381322957198
Recall score :  0.982977406375735
