In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-news-classification/WELFake_Dataset.csv


In [2]:
import re
from sklearn.metrics import accuracy_score,classification_report

In [3]:
df = pd.read_csv("/kaggle/input/fake-news-classification/WELFake_Dataset.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [6]:
df.isna().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


### The number of rows with missing values is very small compared to the total number of rows, so we can drop every row that contains an NA

In [8]:
df.dropna(axis=0,inplace=True)

In [9]:
df.shape

(71537, 4)

### We will remove the redundant index column

In [10]:
df.drop(["Unnamed: 0"],axis=1,inplace=True)

In [11]:
df.reset_index(drop=True,inplace=True)

In [12]:
df.head()

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1


In [13]:
# Remove exact duplicate rows, then rebuild the index: the earlier reset_index
# ran before this step, so dropping rows here would otherwise leave gaps in
# the row labels. Rebinding (instead of inplace=True) keeps the cell chainable.
df = df.drop_duplicates().reset_index(drop=True)
df.shape

(63121, 3)

### Let's start NLP preprocessing

In [14]:
def clean_text(text):
    """Normalise a raw string for the bag-of-words pipeline.

    Steps, in order: lowercase, strip http(s) URLs, strip digit runs, strip
    every character that is neither a word character nor whitespace, then
    collapse whitespace runs to single spaces and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw title or article body. ``None`` and float NaN (how pandas
        represents a missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = text.lower()
    # URLs go first, while ':' and '/' are still present to match on.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\d+', '', text)
    # Punctuation removal; '_' survives because \w matches it.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


In [15]:
# Add cleaned copies of both text columns; the raw columns are left in place.
df['clean_title'] = df['title'].map(clean_text)
df['clean_text'] = df['text'].map(clean_text)

In [16]:
df.head()

Unnamed: 0,title,text,label,clean_title,clean_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,law enforcement on high alert following threat...,no comment is expected from barack obama membe...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,unbelievable obamas attorney general says most...,now most of the demonstrators gathered last ni...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,bobby jindal raised hindu uses story of christ...,a dozen politically active pastors came here f...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,satan russia unvelis an image of its terrifyin...,the rs sarmat missile dubbed satan will replac...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,about time christian group sues amazon and spl...,all we can say on this one is it s about time ...


In [17]:
df.drop(["title", "text"], axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,label,clean_title,clean_text
0,1,law enforcement on high alert following threat...,no comment is expected from barack obama membe...
1,1,unbelievable obamas attorney general says most...,now most of the demonstrators gathered last ni...
2,0,bobby jindal raised hindu uses story of christ...,a dozen politically active pastors came here f...
3,1,satan russia unvelis an image of its terrifyin...,the rs sarmat missile dubbed satan will replac...
4,1,about time christian group sues amazon and spl...,all we can say on this one is it s about time ...


### Tokenize the columns

In [19]:
import nltk
from nltk.tokenize import word_tokenize

In [20]:
# Replace each cleaned string with its list of NLTK word tokens.
df['clean_title'] = df['clean_title'].map(word_tokenize)
df['clean_text'] = df['clean_text'].map(word_tokenize)

In [21]:
from nltk.corpus import stopwords
all_stopwords = stopwords.words("english")

### Stopwords Removal

In [22]:
# A set gives constant-time membership tests; scanning the stopword list once
# per token is needlessly slow across the whole corpus.
stopword_set = set(all_stopwords)
df['clean_title'] = df['clean_title'].apply(lambda tokens: [tok for tok in tokens if tok not in stopword_set])
df['clean_text'] = df['clean_text'].apply(lambda tokens: [tok for tok in tokens if tok not in stopword_set])

In [23]:
df.head()

Unnamed: 0,label,clean_title,clean_text
0,1,"[law, enforcement, high, alert, following, thr...","[comment, expected, barack, obama, members, fy..."
1,1,"[unbelievable, obamas, attorney, general, says...","[demonstrators, gathered, last, night, exercis..."
2,0,"[bobby, jindal, raised, hindu, uses, story, ch...","[dozen, politically, active, pastors, came, pr..."
3,1,"[satan, russia, unvelis, image, terrifying, ne...","[rs, sarmat, missile, dubbed, satan, replace, ..."
4,1,"[time, christian, group, sues, amazon, splc, d...","[say, one, time, someone, sued, southern, pove..."


### Let us apply a stemmer

In [24]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [25]:
# Reduce every token to its Porter stem, one column at a time.
df['clean_text'] = df['clean_text'].map(lambda tokens: list(map(stemmer.stem, tokens)))
df['clean_title'] = df['clean_title'].map(lambda tokens: list(map(stemmer.stem, tokens)))

### Expand contractions

In [26]:
!pip install contractions pyspellchecker

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting pyspellchecker
  Downloading pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading pyspellchecker-0.8.3-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90

In [67]:
import contractions
# Run contractions.fix on every token of both columns.
# NOTE(review): by this point clean_text has already removed apostrophes (its
# [^\w\s] rule) and the tokens have been stemmed, so apostrophe-based
# contractions such as "don't" can no longer appear here. Expanding
# contractions on the raw text, before cleaning, is probably what was
# intended — confirm.
df['clean_text']=df['clean_text'].apply(lambda x: [contractions.fix(d) for d in x])
df['clean_title']=df['clean_title'].apply(lambda x: [contractions.fix(d) for d in x])

### Let's split the data into train and test sets

In [68]:
# Join each token list back into a space-separated string, then concatenate
# title and body into the single field the vectorizer will consume.
title_joined = df['clean_title'].map(' '.join)
body_joined = df['clean_text'].map(' '.join)
df['combined_text'] = title_joined + ' ' + body_joined

In [69]:
# X stays 2-D (one text column) because later cells select it with X_train[:, 0].
X = df[["combined_text"]].values
# y must be 1-D: passing a column vector is what made the fit() calls emit the
# "y = column_or_1d(y, warn=True)" warning.
y = df['label'].values

In [70]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=22)

### Let us apply the TF-IDF vectorizer

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:
# Cap the vocabulary at 10,000 terms to bound the size of the feature matrix.
vect = TfidfVectorizer(max_features=10000)

In [73]:
# Fit the vectorizer on the training texts only ([:,0] picks the single text
# column of the 2-D array), so nothing from the test set influences the
# learned vocabulary or weights.
X_train_tfidf = vect.fit_transform(X_train[:,0])
# The test texts are only transformed with the already-fitted vectorizer.
X_test_tfidf = vect.transform(X_test[:,0])

In [74]:
from sklearn.linear_model import LogisticRegression

In [75]:
logModel = LogisticRegression()

In [76]:
logModel.fit(X_train_tfidf,y_train)

  y = column_or_1d(y, warn=True)


In [89]:
y_pred = logModel.predict(X_test_tfidf)

In [90]:
def getReport(y_test,y_pred):
    """Print the accuracy, then the per-class classification report.

    Both arguments are forwarded unchanged to ``accuracy_score`` and
    ``classification_report``. Nothing is returned.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy : ", accuracy)
    report = classification_report(y_test, y_pred)
    print(report)

In [91]:
getReport(y_test,y_pred)

Accuracy :  0.9508910891089108
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      6949
           1       0.95      0.94      0.95      5676

    accuracy                           0.95     12625
   macro avg       0.95      0.95      0.95     12625
weighted avg       0.95      0.95      0.95     12625



In [93]:
from sklearn.svm import LinearSVC

In [94]:
svc = LinearSVC()

In [95]:
svc.fit(X_train_tfidf,y_train)

  y = column_or_1d(y, warn=True)


In [96]:
y_pred2 = svc.predict(X_test_tfidf)

In [97]:
getReport(y_test,y_pred2)

Accuracy :  0.9580990099009901
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      6949
           1       0.96      0.95      0.95      5676

    accuracy                           0.96     12625
   macro avg       0.96      0.96      0.96     12625
weighted avg       0.96      0.96      0.96     12625



In [98]:
getReport(y_test,y_pred)

Accuracy :  0.9508910891089108
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      6949
           1       0.95      0.94      0.95      5676

    accuracy                           0.95     12625
   macro avg       0.95      0.95      0.95     12625
weighted avg       0.95      0.95      0.95     12625



## Comparing both models, we see that LinearSVC performs slightly better than LogisticRegression (accuracy 0.958 vs. 0.951)

In [101]:
import joblib

# Persist the fitted vectorizer as well: the classifier only understands
# features produced by this exact TF-IDF vocabulary, so the model file alone
# cannot score new text after a kernel restart.
joblib.dump(vect,"TfidfVectorizer.pkl")
joblib.dump(svc,"SVCModel.pkl")

['SVCModel.pkl']

### Let's develop a prediction function

In [140]:
def _prepare_tokens(raw, stopword_set):
    """Run one string through the same steps used on the training columns.

    Clean -> tokenize -> drop stopwords -> stem -> contractions.fix, returning
    the resulting list of tokens.
    """
    tokens = word_tokenize(clean_text(raw))
    kept = [tok for tok in tokens if tok not in stopword_set]
    return [contractions.fix(stemmer.stem(tok)) for tok in kept]


def predict(title,text):
    """Classify a single article with the fitted vectorizer and LinearSVC.

    Parameters
    ----------
    title : str
        Headline of the article.
    text : str
        Body of the article.

    Returns
    -------
    str
        ``"fake"`` when the model predicts label 0, otherwise ``"real"``.

    Relies on the notebook-level objects ``vect``, ``svc``, ``stemmer``,
    ``contractions`` and ``all_stopwords`` being defined.
    """
    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = set(all_stopwords)
    title_tokens = _prepare_tokens(title, stopword_set)
    body_tokens = _prepare_tokens(text, stopword_set)
    # Same "title + ' ' + body" layout as the combined_text training column.
    combined = ' '.join(title_tokens) + ' ' + ' '.join(body_tokens)
    features = vect.transform([combined])
    # predict() returns an array; take the single element explicitly instead
    # of relying on the truth value of a one-element array.
    label = svc.predict(features)[0]
    # NOTE(review): this treats label 0 as fake. The rows displayed by
    # df.head() earlier in the notebook (sensational headlines carrying
    # label 1, a mainstream-style story carrying 0) hint that the encoding may
    # be the opposite — verify against the dataset description.
    return "fake" if label == 0 else "real"

In [142]:
title = "Alien Spaceship Landed in New York City, Officials Silent"
text = "Witnesses reported seeing a large unidentified flying object landing near Central Park late last night. Government officials have refused to comment."
result = predict(title, text)
print("Prediction:", result) 

Prediction: fake
