In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DESKTOP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DESKTOP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [28]:
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
import re


In [3]:
#printing stopwords in English
print (stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

Load Dataset


In [4]:
data=pd.read_csv('WELFake_Dataset.csv')
data.shape

(72134, 4)

In [5]:
data.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [6]:
data.rename(columns={'Unnamed: 0': 'id'}, inplace=True)


In [7]:
data.head()

Unnamed: 0,id,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      72134 non-null  int64 
 1   title   71576 non-null  object
 2   text    72095 non-null  object
 3   label   72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


Preprocessing

In [9]:
#printing number of missing values
data.isnull().sum()

id         0
title    558
text      39
label      0
dtype: int64

In [10]:
#handle missing values with empty string
data=data.fillna('')

In [14]:
data.isnull().sum()

id       0
title    0
text     0
label    0
dtype: int64

In [11]:
# separating the data & label
X = data.drop(columns='label', axis=1)
Y = data['label']

In [12]:
print(X)

          id                                              title  \
0          0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1          1                                                      
2          2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3          3  Bobby Jindal, raised Hindu, uses story of Chri...   
4          4  SATAN 2: Russia unvelis an image of its terrif...   
...      ...                                                ...   
72129  72129  Russians steal research on Trump in hack of U....   
72130  72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131  72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132  72132  Trump tussle gives unpopular Mexican leader mu...   
72133  72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  
0      No comment is expected from Barack Obama Membe...  
1         Did they post their votes for Hillary already?  
2       Now, most 

In [13]:
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 72134, dtype: int64


Stemming:


In [15]:
# Stemming is the process of reducing a word to its Root word

#example:
#actor, actress, acting --> act


port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [17]:
data['title'] = data['title'].apply(stemming)

In [18]:
print(data['title'])

0        law enforc high alert follow threat cop white ...
1                                                         
2        unbeliev obama attorney gener say charlott rio...
3        bobbi jindal rais hindu use stori christian co...
4        satan russia unv imag terrifi new supernuk wes...
                               ...                        
72129    russian steal research trump hack u democrat p...
72130    watch giuliani demand democrat apolog trump ra...
72131         migrant refus leav train refuge camp hungari
72132    trump tussl give unpopular mexican leader much...
72133           goldman sach endors hillari clinton presid
Name: title, Length: 72134, dtype: object


In [20]:
#separating the data and label
X = data['title'].values
Y = data['label'].values

In [21]:
print(X)

['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'
 ''
 'unbeliev obama attorney gener say charlott rioter peac protest home state north carolina video'
 ... 'migrant refus leav train refuge camp hungari'
 'trump tussl give unpopular mexican leader much need shot arm'
 'goldman sach endors hillari clinton presid']


In [22]:
print(Y)

[1 1 1 ... 0 0 1]


In [23]:
Y.shape

(72134,)

In [24]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [25]:
print(X)

  (np.int32(0), np.int32(407))	0.3190180925014663
  (np.int32(0), np.int32(1802))	0.33473541566384035
  (np.int32(0), np.int32(3679))	0.24871262252022117
  (np.int32(0), np.int32(5509))	0.31820565801047196
  (np.int32(0), np.int32(6425))	0.28932771754845743
  (np.int32(0), np.int32(6730))	0.48553136502134386
  (np.int32(0), np.int32(7887))	0.26746434949988324
  (np.int32(0), np.int32(9699))	0.22829788917209384
  (np.int32(0), np.int32(17260))	0.24871262252022117
  (np.int32(0), np.int32(17363))	0.2542650376115143
  (np.int32(0), np.int32(18648))	0.1297506867782943
  (np.int32(0), np.int32(19106))	0.19134939529376566
  (np.int32(2), np.int32(1049))	0.28404017886581956
  (np.int32(2), np.int32(2673))	0.30809679188606154
  (np.int32(2), np.int32(2919))	0.3639616996972358
  (np.int32(2), np.int32(6880))	0.2652283770602196
  (np.int32(2), np.int32(8020))	0.2692285294185893
  (np.int32(2), np.int32(11864))	0.2231406266784195
  (np.int32(2), np.int32(12011))	0.16878852994653004
  (np.int32(2)

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

Training the Model: Logistic Regression


In [27]:
model = LogisticRegression()
model.fit(X_train, Y_train)

Evaluation

In [29]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9193858630668723


In [30]:
# accuracy score on the test data
X_test_prediction = model.predict(X_test)
print("Accuracy:", accuracy_score(X_test_prediction, Y_test))
print("\nClassification Report:\n")
print(classification_report(X_test_prediction, Y_test))

Accuracy: 0.900603035974215

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      6866
           1       0.91      0.90      0.90      7561

    accuracy                           0.90     14427
   macro avg       0.90      0.90      0.90     14427
weighted avg       0.90      0.90      0.90     14427



Make Prediction


In [31]:
X_new = X_test[3]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [32]:
print(Y_test[2])

1
