Importing the Dependencies

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection & Pre-Processing

In [12]:
# loading the data from csv file to a pandas Dataframe
raw_news_data = pd.read_csv('/content/drive/MyDrive/Datasets/WELFake_Dataset.csv')

In [13]:
print(raw_news_data)

       Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1               1                                                NaN   
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  label  
0      No comment is expected from Barack Obama Membe...      1  
1  

In [14]:
# replace the null values with a null string
news_data = raw_news_data.where((pd.notnull(raw_news_data)),'')

In [15]:
# printing the first 5 rows of the dataframe
news_data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [16]:
# checking the number of rows and columns in the dataframe
news_data.shape

(72134, 4)

Label Encoding

In [18]:
# label fake as 0;  real as 1;

news_data.loc[news_data['label'] == 'fake', 'label',] = 0
news_data.loc[news_data['label'] == 'real', 'label',] = 1

fake  -  0

real  -  1

In [20]:
# separating the data as texts and label

X = news_data['text']

Y = news_data['label']

In [21]:
print(X)

0        No comment is expected from Barack Obama Membe...
1           Did they post their votes for Hillary already?
2         Now, most of the demonstrators gathered last ...
3        A dozen politically active pastors came here f...
4        The RS-28 Sarmat missile, dubbed Satan 2, will...
                               ...                        
72129    WASHINGTON (Reuters) - Hackers believed to be ...
72130    You know, because in fantasyland Republicans n...
72131    Migrants Refuse To Leave Train At Refugee Camp...
72132    MEXICO CITY (Reuters) - Donald Trump’s combati...
72133    Goldman Sachs Endorses Hillary Clinton For Pre...
Name: text, Length: 72134, dtype: object


In [22]:
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 72134, dtype: int64


Splitting the data into training data & test data

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)

In [24]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(72134,)
(57707,)
(14427,)


Feature Extraction

In [25]:
# transform the text data to feature vectors that can be used as input to the Logistic regression

feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english', lowercase=True)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [26]:
print(X_train)

3100     President Trump gave the commencement speech f...
63673    What a hero what a guy what a crazy world we l...
12248    VATICAN CITY (Reuters) - Pope Francis and Orth...
5597     This post has been updated.\n\nVeteran counter...
46323    Ivanka Trump has been sort of the female face ...
                               ...                        
25365    usapoliticsnow admin 2016 Election , US News ,...
48056    PRICELESS! MILO DESTROYS Heckling Muslim Woman...
59011    We called it the Yonsei Beach Club. It convene...
67224    ROME (Reuters) - The leader of one of Italy s ...
71530    Scott \nIt’s really amazing to see how little ...
Name: text, Length: 57707, dtype: object


In [27]:
print(X_train_features)

  (0, 162443)	0.2213765709411378
  (0, 188270)	0.06487852383194201
  (0, 40633)	0.06280772929810552
  (0, 194085)	0.08878384405874246
  (0, 192214)	0.10050074533411787
  (0, 83758)	0.07849723964776593
  (0, 119694)	0.14442462111441834
  (0, 75692)	0.06280772929810552
  (0, 191315)	0.06482214980172656
  (0, 112405)	0.06981556520052275
  (0, 68122)	0.11641205407755849
  (0, 74992)	0.06266093516282044
  (0, 152748)	0.027983259411390335
  (0, 88862)	0.08185558194567329
  (0, 163975)	0.07192291950203211
  (0, 188185)	0.04859506629876849
  (0, 157345)	0.16841738255953353
  (0, 58180)	0.1489370514021357
  (0, 51587)	0.06313329637023725
  (0, 174047)	0.06339113303248887
  (0, 86442)	0.06479402116725498
  (0, 53869)	0.12815799616487347
  (0, 181196)	0.07744780123567781
  (0, 132684)	0.03583554103553187
  (0, 11158)	0.0602347440453464
  :	:
  (57706, 48412)	0.1098366026403824
  (57706, 112914)	0.031346441308971795
  (57706, 168914)	0.04451379020313469
  (57706, 114202)	0.06269288261794359
  (577

Training the Model

Logistic Regression

In [28]:
model = LogisticRegression()

In [29]:
# training the Logistic Regression model with the training data
model.fit(X_train_features, Y_train)

Evaluating the trained model

In [30]:
# prediction on training data

prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [31]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9586185384788674


In [32]:
# prediction on test data

prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [33]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9442018437651626


Building a Predictive System

In [34]:
input_news = [" "]
# convert text to feature vectors
input_data_features = feature_extraction.transform(input_news)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('real news')

else:
  print('fake news')

[1]
real news


In [35]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Confusion matrix on training data
cm_training_data = confusion_matrix(Y_train, prediction_on_training_data)
print('Confusion Matrix on Training Data:\n', cm_training_data)

# Classification report on training data
cr_training_data = classification_report(Y_train, prediction_on_training_data)
print('\nClassification Report on Training Data:\n', cr_training_data)

# Accuracy on training data
acc_training_data = accuracy_score(Y_train, prediction_on_training_data)
print('\nAccuracy on Training Data:', acc_training_data)

# Confusion matrix on test data
cm_test_data = confusion_matrix(Y_test, prediction_on_test_data)
print('\nConfusion Matrix on Test Data:\n', cm_test_data)

# Classification report on test data
cr_test_data = classification_report(Y_test, prediction_on_test_data)
print('\nClassification Report on Test Data:\n', cr_test_data)

# Accuracy on test data
acc_test_data = accuracy_score(Y_test, prediction_on_test_data)
print('\nAccuracy on Test Data:', acc_test_data)


Confusion Matrix on Training Data:
 [[26567  1454]
 [  934 28752]]

Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.97      0.95      0.96     28021
           1       0.95      0.97      0.96     29686

    accuracy                           0.96     57707
   macro avg       0.96      0.96      0.96     57707
weighted avg       0.96      0.96      0.96     57707


Accuracy on Training Data: 0.9586185384788674

Confusion Matrix on Test Data:
 [[6542  465]
 [ 340 7080]]

Classification Report on Test Data:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      7007
           1       0.94      0.95      0.95      7420

    accuracy                           0.94     14427
   macro avg       0.94      0.94      0.94     14427
weighted avg       0.94      0.94      0.94     14427


Accuracy on Test Data: 0.9442018437651626
