In [1]:
import pandas as pd 
import numpy as np 

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.linear_model import LogisticRegression 
import re 
import joblib
import string 

In [3]:
fake = pd.read_csv('fake.csv')

In [4]:
true = pd.read_csv('true.csv')

In [5]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [7]:
true.columns
fake.columns

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [8]:
# set the class label in dataset 
fake['class']=0
true['class']=1

In [9]:
# combine both the dataset

data = pd.concat([fake,true],axis = 0)

In [10]:
data.sample(30)

Unnamed: 0,title,text,subject,date,class
16364,MISSING: TWO FILE ‘BOXES’ OF CLINTON E-MAILS…E...,FBI files reveal missing email boxes in Clin...,Government News,"Oct 6, 2016",0
5946,Two Florida ports cancel plans to ink pacts wi...,MIAMI (Reuters) - Two Florida ports have cance...,politicsNews,"January 27, 2017",1
3418,Trump travel ban's fate hinges on emergency U....,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"June 2, 2017",1
7240,Trump denies trying to get security clearance ...,WASHINGTON (Reuters) - Republican President-el...,politicsNews,"November 16, 2016",1
13050,South Korean President Moon to visit China Dec...,BEIJING (Reuters) - South Korean President Moo...,worldnews,"December 6, 2017",1
19922,Trump to meet with long list of leaders in New...,WASHINGTON (Reuters) - U.S. President Donald T...,worldnews,"September 15, 2017",1
9068,JUST IN: PRESIDENT TRUMP AND FIRST LADY Make S...,"President Trump visits Florida hospital, prai...",politics,16-Feb-18,0
17056,Iraq orders arrest of Kurdistan vice president...,BAGHDAD (Reuters) - Iraq s Supreme Justice Cou...,worldnews,"October 19, 2017",1
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
10262,"Romney offers vote, little else, to Cruz in Re...",WASHINGTON (Reuters) - Republican Mitt Romney ...,politicsNews,"March 18, 2016",1


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44898 entries, 0 to 21416
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   class    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [12]:
# drop the features which are not required

data = data.drop(["title","subject","date"],axis = 1)

In [13]:
data.reset_index(inplace =True)

In [14]:
data.drop(['index'],axis =1,inplace=True)

In [15]:
data.head()

Unnamed: 0,text,class
0,Donald Trump just couldn t wish all Americans ...,0
1,House Intelligence Committee Chairman Devin Nu...,0
2,"On Friday, it was revealed that former Milwauk...",0
3,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis used his annual Christmas Day mes...,0


In [16]:
data.sample(5)

Unnamed: 0,text,class
37505,PARIS (Reuters) - French President Emmanuel Ma...,1
23187,Patrick Henningsen 21st Century Wire Hillary C...,0
1151,Attorney General Jeff Sessions appeared before...,0
41714,(Reuters) - A Spanish Constitutional Court rul...,1
20808,Obama s cowardly backdoor gun confiscation sta...,0


In [17]:
# data cleaning

def clean_text(text):
    # convert into lowercase   
    text = text.lower()                                                                                    
    text = re.sub(r'\[.*?\]', "", text)                                                          
    text = re.sub(r'\W', " ", text)
     # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', "", text)
    # Remove HTML tags
    text = re.sub(r'<.*?>+', "", text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), "", text)
    # Remove New line
    text = re.sub(r'\n', " ", text)
    text = re.sub(r'\w*\d\w*', "", text)
    return text

In [18]:
# apply this function to the text column in dataset

data["text"] = data["text"].apply(clean_text)

In [19]:
# create a X and Y variable for model
x = data["text"]
y = data["class"]

In [20]:
# train test split

xtrain ,xtest, ytrain, ytest = train_test_split(x,y,test_size=0.25,random_state=42) 

In [21]:
xtrain.shape
ytest.shape

(11225,)

In [22]:
# use tfidfvectorizer for converting the categorical data into numerical data

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(xtrain)
xv_test = vectorizer.transform(xtest)

In [23]:
xv_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2285159 stored elements and shape (11225, 94931)>

In [24]:
# logistic regression

lr = LogisticRegression()
lr.fit(xv_train,ytrain)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [25]:
# prediction

prediction = lr.predict(xv_test)
lr.score(xv_test,ytest)

0.9858351893095768

In [26]:
from sklearn.metrics import classification_report
print(classification_report(ytest, prediction))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5895
           1       0.98      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [27]:
from sklearn.tree import  DecisionTreeClassifier

In [28]:
DTC = DecisionTreeClassifier()
DTC.fit(xv_train,ytrain)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [29]:
pred_dtc = DTC.predict(xv_test)

In [30]:
DTC.score(xv_test,ytest)

0.9956347438752784

In [31]:
print(classification_report(ytest,pred_dtc))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5895
           1       1.00      1.00      1.00      5330

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



In [32]:
joblib.dump(vectorizer,"vectorizer.jb")
joblib.dump(lr,"lr_model.jb")


['lr_model.jb']

In [33]:
joblib.dump(DTC,"DTC_model.jb")

['DTC_model.jb']