In [34]:
# importing necessary libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [35]:
# Read and store the data 
data = pd.read_csv("fake_or_real_news.csv")

In [36]:
# Print data
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [37]:
# Create a column "Fake" which tells if the article is fake or not
data['Fake'] = data['label'].apply(lambda x: 0 if x == 'REAL' else 1)

In [38]:
# See the data
data

Unnamed: 0,id,title,text,label,Fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,0


In [39]:
# Now drop the label column which we don't need anymore
data = data.drop('label', axis = 1)

In [40]:
# Print data
data

Unnamed: 0,id,title,text,Fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


In [41]:
# Store text, Fake in variables X and y respectively
X, y = data['text'], data['Fake']

In [42]:
#Print X
X

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [43]:
# Print y
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: Fake, Length: 6335, dtype: int64

In [44]:
# Use train_test_split to divide the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [45]:
# Initialise TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words= 'english', max_df=0.7)

In [46]:
# Vectorize data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [47]:
# Initialise Classifier and train it on training data
classifierSVC = LinearSVC()
classifierSVC.fit(X_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [49]:
# Measure the accuracy of the classifier
classifierSVC.score(X_test_vectorized, y_test)

0.930544593528019

In [48]:
# Initialise Classifier and train it on training data
classifierLR = LogisticRegression()
classifierLR.fit(X_train_vectorized, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [50]:
# Measure the accuracy of the classifier
classifierLR.score(X_test_vectorized, y_test)

0.9116022099447514

In [51]:
# Take a sample text from test data and store in a file
with open ("text.txt", "w", encoding= "utf-8") as f:
    f.write(X_test.iloc[10])

In [52]:
# Retrieve the text from the file
with open ("text.txt", "r", encoding= "utf-8") as f:
    text = f.read()

In [53]:
# Print the text
text

"Republican presidential candidate Carly Fiorina said Sunday she could see how competitor\xa0Donald Trump's immigration rhetoric, which has gotten him in some hot water over the past few weeks, resonates with some voters.\n\nAsked on ABC's This Week\xa0whether\xa0she would support him if he became\xa0the nominee, the former Hewlett-Packard CEO\xa0said, “I have been in New Hampshire now for six days, and I have not been asked a single question about Donald Trump.” Last week in the first-in-the-nation primary state, she said Trump didn't represent her or her party.\n\nOn the other hand, she said on ABC, “I think Donald Trump taps into an anger that I hear every day.\xa0People are angry that a commonsense thing like securing the border or ending sanctuary cities is somehow considered extreme. It's not extreme, it's common sense. We need to secure the border.”\n\nWhile decrying illegal immigration in his campaign announcement speech on June 16, Trump called the Mexican immigrants entering 

In [54]:
# Vectorize the text retrieved
vectorized_text = vectorizer.transform([text])

In [56]:
# Predict whether it is fake or not using the classifier
classifierLR.predict(vectorized_text)

array([0], dtype=int64)

In [58]:
# Predict whether it is fake or not using the classifier
classifierSVC.predict(vectorized_text)

array([0], dtype=int64)

In [59]:
# Verify if it is correct or not using the actual data
y_test.iloc[10]

0