In [2]:
# importing necessary libraries
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [3]:
# Load the news articles from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="fake_or_real_news.csv")

In [4]:
# Display the loaded DataFrame to inspect its columns and a sample of its rows
data

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [5]:
# Add a binary target column "Fake": 0 when the label is 'REAL', 1 for any other value.
# A vectorized comparison replaces the per-row Python lambda; the explicit int64 cast
# keeps the integer dtype independent of the platform default.
data['Fake'] = data['label'].ne('REAL').astype('int64')

In [6]:
# Display the DataFrame again to check the newly added "Fake" column
data

Unnamed: 0,id,title,text,label,Fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,0
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,0


In [7]:
# The textual label is redundant now that "Fake" encodes it, so remove that column
data = data.drop(columns=['label'])

In [8]:
# Display the DataFrame to confirm the label column has been removed
data

Unnamed: 0,id,title,text,Fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",1
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,1
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,0
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",1
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,0
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,0
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,1
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,1
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",0


In [9]:
# Model input is the article text; the target is the binary "Fake" indicator
X = data['text']
y = data['Fake']

In [10]:
# Display X (the article texts used as model input)
X

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [11]:
# Display y (the 0/1 "Fake" target values)
y

0       1
1       1
2       0
3       1
4       0
       ..
6330    0
6331    1
6332    1
6333    0
6334    0
Name: Fake, Length: 6335, dtype: int64

In [12]:
# Hold out 20% of the articles as a test set.
# random_state pins the shuffle so that re-running the notebook reproduces the same
# split (and therefore the same scores and the same sample taken from X_test later on).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [13]:
# Build the TF-IDF vectorizer: English stop words are removed and terms that
# appear in more than 70% of the documents are ignored
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

In [14]:
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to encode the test texts
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [15]:
# Create a linear support-vector classifier and fit it on the vectorized training set
# (fit returns the estimator itself, which is then shown as the cell output)
classifierSVC = LinearSVC().fit(X_train_vectorized, y_train)
classifierSVC



In [16]:
# Accuracy of the linear SVC on the held-out test set
classifierSVC.score(X=X_test_vectorized, y=y_test)

0.9392265193370166

In [17]:
# Create a logistic-regression classifier and fit it on the vectorized training set
# (fit returns the estimator itself, which is then shown as the cell output)
classifierLR = LogisticRegression().fit(X_train_vectorized, y_train)
classifierLR

In [18]:
# Accuracy of the logistic-regression classifier on the held-out test set
classifierLR.score(X=X_test_vectorized, y=y_test)

0.920284135753749

In [19]:
# Create a multinomial naive Bayes classifier and fit it on the vectorized training set
# (fit returns the estimator itself, which is then shown as the cell output)
classifierNB = MultinomialNB().fit(X_train_vectorized, y_train)
classifierNB

In [20]:
# Accuracy of the naive Bayes classifier on the held-out test set
classifierNB.score(X=X_test_vectorized, y=y_test)

0.829518547750592

In [21]:
# Save one article from the test set (position 10) to disk as a sample input
with open("text.txt", mode="w", encoding="utf-8") as sample_file:
    sample_file.write(X_test.iloc[10])

In [22]:
# Load the saved sample article back from disk
with open("text.txt", mode="r", encoding="utf-8") as sample_file:
    text = sample_file.read()

In [23]:
# Display the text that was read back from the file
text

'Polling and caucus sites opened Saturday morning in five states in which rivals to Hillary Clinton and Donald Trump will try to slow the Democratic and Republican front-runners\' march to their respective party’s presidential nomination.\n\nRepublicans are caucusing in Kansas, Kentucky and Maine, while Democrats are caucusing in Kansas and Nebraska. Republicans and Democrats also are voting Saturday in the Louisiana primary.\n\nMaine Democrats caucus on Sunday, while voters in both parties go to the polls in the Puerto Rico primary.\n\nTexas GOP Sen. Ted Cruz is hoping to do well in Kansas, Kentucky and Maine caucuses, and the Louisiana primary. A good showing would help him secure his position as the No. 2 GOP candidate ahead of Florida Sen. Marco Rubio. But neither appears to have a path toward winning enough delegates or the nomination.\n\n"Being a conservative cannot just be about how loud you\'re willing to scream ... or about how many names you call people," Rubio said Saturday 

In [24]:
# Encode the sample with the already-fitted vectorizer; the text is wrapped in a
# one-element list so that it is passed as a collection of documents
vectorized_text = vectorizer.transform(raw_documents=[text])

In [25]:
# Predict with the logistic-regression classifier whether the sample is fake (1) or real (0)
classifierLR.predict(vectorized_text)

array([0], dtype=int64)

In [26]:
# Predict with the linear SVC and take the first (only) element of the result: 1 = fake, 0 = real
classifierSVC.predict(vectorized_text)[0]

0

In [27]:
# Look up the true label of the same test article (position 10, the one written to text.txt)
# to compare it with the predictions
y_test.iloc[10]

0

In [28]:
# Persist the trained SVC and the fitted vectorizer so both can be reloaded together later
# (joblib is imported with the other libraries at the top of the notebook).
# NOTE: these files are pickles; only load them from sources you trust.
joblib.dump(classifierSVC, 'classifier.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [29]:
# Show the full text of the article at row position 4
data['text'].iloc[4]

'It\'s primary day in New York and front-runners Hillary Clinton and Donald Trump are leading in the polls.\n\nTrump is now vowing to win enough delegates to clinch the Republican nomination and prevent a contested convention. But Sens.Ted Cruz, R-Texas, Bernie Sanders, D-Vt., and Ohio Gov. John Kasich and aren\'t giving up just yet.\n\nA big win in New York could tip the scales for both the Republican and Democratic front-runners in this year\'s race for the White House. Clinton and Trump have each suffered losses in recent contests, shifting the momentum to their rivals.\n\n"We have won eight out of the last nine caucuses and primaries! Cheer!" Sanders recently told supporters.\n\nWhile wins in New York for Trump and Clinton are expected, the margins of those victories are also important.\n\nTrump needs to capture more than 50 percent of the vote statewide if he wants to be positioned to win all of the state\'s 95 GOP delegates. That would put him one step closer to avoiding a contes

In [30]:
# Show the full text of the article at row position 0
data['text'].iloc[0]

