In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import string

In [2]:
fake = pd.read_csv("data/Fake.csv")
true = pd.read_csv("data/True.csv")

In [3]:
fake.shape

(23481, 4)

In [4]:
true.shape

(21417, 4)

In [5]:
fake['target'] = 'fake'
true['target'] = 'true'

In [6]:
data = pd.concat([fake, true]).reset_index(drop = True)
data.shape

(44898, 5)

In [7]:
# Shuffle the data
from sklearn.utils import shuffle
data = shuffle(data)
data = data.reset_index(drop=True)

In [8]:
data.head(10)

Unnamed: 0,title,text,subject,date,target
0,HYPOCRITES! CHECK OUT Massive Structure Party ...,It s interesting how the media has completely ...,politics,"Jul 24, 2016",fake
1,Not Kidding! Obama’s Dept. Of Education Orders...,This news is shocking and I m glad my kids don...,politics,"Nov 3, 2015",fake
2,BOILER ROOM – Presidential Debate Simulcast Sp...,Tune in to the Alternate Current Radio Network...,US_News,"September 27, 2016",fake
3,Transcripts of Clinton's Wall Street talks rel...,(Reuters) - U.S. Democratic presidential candi...,politicsNews,"October 15, 2016",true
4,Trump Encourages Audience To Call Ted Cruz A ...,Just when you think Donald Trump can t get any...,News,"February 8, 2016",fake
5,U.S. service member killed in Iraq IED blast: ...,WASHINGTON (Reuters) - A U.S. service member f...,worldnews,"October 2, 2017",true
6,Episode #203 – SUNDAY WIRE: ‘The Dotard Effect...,Episode #203 of SUNDAY WIRE SHOW resumes on S...,Middle-east,"September 24, 2017",fake
7,Trump’s First Government Agency Visit: CIA,RT . President Donald Trump spoke at CIA Head...,Middle-east,"January 22, 2017",fake
8,Conservative Questions Loyalty Of Muslim Fath...,This is outrageous and someone needs to put th...,News,"July 29, 2016",fake
9,White House budget chief expects delay in hitt...,WASHINGTON (Reuters) - White House budget chie...,politicsNews,"February 28, 2017",true


In [9]:
data.drop(["date", "title"],axis=1,inplace=True)
data.head()

Unnamed: 0,text,subject,target
0,It s interesting how the media has completely ...,politics,fake
1,This news is shocking and I m glad my kids don...,politics,fake
2,Tune in to the Alternate Current Radio Network...,US_News,fake
3,(Reuters) - U.S. Democratic presidential candi...,politicsNews,true
4,Just when you think Donald Trump can t get any...,News,fake


In [10]:
# Remove punctuation
def punctuation_removal(text):
    all_list = [char for char in text if char not in string.punctuation]
    clean_str = ''.join(all_list)
    return clean_str

data['text'] = data['text'].apply(punctuation_removal)

In [11]:
data.head()

Unnamed: 0,text,subject,target
0,It s interesting how the media has completely ...,politics,fake
1,This news is shocking and I m glad my kids don...,politics,fake
2,Tune in to the Alternate Current Radio Network...,US_News,fake
3,Reuters US Democratic presidential candidate ...,politicsNews,true
4,Just when you think Donald Trump can t get any...,News,fake


In [12]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sreehari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
data.head()

Unnamed: 0,text,subject,target
0,It interesting media completely ignored massiv...,politics,fake
1,This news shocking I glad kids go taxpayer fun...,politics,fake
2,Tune Alternate Current Radio Network ACR anoth...,US_News,fake
3,Reuters US Democratic presidential candidate H...,politicsNews,true
4,Just think Donald Trump get grotesque proves u...,News,fake


In [14]:
# convert to lower case
data['text'] = data['text'].apply(lambda x: x.lower())
data.head()

Unnamed: 0,text,subject,target
0,it interesting media completely ignored massiv...,politics,fake
1,this news shocking i glad kids go taxpayer fun...,politics,fake
2,tune alternate current radio network acr anoth...,US_News,fake
3,reuters us democratic presidential candidate h...,politicsNews,true
4,just think donald trump get grotesque proves u...,News,fake


In [15]:
# Prepare the data for training and testing
X_train,X_test,y_train,y_test = train_test_split(data['text'], data.target, test_size=0.2, random_state=42)

In [16]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])

# Fitting the model
model = pipe.fit(X_train, y_train)

# Accuracy
prediction = model.predict(X_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))



accuracy: 98.59%


In [17]:
from joblib import dump

In [18]:
dump(model, 'lr.joblib') 

['lr.joblib']