In [1]:
# Import the dependencies

import string
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from textblob import TextBlob, Word
import spacy

from multiprocessing import Process, Value, Pool

import re

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Ignore warnings

import warnings
warnings.filterwarnings("ignore")

### Load the data

#### First dataset

In [3]:
# Load the data from the first dataset

ds1_real_news = pd.read_csv("data/1/news1/True.csv")
ds1_fake_news = pd.read_csv("data/1/news1/Fake.csv")

#### Second Dataset

In [4]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

### Analysis and transformations

#### First dataset

In [5]:
# Show the first five rows of the dataset composed of ds1_real_news

ds1_real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
# Check the dimensions of the ds1_real_news dataframe

ds1_real_news.shape

(21417, 4)

In [7]:
# Check for null values in the ds1_real_news dataframe

ds1_real_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [8]:
# Show the first five rows of the dataset composed of ds1_fake_news

ds1_fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [9]:
# Check the dimensions of the ds1_fake_news dataframe

ds1_fake_news.shape

(23481, 4)

In [10]:
# Check for null values in the ds1_fake_news dataframe

ds1_fake_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [11]:
# Add a label column to the ds1_real_news dataframe
ds1_real_news["label"] = "REAL"

# Add a label column to the ds1_fake_news dataframe
ds1_fake_news["label"] = "FAKE"

# Merge the two datasets vertically
ds1_news = ds1_real_news.append(ds1_fake_news, ignore_index=True)
ds1_news.shape

(44898, 5)

In [12]:
ds1_news.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",REAL


In [13]:
# Remove the date and subject columns because it does not contain any useful information
ds1_news.drop(["date","subject"],axis=1,inplace=True)
ds1_news.columns

Index(['title', 'text', 'label'], dtype='object')

#### Second Dataset

In [14]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

In [15]:
# Show the first five rows of the second dataset composed of both real and fake news

ds2_news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [16]:
# Check the dimensions of the all_news_2 dataframe

ds2_news.shape

(6335, 4)

In [17]:
# Check for null values in the all_news_2 dataframe

ds2_news.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [18]:
# Drop the Unnamed: 0 column from the news dataframe

ds2_news.drop(columns=["Unnamed: 0"], inplace=True)
ds2_news.columns

Index(['title', 'text', 'label'], dtype='object')

In [19]:
# Merge the first dataset with the second dataset

all_news = ds1_news.append(ds2_news, ignore_index=True)
all_news.head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,REAL


In [20]:
# Show the dimensions of the merged dataset
all_news.shape

(51233, 3)

In [21]:
# shuffle the rows of the all_news dataframe 10 times

for i in range(10):
    all_news = shuffle(all_news)

In [22]:
all_news.head()

Unnamed: 0,title,text,label
19652,Togolese to vote on presidential term limits a...,LOME (Reuters) - A bill to limit presidents in...,REAL
33278,FLASHBACK: Valerie Jarrett’s Daughter Hired By...,CNN Announced: The Justice Department and Sup...,FAKE
23845,GOP Lawmaker To Trump: Stop Acting Like A ‘Fr...,Angry constituents have inundated town hall me...,FAKE
13112,"Toll in Sanaa fighting rises to 234 killed, 40...",GENEVA (Reuters) - The toll in fighting in the...,REAL
47033,Obama says he learned of Clinton using private...,President Obama says he first learned from new...,REAL


In [23]:
# Check to make sure that the data is balanced

lenReal = len(all_news.loc[all_news["label"]=="REAL"])
lenFake = len(all_news.loc[all_news["label"]=="FAKE"])

assert( lenReal >= lenFake * 0.75 and lenReal <= lenFake * 1.25)

### Processing text column

In [24]:
 # =======================================================================
# This method transforms a given passage of text to lower case

# Input:
#     text - the text to be transformed to lower case
# Return:
#     the text in lower case
# =======================================================================

def to_lower_case(df,column):
    
    lowercase = df[column].apply(lambda x: x.lower())
    
    return lowercase

In [25]:


def join_chars(text):
    
    if not isinstance(text,str):
        print("This")
    # print(f"type: {type(text)} \noffending text --> {text}")
    
    joined = "".join([char for char in text if char not in string.punctuation])
    return joined               

In [26]:
# =========================================================================
# This method removes the punctuation characters from the text
#
# Input:
#     text - the text from which punctuation is to be removed
# Return:
#     the text with the punctuation removed    
# ========================================================================

def remove_punctuation(df,column):
    try:
        df["lc_rp"] = df[column].apply(lambda x: join_chars(x))
    except:
        print(f"Exception: {df}")
        
    return df

In [27]:
# ========================================================================
# This method removes all the numeric strings
#
# Input: 
#     text - the text from which the numeric strings are going to be removed
# Return:
#     the text with numeric strings removed
# ========================================================================

def remove_digits(df, column):
    digits = '0123456789'
    
    no_digits = df[column].apply(lambda l: "".join([x for x in l if str(x) not in digits]))
    
    #     no_digits = df[column].apply(lambda l: " ".join([w for w in l if not w.isnumeric()]))
    
    return no_digits

In [28]:
# ========================================================================
# This method places a space between periods and commas at the end of a sentence, and
# the beginning of the following sentence.
#
# Input:
#     text - the text to be transformed
# Return:
#     the text with spaces placed after period characters
# ========================================================================

def period_space(df, column):
    
    period_space = df[column].apply(lambda x: print(re.subn(r'(\s\w+[\.\,])(\w+\s)', '\\1 \\2', x, flags=re.IGNORECASE)))

    return period_space

In [29]:
# ========================================================================
# This method returns a list of customized stopwords from the text
#
# Input:
#     None
# Return:
#     the list of stopwords
# ========================================================================

def stop_words():
    sw1 = stopwords.words("english")
    sw2 = open("nlp/stop_words_english.txt").read().splitlines()

    return set(sw1 + sw2)

In [30]:
# ========================================================================
# This method removes the stopwords from the text
#
# Input:
#     text - the text from which stopwords are to be removed
# Return:
#     the text with the stopwords removed
# ========================================================================

def remove_stopwords(df,column):
    text_no_sw = df[column].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words()]))
    
    return text_no_sw

In [31]:
# ========================================================================
# This method removes the stopwords from the text
#
# Input:
#     df - the articles dataframe
#     column - the dataframe column containing the text to be lemmatized
# Return:
#     a list of lemmatized words
# ========================================================================

def lemmatize(df,column):
    nlp = spacy.load('en')

    lemmatized = df[column].apply(lambda x: " ".join(set([token.lemma_ for token in nlp(x)])))
    
    return lemmatized

In [32]:
# ========================================================================
# This method cleans the articles text by calling the above cleaning 
# methods.
#
# Input:
#     args - argument tuple
# Return:
#     the dataframe with the cleaned articles in the "lemmatized" columns
# ========================================================================

def clean_articles(args):
    
    # Grab the first two items from the args array: the dataframe to clean, and the "text" column
    df, column = args[0:2]
    
    # Convert all the characters of the text column to lower case and place them into the new "lowercase" column
    df["lowercase"] = df[column].apply(lambda x: x.lower())
    # Drop the "text" column from the dataframe
    df.drop(columns=[column],inplace=True)
    
    # Add spaces between periods and commans, and their proceeding characters to the text in the lowercase
    # column to create the "add_spaces"
    df["add_spaces"] = df["lowercase"].apply(lambda x: re.subn(r'(\s\w+[\.\,])(\w+\s)', '\\1 \\2', x, flags=re.IGNORECASE)[0])
    # Drop the "lowercase" column
    df.drop(columns=["lowercase"],inplace=True)
    
    # Remove all digit characters from the "add_spaces" column to create the "no_digits" column
    df["no_digits"] = remove_digits(df,"add_spaces")
    # Drop the "add_spaces" column
    df.drop(columns=["add_spaces"],inplace=True)
    
    # Remove the stop words from the "no_digits" column to create the "text_no_sw" column
    df["text_no_sw"] = remove_stopwords(df,"no_digits")
    # Drop the "no_digits" column
    df.drop(columns=["no_digits"],inplace=True)
 
    # Lemmatize all the words in the "text_no_sw" column to create the "lemmatized" column
    df["lemmatized"] = lemmatize(df,"text_no_sw")
    # Drop the "text_no_sw" column
    df.drop(columns=["text_no_sw"],inplace=True)
   
    return df

In [33]:
# Reset the index of the all_news dataframe

all_news.reset_index(inplace=True)

In [34]:
# Run six instance of the clean_articles function in parallel
# Parallellize processing - Use 6 cores

# Each thread processes 10,000 rows or less
batch_size = 10000

# Get the number of rows in the data
rows = all_news.shape[0]

# Get the number of processes needed (integer division)
num_proc = rows // batch_size

# Create the args_list list which will be passed to pass on to the clean_articles function
args_list = []

# for each process
for n in range(num_proc):
    
    # Determine the rows of data to be processed by the thread
    start,end = n*batch_size, (n+1)*batch_size
    
    # Create the args list for the thread
    args = ( all_news.iloc[start:end], "text")
    
    # Append the args to args_list
    args_list.append(args)


# If the number of rows is not evenly divisible by the batch size
if rows % batch_size > 0:
    
    # Capture the remaining rows of the data to be processes
    start,end = (num_proc) * batch_size, rows
    args = ( all_news.iloc[start:end], "text")
    args_list.append(args)
    num_proc += 1

# Create a process pool containing <num_proc> processes
p = Pool(processes=num_proc)
#p = Pool(processes=6)

# Clean all the segments of the dataframe (returns a list of dataframes)
data = p.map(clean_articles, args_list)
p.close()

# Create the cleaned dataframe
cleaned = pd.DataFrame()

# Append the cleaned dataframes to the "cleaned" dataframe
for df in data:
    cleaned = cleaned.append(df)


In [35]:
cleaned.head()

Unnamed: 0,index,title,label,lemmatized
0,19652,Togolese to vote on presidential term limits a...,REAL,spill call street death publicly ecowas force ...
1,33278,FLASHBACK: Valerie Jarrett’s Daughter Hired By...,FAKE,check out really representative surprised slap...
2,23845,GOP Lawmaker To Trump: Stop Acting Like A ‘Fr...,FAKE,rep call agitated stop wondering @justinamash ...
3,13112,"Toll in Sanaa fighting rises to 234 killed, 40...",REAL,kill fight risen death united nations hospital...
4,47033,Obama says he learned of Clinton using private...,REAL,rep archive secure comment dampen relate admin...


### Machine Learning

Since this dataset contains FAKE and REAL news articles, no merge step is required.

The remainging steps, including parsing, stemming or lemmatization, vectorization, and then classification machine learning, that were mentioned above, are all steps that would be applied to process this dataset. 

In [36]:
# Vectorize the lemmatized data

tfidf_vect = TfidfVectorizer()

tfidf_vect.fit(cleaned.lemmatized)
X = tfidf_vect.transform(cleaned.lemmatized)

In [37]:
# Determine the length of the vocabulary that was returned in the vectorization

vocab = list(tfidf_vect.vocabulary_)
vocab_set = set(vocab)

len(vocab_set)

128065

In [38]:
# Target Series creation

le = LabelEncoder()
le.fit(cleaned.label)
y = le.transform(cleaned.label)

In [39]:
# Show encoded values of "REAL" and "FAKE"

le.transform(["REAL","FAKE"])

array([1, 0])

In [40]:
# Split the data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X,y)  # Keep default split proportions

In [41]:
# Show the dimensions of the training feature and target sets

X_train.shape, y_train.shape

((38424, 128065), (38424,))

In [42]:
# Instantiate LogisticRegression and fit a model on the training dataset

log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

LogisticRegression()

In [43]:
# Predict the target values of the test dataset

y_pred = log_reg.predict(X_test)

In [44]:
# Show the confusion matrix of the predictions

confusion_matrix(y_test, y_pred)

array([[6545,  122],
       [ 252, 5890]])

In [45]:
# Show the precision, recall, and f1 score values

precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred, pos_label="FAKE")

precision, recall, fscore

(array([0.96292482, 0.97970725]),
 array([0.98170091, 0.95897102]),
 array([0.97222222, 0.96922824]))

In [46]:
# Save the ML model for use in the prediction algorithm

pkl_filename = "logistic_reg_model.pkl"

with open(pkl_filename, 'wb') as file:
    pickle.dump(log_reg, file)

In [47]:
# Save the tfidf vectorizer for use in the prediction algorithm

vectorizer_filename = "tfidf_vectorizer.pkl"

with open(vectorizer_filename, 'wb') as file:
    pickle.dump(tfidf_vect, file)

In [48]:
# Save the label encoder for use in the prediction algorithm

encoder_filename = "label_encoder.pkl"

with open(encoder_filename, "wb") as file:
    pickle.dump(le, file)