In [2]:
# Import the dependencies

import string
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from textblob import TextBlob, Word
import spacy

from multiprocessing import Process, Value, Pool

import re

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import pickle

In [3]:
# Ignore warnings

import warnings
warnings.filterwarnings("ignore")

### Load the data

#### First dataset

In [4]:
# Load the data from the first dataset

ds1_real_news = pd.read_csv("data/1/True.csv")
ds1_fake_news = pd.read_csv("data/1/Fake.csv")

#### Second Dataset

In [5]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

### Analysis and transformations

#### First dataset

In [6]:
# Show the first five rows of the dataset composed of ds1_real_news

ds1_real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [7]:
# Check the dimensions of the ds1_real_news dataframe

ds1_real_news.shape

(21417, 4)

In [8]:
# Check for null values in the ds1_real_news dataframe

ds1_real_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [9]:
# Show the first five rows of the dataset composed of ds1_fake_news

ds1_fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [10]:
# Check the dimensions of the ds1_fake_news dataframe

ds1_fake_news.shape

(23481, 4)

In [11]:
# Check for null values in the ds1_fake_news dataframe

ds1_fake_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [12]:
# Add a label column to the ds1_real_news dataframe
ds1_real_news["label"] = "REAL"

# Add a label column to the ds1_fake_news dataframe
ds1_fake_news["label"] = "FAKE"

# Merge the two datasets vertically
ds1_news = ds1_real_news.append(ds1_fake_news, ignore_index=True)
ds1_news.shape

(44898, 5)

In [13]:
ds1_news.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",REAL


In [14]:
# Remove the date and subject columns because it does not contain any useful information
ds1_news.drop(["date","subject"],axis=1,inplace=True)
ds1_news.columns

Index(['title', 'text', 'label'], dtype='object')

#### Second Dataset

In [15]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

In [16]:
# Show the first five rows of the second dataset composed of both real and fake news

ds2_news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [17]:
# Check the dimensions of the all_news_2 dataframe

ds2_news.shape

(6335, 4)

In [18]:
# Check for null values in the all_news_2 dataframe

ds2_news.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [19]:
# Drop the Unnamed: 0 column from the news dataframe

ds2_news.drop(columns=["Unnamed: 0"], inplace=True)
ds2_news.columns

Index(['title', 'text', 'label'], dtype='object')

In [20]:
# Merge the first dataset with the second dataset

all_news = ds1_news.append(ds2_news, ignore_index=True)
all_news.head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,REAL


In [21]:
# Show the dimensions of the merged dataset
all_news.shape

(51233, 3)

In [22]:
# shuffle the rows of the all_news dataframe 10 times

for i in range(10):
    all_news = shuffle(all_news)

In [23]:
all_news.head()

Unnamed: 0,title,text,label
28467,Hooray For ‘Justice’: Coal CEO Sentenced To O...,"Six years ago, the Upper Big Branch mine in We...",FAKE
43651,Trump Administration Faces Flood of Lawsuits O...,21st Century Wire says President Trump s Execu...,FAKE
47465,Burnt homes and broken promises: the Jungle ev...,[Graphic: Calais street scene by Harriet Paint...,FAKE
4429,Democrats ask Trump to veto measure repealing ...,"WASHINGTON (Reuters) - In a last-ditch effort,...",REAL
12673,Czech PM designate: EU should not push us over...,PRAGUE (Reuters) - The designated Czech Prime ...,REAL


### Machine Learning

First, the two datasets need to be merged. This would be done in two steps:
- Add a **label** column to each dataset. The column will contain FAKE in the fake dataset and REAL in the real dataset.
- Vertically merge the dataframes, adding the true dataset to the end of the fake dataset.

In order to determine which words and sentences to use in the machine learning algorithm, the title and text columns have to be parsed into their component words. The words are then transformed into a simpler form, either by stemming, which involves truncating words (more or less), or lemmatization, which involves mapping each word to its grammatical source, eg/ bigger and biggest would be transformed to big, and see and saw would be transformed to see. The remaining words are then vectorized and then the vectorized dataset split up into a training set, to train a classification machine learning algorithm, and a test set, to test the predictions of the generated model.



### Processing text column

In [24]:
def to_lower_case(df,column):
    
    lowercase = df[column].apply(lambda x: x.lower())
    
    return lowercase

In [25]:
def join_chars(text):
    
    if not isinstance(text,str):
        print("This")
    # print(f"type: {type(text)} \noffending text --> {text}")
    
    joined = "".join([char for char in text if char not in string.punctuation])
    return joined
    # print(f"============================================> \n\n {joined}")                

In [26]:
def remove_punctuation(df,column):
    try:
        df["lc_rp"] = df[column].apply(lambda x: join_chars(x))
    except:
        print(f"Exception: {df}")
        
    return df

In [27]:
def remove_digits(df, column):
    digits = '0123456789'
    
    no_digits = df[column].apply(lambda l: "".join([x for x in l if str(x) not in digits]))
    
    return no_digits

In [28]:
# Insert a space after every period signifying the end of a sentence. Also insert a space between every comma.

def period_space(df, column):
    
    period_space = df[column].apply(lambda x: print(re.subn(r'(\s\w+[\.\,])(\w+\s)', '\\1 \\2', x, flags=re.IGNORECASE)))

    return period_space

In [29]:
def stop_words():
    sw1 = stopwords.words("english")
    sw2 = open("nlp/stop_words_english.txt").read().splitlines()

    return set(sw1 + sw2)

In [30]:
Word("youngest").lemmatize()

'youngest'

In [31]:
def remove_stopwords(df,column):
    text_no_sw = df[column].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words()]))
    
    return text_no_sw

In [32]:
stemmer = SnowballStemmer("english")

def stem(df, column):
    stemmed = df[column].apply(lambda l: " ".join(set([stemmer.stem(x) for x in l.split()])))
    
    return stemmed

In [33]:
stemmer.stem("arabic")

'arab'

In [34]:
# lemmatizer = WordNetLemmatizer()

def lemmatize(df,column):
    nlp = spacy.load('en')
#     lemmatized = df[column].apply(lambda l: " ".join(set([Word(x).lemmatize() for x in l.split()])))
    lemmatized = df[column].apply(lambda x: " ".join(set([token.lemma_ for token in nlp(x)])))
    
    return lemmatized

In [35]:
def clean_articles(args):
    
    df, column = args[0:2]
    df["lowercase"] = df[column].apply(lambda x: x.lower())
    df.drop(columns=[column],inplace=True)
    # df = remove_punctuation(df,"_".join([column,"lowercase"])
    df["add_spaces"] = df["lowercase"].apply(lambda x: re.subn(r'(\s\w+[\.\,])(\w+\s)', '\\1 \\2', x, flags=re.IGNORECASE)[0])
    
    df.drop(columns=["lowercase"],inplace=True)
    df["no_digits"] = remove_digits(df,"add_spaces")

    df.drop(columns=["add_spaces"],inplace=True)
    df["text_no_sw"] = remove_stopwords(df,"no_digits")
    
    df.drop(columns=["no_digits"],inplace=True)
    
    df["lemmatized"] = lemmatize(df,"text_no_sw")
    df.drop(columns=["text_no_sw"],inplace=True)

#     df["stemmed"] = stem(df,"text_no_sw")
#     df.drop(columns=["text_no_sw"],inplace=True)
 
#     df["lemmatized"] = lemmatize(df,"stemmed")
#     df.drop(columns=["stemmed"],inplace=True)
    
    
    return df

In [36]:
all_news.reset_index(inplace=True)

In [37]:
all_news.shape[0]

51233

all_news = all_news.iloc[:1000]

all_news = pd.DataFrame({
    "label": ["FAKE",'REAL'],
    "article": ["THEre ArE4 twenTY5 isn't","1074bradley AVE"]
})
clean_articles([all_news,"article"])

In [38]:
# Parallellize processing - Use 5 cores
# if __name__ == "__main__":

batch_size = 10000
rows = all_news.shape[0]
num_proc = rows // batch_size

args_list = []

for n in range(num_proc):
    start,end = n*batch_size, (n+1)*batch_size
    args = ( all_news.iloc[start:end], "text")
    args_list.append(args)

# print(f"rows_left = {rows % batch_size}")

if rows % batch_size > 0:
    start,end = (num_proc) * batch_size, rows
    args = ( all_news.iloc[start:end], "text")
    args_list.append(args)
    num_proc += 1

# print(num_proc)

p = Pool(processes=6)

data = p.map(clean_articles, args_list)
p.close()

cleaned = pd.DataFrame()

for df in data:
    cleaned = cleaned.append(df)


In [47]:
cleaned = cleaned[["lemmatized","label"]]
cleaned.to_csv("cleaned_dataframe.csv", sep='\t')

### Machine Learning

Since this dataset contains FAKE and REAL news articles, no merge step is required.

The remainging steps, including parsing, stemming or lemmatization, vectorization, and then classification machine learning, that were mentioned above, are all steps that would be applied to process this dataset. 

In [42]:
# Target Series creation

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(cleaned.label)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cleaned.lemmatize,y)  # Keep default split proportions

In [39]:
# Feature Matrix creation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()

X_model = tfidf_vect.fit(cleaned.lemmatized)
X_train = X_model.transform(cleaned.lemmatized)
X_test = X_model.transform(cleaned.lemmatized)

In [40]:
vocab = list(tfidf_vect.vocabulary_)
vocab_set = set(vocab)
file = open("vocab_set.txt","w")
for word in vocab_set:
    file.writelines("".join([word,"\n"]))
file.close()

In [41]:
len(vocab_set)

128065

X_train, X_test, y_train, y_test = train_test_split(X,y)  # Keep default split proportions

In [44]:
X_train.shape, y_train.shape

((38424, 128065), (38424,))

In [45]:
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

LogisticRegression()

In [46]:
y_pred = log_reg.predict(X_test)

In [47]:
confusion_matrix(y_test, y_pred)

array([[6482,  118],
       [ 228, 5981]])

In [53]:
# Save the ML model for use in the prediction algorithm

pkl_filename = "logistic_reg_model.pkl"

with open(pkl_filename, 'wb') as file:
    pickle.dump(log_reg, file)

## APIs

The following three APIs will be used to stream news articles:

* Mediastack API (https://api.mediastack.com)
* Newsapi API (https://newsapi.org)
* NY Times API (https://api.nytimes.com)

For each of the APIs, there is a link (URL) which  is used to retrieve articles. To insert the articles into an SQL database, the response, which comprises the retrieved articles, has to be split up into individual articles which, using prepared statements, are inserted into the database. The process is automated by creating a continuously-running Python app to periodically (hourly/daily/weekly) retrieve apps from the news sites and populate the database. 

In [48]:
cleaned.iloc[30000:40000]

Unnamed: 0,index,title,label,lemmatized
30000,7686,Clinton expresses concern about AT&T-Time Warn...,REAL,up look president reporter tie expect hillary ...
30001,26248,Shocking Ad Shows How Trump’s Racist Campaign...,FAKE,/ another doubt openly oregon surrogate trump ...
30002,13524,"U.S., Britain, France accused of snubbing anti...",REAL,surprised commit control medal december ambass...
30003,24218,"Wikipedia BRUTALLY Trashes Paul Ryan, Adds Hi...",FAKE,porifera shred behavior / pummel extremely spo...
30004,38249,BOMBSHELL: U.S.DEFENSE SECRETARY ADMITS Obama ...,FAKE,answer rep ralph utter release enemy light joh...
...,...,...,...,...
39995,25301,Trump Tries To MANIPULATE Nevada Vote And Get...,FAKE,dishonest play close district exceptionally cl...
39996,17253,"As China's leaders gather, market reform hopes...",REAL,key corporate dramatically power paradigm expl...
39997,20586,Factbox: Irma vs Andrew: How 2017's big hurric...,REAL,/ national destroy push friday km path mexico ...
39998,40830,MUST WATCH: Barack Obama After January 20th…Th...,FAKE,"december enough # ( steve ) anti , - house sym..."
