In [1]:
# Import the dependencies

import pickle
import re
import sqlite3
import string
from multiprocessing import Pool

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from textblob import TextBlob, Word

In [2]:
# Ignore warnings

import warnings
warnings.filterwarnings("ignore")

In [3]:
import scraping_config

### Load the data

In [4]:
# Read every (title, article) row from the configured table.
# The table name is interpolated from scraping_config because SQLite cannot
# bind identifiers as query parameters (presumably trusted project config).
conn = sqlite3.connect(scraping_config.db_file)
try:
    cursor = conn.cursor()
    try:
        cursor.execute("""
            SELECT title, article FROM {}
            """.format(scraping_config.db_table))
        raw_article_list = cursor.fetchall()
    finally:
        # Close the cursor even when the query raises.
        cursor.close()
finally:
    # Release the database handle; the rows are already in memory.
    conn.close()

In [5]:
# Load the raw articles into the articles dataframe.
# The column names are given at construction time so the cell also works when
# the query returned no rows (assigning two names to an empty, zero-column
# frame raises a length-mismatch ValueError).
articles = pd.DataFrame(raw_article_list, columns=["title", "article"])
articles.head()

Unnamed: 0,title,article
0,Robinhood's Tenev: 'We stand with the people m...,\nRobinhood CEO Vlad Tenev defended the stock ...
1,"Stock market news live updates: Stocks fall, l...","\nStocks sank Friday, closing out the week and..."
2,Robinhood's Tenev: 'We stand with the people m...,\nRobinhood CEO Vlad Tenev defended the stock ...
3,Robinhood's Tenev: 'We stand with the people m...,\nRobinhood CEO Vlad Tenev defended the stock ...
4,Stock market news live updates: Stock futures ...,"\nStocks sank Friday, closing out the week and..."


In [6]:
# Remove newline characters from the beginning of the article text.
# lstrip only removes leading "\n" characters, so an article that does not
# start with a newline keeps its first character and re-running this cell
# is a no-op (slicing with [1:] dropped one more character on every run).
articles["article"] = articles["article"].str.lstrip("\n")
articles.head()

Unnamed: 0,title,article
0,Robinhood's Tenev: 'We stand with the people m...,Robinhood CEO Vlad Tenev defended the stock tr...
1,"Stock market news live updates: Stocks fall, l...","Stocks sank Friday, closing out the week and m..."
2,Robinhood's Tenev: 'We stand with the people m...,Robinhood CEO Vlad Tenev defended the stock tr...
3,Robinhood's Tenev: 'We stand with the people m...,Robinhood CEO Vlad Tenev defended the stock tr...
4,Stock market news live updates: Stock futures ...,"Stocks sank Friday, closing out the week and m..."


In [7]:
articles.shape

(652, 2)

In [8]:
articles.isnull().sum()

title      0
article    0
dtype: int64

In [9]:
def to_lower_case(df,column):
    """Return the values of ``df[column]`` converted to lower case.

    Each value is lower-cased through its own ``.lower()`` method. The result
    is a new Series; ``df`` itself is not modified.
    """
    def _lower(value):
        return value.lower()

    return df[column].apply(_lower)

In [10]:
def join_chars(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Parameters
    ----------
    text : str or iterable of str
        The text to clean. A ``str`` is the expected input; any other
        iterable is filtered item by item and joined into one string.

    Returns
    -------
    str
        The input without ASCII punctuation. Characters outside
        ``string.punctuation`` are kept unchanged.
    """
    if isinstance(text, str):
        # Single pass over the string instead of a Python-level membership
        # test per character.
        return text.translate(str.maketrans("", "", string.punctuation))

    # Non-string iterables are filtered item by item, as before.
    return "".join([char for char in text if char not in string.punctuation])

In [11]:
def remove_punctuation(df,column):
    """Return ``df[column]`` with punctuation stripped from every value.

    Each value is passed through ``join_chars``. Any error raised while
    processing a value propagates to the caller unchanged, so the real cause
    is visible instead of being replaced by a failure on an unassigned
    result variable.
    """
    return df[column].apply(join_chars)

In [12]:
def remove_digits(df, column):
    """Return ``df[column]`` with the characters ``0``-``9`` removed from each value.

    The result is a new Series; ``df`` itself is not modified.
    """
    digits = '0123456789'

    def _strip_digits(text):
        kept = []
        for ch in text:
            if str(ch) in digits:
                continue
            kept.append(ch)
        return "".join(kept)

    return df[column].apply(_strip_digits)

In [13]:
# Insert a space after a period or comma that is immediately followed by the next word (no space in between).

# "".join()

def period_space(df, column):
    """Insert a space after a ``.`` or ``,`` that is directly followed by a word.

    NOTE: the substitution is done with ``subn``, so every element of the
    returned Series is a ``(new_text, substitution_count)`` tuple, not a
    plain string.
    """
    pattern = re.compile(r'(\s\w+[\.\,])(\w+\s)', flags=re.IGNORECASE)

    def _add_space(text):
        return pattern.subn('\\1 \\2', text)

    return df[column].apply(_add_space)

In [14]:
def stop_words():
    """Return the set of English stop words used to filter article text.

    The set is the union of NLTK's English stop-word list and the lines of
    ``nlp/stop_words_english.txt`` (path relative to the current working
    directory).
    """
    sw1 = stopwords.words("english")
    # The context manager closes the file handle as soon as it has been read.
    with open("nlp/stop_words_english.txt") as sw_file:
        sw2 = sw_file.read().splitlines()

    return set(sw1 + sw2)

In [15]:
def remove_stopwords(df,column):
    """Return ``df[column]`` with stop words removed from every value.

    Each value is split on whitespace, words found in ``stop_words()`` are
    dropped, and the remaining words are re-joined with single spaces.
    Matching is exact, so a word with attached punctuation (e.g. ``"over,"``)
    is not treated as a stop word.
    """
    # Build the stop-word set once per call. Evaluating stop_words() inside
    # the filter re-read the word file and rebuilt the set for every word.
    sw = stop_words()

    def _drop_stopwords(text):
        return " ".join(word for word in text.split() if word not in sw)

    return df[column].apply(_drop_stopwords)

In [31]:
def lemmatize(df,column):
    """Return ``df[column]`` with each text replaced by its distinct lemmas.

    Every value is run through the spaCy ``'en'`` pipeline. The lemmas of its
    tokens are collected into a set and joined with single spaces, so
    repeated lemmas collapse to one and the original word order is not kept.
    """
    nlp = spacy.load('en')

    def _unique_lemmas(text):
        return " ".join(set(token.lemma_ for token in nlp(text)))

    return df[column].apply(_unique_lemmas)

In [16]:
# Lower-case the article text
articles["article"] = to_lower_case(articles, "article")
articles.head()

Unnamed: 0,title,article
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended the stock tr...
1,"Stock market news live updates: Stocks fall, l...","stocks sank friday, closing out the week and m..."
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended the stock tr...
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended the stock tr...
4,Stock market news live updates: Stock futures ...,"stocks sank friday, closing out the week and m..."


In [17]:
# Character count of each article before stop-word removal
articles["len"] = articles["article"].map(len)

In [18]:
# Strip stop words from the article text and record how many characters that removed
articles["article"] = remove_stopwords(articles, "article")
articles["len2"] = articles["article"].map(len)
articles["lendiff"] = articles["len"].sub(articles["len2"])
articles

Unnamed: 0,title,article,len,len2,lendiff
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,3311,2287,1024
1,"Stock market news live updates: Stocks fall, l...","stocks sank friday, closing week month volatil...",10392,7329,3063
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,3311,2287,1024
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,3311,2287,1024
4,Stock market news live updates: Stock futures ...,"stocks sank friday, closing week month volatil...",10392,7329,3063
...,...,...,...,...,...
647,Election 2020: Investors fear that Trump may r...,"fight white house appears over, nightmare scen...",6062,4069,1993
648,DC locks down over concerns of violence as nat...,"washington — america votes tuesday, washington...",5717,3855,1862
649,Stock market news live updates: Stocks jump wi...,"stocks rose final stretch voting election day,...",9710,6772,2938
650,Stock market news live updates: Stock futures ...,"stocks rose final stretch voting election day,...",9710,6772,2938


In [19]:
# Drop the temporary length columns.
# Re-binding the result (instead of inplace=True) together with
# errors="ignore" lets this cell be re-run without a KeyError once the
# columns are already gone.
articles = articles.drop(columns=["len", "len2", "lendiff"], errors="ignore")
articles

Unnamed: 0,title,article
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
1,"Stock market news live updates: Stocks fall, l...","stocks sank friday, closing week month volatil..."
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
4,Stock market news live updates: Stock futures ...,"stocks sank friday, closing week month volatil..."
...,...,...
647,Election 2020: Investors fear that Trump may r...,"fight white house appears over, nightmare scen..."
648,DC locks down over concerns of violence as nat...,"washington — america votes tuesday, washington..."
649,Stock market news live updates: Stocks jump wi...,"stocks rose final stretch voting election day,..."
650,Stock market news live updates: Stock futures ...,"stocks rose final stretch voting election day,..."


In [20]:
# Strip punctuation characters from the article text
no_punct = remove_punctuation(articles, "article")
articles["article"] = no_punct
articles

Unnamed: 0,title,article
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
1,"Stock market news live updates: Stocks fall, l...",stocks sank friday closing week month volatile...
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...
4,Stock market news live updates: Stock futures ...,stocks sank friday closing week month volatile...
...,...,...
647,Election 2020: Investors fear that Trump may r...,fight white house appears over nightmare scena...
648,DC locks down over concerns of violence as nat...,washington — america votes tuesday washington ...
649,Stock market news live updates: Stocks jump wi...,stocks rose final stretch voting election day ...
650,Stock market news live updates: Stock futures ...,stocks rose final stretch voting election day ...


In [24]:
# Strip the digits 0-9 from the article text
articles["article"] = remove_digits(articles, "article")
articles

Unnamed: 0,title,article,article_period_space
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
1,"Stock market news live updates: Stocks fall, l...",stocks sank friday closing week month volatile...,(stocks sank friday closing week month volatil...
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
4,Stock market news live updates: Stock futures ...,stocks sank friday closing week month volatile...,(stocks sank friday closing week month volatil...
...,...,...,...
647,Election 2020: Investors fear that Trump may r...,fight white house appears over nightmare scena...,(fight white house appears over nightmare scen...
648,DC locks down over concerns of violence as nat...,washington — america votes tuesday washington ...,(washington — america votes tuesday washington...
649,Stock market news live updates: Stocks jump wi...,stocks rose final stretch voting election day ...,(stocks rose final stretch voting election day...
650,Stock market news live updates: Stock futures ...,stocks rose final stretch voting election day ...,(stocks rose final stretch voting election day...


In [25]:
# Make sure a period/comma glued to the next word is followed by a space.
# period_space yields (text, substitution_count) tuples; they are parked in a
# temporary column here.
spaced = period_space(articles, "article")
articles["article_period_space"] = spaced
articles

Unnamed: 0,title,article,article_period_space
0,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
1,"Stock market news live updates: Stocks fall, l...",stocks sank friday closing week month volatile...,(stocks sank friday closing week month volatil...
2,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
3,Robinhood's Tenev: 'We stand with the people m...,robinhood ceo vlad tenev defended stock tradin...,(robinhood ceo vlad tenev defended stock tradi...
4,Stock market news live updates: Stock futures ...,stocks sank friday closing week month volatile...,(stocks sank friday closing week month volatil...
...,...,...,...
647,Election 2020: Investors fear that Trump may r...,fight white house appears over nightmare scena...,(fight white house appears over nightmare scen...
648,DC locks down over concerns of violence as nat...,washington — america votes tuesday washington ...,(washington — america votes tuesday washington...
649,Stock market news live updates: Stocks jump wi...,stocks rose final stretch voting election day ...,(stocks rose final stretch voting election day...
650,Stock market news live updates: Stock futures ...,stocks rose final stretch voting election day ...,(stocks rose final stretch voting election day...


In [27]:
# Keep only the text part of each (text, substitution_count) tuple
articles["article"] = articles["article_period_space"].map(lambda pair: pair[0])

In [29]:
# Drop the temp article_period_space column.
# errors="ignore" keeps the cell re-runnable after the column is gone.
articles = articles.drop(columns=["article_period_space"], errors="ignore")

In [32]:
# Replace each article with the distinct lemmas of its words
articles["article"] = lemmatize(articles, "article")

In [33]:
# Character count of each article after lemmatization
articles["len"] = articles["article"].map(len)

Unnamed: 0,title,article,len
0,Robinhood's Tenev: 'We stand with the people m...,key short skyrocket billion ceo fully stem fin...,1404
1,"Stock market news live updates: Stocks fall, l...",include program ceo yield limit tick express p...,3289
2,Robinhood's Tenev: 'We stand with the people m...,key short skyrocket billion ceo fully stem fin...,1404
3,Robinhood's Tenev: 'We stand with the people m...,key short skyrocket billion ceo fully stem fin...,1404
4,Stock market news live updates: Stock futures ...,include program ceo yield limit tick express p...,3289
...,...,...,...
647,Election 2020: Investors fear that Trump may r...,lead appear include accept war count joe nevad...,2537
648,DC locks down over concerns of violence as nat...,lead free cash loudly kushner washington joe s...,2242
649,Stock market news live updates: Stocks jump wi...,lead transportation disaster careful ceo yield...,3065
650,Stock market news live updates: Stock futures ...,lead transportation disaster careful ceo yield...,3065


### Analysis and transformations

### Processing text column

In [31]:
stemmer = SnowballStemmer("english")

def stem(df, column):
    """Return ``df[column]`` with each text replaced by its distinct word stems.

    Every value is split on whitespace and each word is stemmed with the
    module-level English ``stemmer``. The stems are collected into a set and
    joined with single spaces, so duplicates collapse and word order is not
    kept.
    """
    def _unique_stems(text):
        return " ".join(set(stemmer.stem(word) for word in text.split()))

    return df[column].apply(_unique_stems)

In [32]:
stemmer.stem("arabic")

'arab'

In [33]:
# lemmatizer = WordNetLemmatizer()

def lemmatize(df,column):
    """Return ``df[column]`` with each text reduced to its set of spaCy lemmas.

    NOTE(review): this re-defines ``lemmatize`` with the same body as the
    definition earlier in the notebook; one of the two can probably go.

    The spaCy ``'en'`` pipeline is loaded on every call. For each value the
    token lemmas are gathered into a set and joined with single spaces, so
    duplicate lemmas are dropped and word order is not preserved.
    """
    nlp = spacy.load('en')

    def _distinct_lemmas(text):
        lemmas = set()
        for token in nlp(text):
            lemmas.add(token.lemma_)
        return " ".join(lemmas)

    return df[column].apply(_distinct_lemmas)

In [34]:
def clean_articles(args):
    """Clean one text column of a DataFrame and return the cleaned frame.

    Parameters
    ----------
    args : sequence
        ``(df, column)``; only the first two items are read. Taking a single
        argument lets the function be passed straight to ``Pool.map``.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column of ``df`` except ``column``, plus a
        ``"lemmatized"`` column holding the processed text.

    Processing order: lower-case, insert a space after a ``.``/``,`` that is
    glued to the next word, remove digits, remove stop words, lemmatize.
    Punctuation is not removed here.

    The caller's ``df`` is left untouched. It is often an ``iloc`` slice of a
    larger frame, and adding/dropping columns on such a slice in place is
    what triggers pandas' SettingWithCopy warnings.
    """
    df, column = args[0:2]

    text = df[column].apply(lambda x: x.lower())
    # re.subn returns (new_text, count); only the text is kept.
    text = text.apply(lambda x: re.subn(r'(\s\w+[\.\,])(\w+\s)', '\\1 \\2', x, flags=re.IGNORECASE)[0])

    # The helper functions take (frame, column), so the intermediate text is
    # staged in a scratch frame that shares df's index.
    stage = text.to_frame("text")
    stage["text"] = remove_digits(stage, "text")
    stage["text"] = remove_stopwords(stage, "text")

    cleaned = df.drop(columns=[column])
    cleaned["lemmatized"] = lemmatize(stage, "text")

    return cleaned

In [35]:
all_news.reset_index(inplace=True)

In [36]:
all_news.shape[0]

51233

# NOTE(review): this looks like scratch code. It truncates all_news and then
# replaces it with a two-row toy frame that has "label"/"article" columns,
# while the batching cell below reads a "text" column from all_news.
# Confirm whether it should be executed when running the notebook top to bottom.
all_news = all_news.iloc[:1000]

# Two hand-written rows used to exercise clean_articles on the "article" column.
all_news = pd.DataFrame({
    "label": ["FAKE",'REAL'],
    "article": ["THEre ArE4 twenTY5 isn't","1074bradley AVE"]
})
clean_articles([all_news,"article"])

In [37]:
# Parallelize processing: clean the "text" column in batches using a pool of
# 6 worker processes.

batch_size = 10000
rows = all_news.shape[0]

# One (frame_slice, column) tuple per batch. iloc clips at the end of the
# frame, so the final batch simply holds the remaining rows.
args_list = [
    (all_news.iloc[start:start + batch_size], "text")
    for start in range(0, rows, batch_size)
]
num_proc = len(args_list)

# The context manager shuts the pool down even if a worker raises.
with Pool(processes=6) as p:
    data = p.map(clean_articles, args_list)

# Concatenate once. DataFrame.append was removed in pandas 2.0 and re-copied
# the accumulated frame on every iteration.
cleaned = pd.concat(data) if data else pd.DataFrame()


In [38]:
cleaned

Unnamed: 0,index,title,label,lemmatized
0,26851,"Heading Into The Convention, Clinton Maintain...",FAKE,national / constantly trump black not setback ...
1,31714,IRONY! COLLEGE “ANTI-CAPITALIST” GROUP DISBAND...,FAKE,letter member unproblematized failure swarthmo...
2,12603,"China, Taiwan spar over Chinese diplomat's inv...",REAL,independence control / national monday party s...
3,44853,HIDDEN ORDER: Was the Death of Justice Scalia ...,FAKE,breathing detail member rarity / ap what key c...
4,45071,So That Happened: Did Obama Forget That The GO...,REAL,control -- evolve countertop compromise jason ...
...,...,...,...,...
51228,5320,Trump to spare U.S. 'dreamer' immigrants from ...,REAL,rapid detail meeting play name mexican migrant...
51229,41727,BREAKING…OBAMA’S WAR ON COPS: ANOTHER Cop Ambu...,FAKE,handgun.u.s ballwin historyin letter drug play...
51230,28612,Missouri Republican Laughed At For Saying He’...,FAKE,dollars law.plus columbia rep bible define qua...
51231,14970,Germany probes expenses at Paris embassy; no o...,REAL,letter party friday irregularities accounting ...


### Machine Learning

Since this dataset contains FAKE and REAL news articles, no merge step is required.

The remaining steps mentioned above (parsing, stemming or lemmatization, vectorization, and finally machine-learning classification) are all steps that would be applied to process this dataset.

In [39]:
# Feature Matrix creation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF feature matrix from the lemmatized article text.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split further down, so test documents influence the vocabulary
# and IDF weights. Confirm this is intended.
tfidf_vect = TfidfVectorizer()
X = tfidf_vect.fit_transform(cleaned["lemmatized"])

In [40]:
# Write the learned vocabulary to disk, one term per line.
vocab = list(tfidf_vect.vocabulary_)
vocab_set = set(vocab)
# "with" closes the file even if a write fails. writelines receives one line
# per term instead of a single string that it would iterate character by
# character.
with open("vocab_set.txt", "w") as file:
    file.writelines(word + "\n" for word in vocab_set)

In [41]:
len(vocab_set)

128065

In [42]:
# Target Series creation

from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers for the classifier
le = LabelEncoder()
y = le.fit_transform(cleaned["label"])

In [43]:
# Keep default split proportions. A fixed seed makes the split, and every
# metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [44]:
X_train.shape, y_train.shape

((38424, 128065), (38424,))

In [45]:
# Fit a logistic-regression classifier on the training split
log_reg = LogisticRegression()

log_reg.fit(X=X_train, y=y_train)

LogisticRegression()

In [46]:
y_pred = log_reg.predict(X_test)

In [47]:
confusion_matrix(y_test, y_pred)

array([[6482,  118],
       [ 228, 5981]])

In [53]:
# Save the ML model for use in the prediction algorithm
# NOTE(review): only log_reg is written here. Code that loads this model will
# presumably also need the fitted tfidf_vect (and le, to map predictions back
# to labels) to build matching features; confirm they are persisted elsewhere.

pkl_filename = "logistic_reg_model.pkl"

# Binary mode is required for pickle; the context manager closes the file.
with open(pkl_filename, 'wb') as file:
    pickle.dump(log_reg, file)

## APIs

The following three APIs will be used to stream news articles:

* Mediastack API (https://api.mediastack.com)
* Newsapi API (https://newsapi.org)
* NY Times API (https://api.nytimes.com)

For each of the APIs, there is a link (URL) which is used to retrieve articles. To insert the articles into an SQL database, the response, which comprises the retrieved articles, has to be split up into individual articles which, using prepared statements, are inserted into the database. The process is automated by creating a continuously-running Python app to periodically (hourly/daily/weekly) retrieve articles from the news sites and populate the database. 

In [48]:
cleaned.iloc[30000:40000]

Unnamed: 0,index,title,label,lemmatized
30000,7686,Clinton expresses concern about AT&T-Time Warn...,REAL,up look president reporter tie expect hillary ...
30001,26248,Shocking Ad Shows How Trump’s Racist Campaign...,FAKE,/ another doubt openly oregon surrogate trump ...
30002,13524,"U.S., Britain, France accused of snubbing anti...",REAL,surprised commit control medal december ambass...
30003,24218,"Wikipedia BRUTALLY Trashes Paul Ryan, Adds Hi...",FAKE,porifera shred behavior / pummel extremely spo...
30004,38249,BOMBSHELL: U.S.DEFENSE SECRETARY ADMITS Obama ...,FAKE,answer rep ralph utter release enemy light joh...
...,...,...,...,...
39995,25301,Trump Tries To MANIPULATE Nevada Vote And Get...,FAKE,dishonest play close district exceptionally cl...
39996,17253,"As China's leaders gather, market reform hopes...",REAL,key corporate dramatically power paradigm expl...
39997,20586,Factbox: Irma vs Andrew: How 2017's big hurric...,REAL,/ national destroy push friday km path mexico ...
39998,40830,MUST WATCH: Barack Obama After January 20th…Th...,FAKE,"december enough # ( steve ) anti , - house sym..."
