In [1]:
# Import the dependencies

import string
import pandas as pd
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from multiprocessing import Process, Value, Pool

### Load the data

#### First dataset

In [2]:
# Load the data from the first dataset

ds1_real_news = pd.read_csv("data/1/True.csv")
ds1_fake_news = pd.read_csv("data/1/Fake.csv")

#### Second Dataset

In [3]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

### Analysis and transformations

#### First dataset

In [4]:
# Show the first five rows of the dataset composed of ds1_real_news

ds1_real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [5]:
# Check the dimensions of the ds1_real_news dataframe

ds1_real_news.shape

(21417, 4)

In [6]:
# Check for null values in the ds1_real_news dataframe

ds1_real_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [7]:
# Show the first five rows of the dataset composed of ds1_fake_news

ds1_fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
# Check the dimensions of the ds1_fake_news dataframe

ds1_fake_news.shape

(23481, 4)

In [9]:
# Check for null values in the ds1_fake_news dataframe

ds1_fake_news.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [10]:
# Add a label column to the ds1_real_news dataframe
ds1_real_news["label"] = "REAL"

# Add a label column to the ds1_fake_news dataframe
ds1_fake_news["label"] = "FAKE"

# Merge the two datasets vertically
ds1_news = ds1_real_news.append(ds1_fake_news, ignore_index=True)
ds1_news.shape

(44898, 5)

In [11]:
ds1_news.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",REAL


In [12]:
# Remove the date and subject columns because it does not contain any useful information
ds1_news.drop(["date","subject"],axis=1,inplace=True)
ds1_news.columns

Index(['title', 'text', 'label'], dtype='object')

#### Second Dataset

In [13]:
# Load the data from the second dataset

ds2_news = pd.read_csv("data/2/news.csv")

In [14]:
# Show the first five rows of the second dataset composed of both real and fake news

ds2_news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [15]:
# Check the dimensions of the all_news_2 dataframe

ds2_news.shape

(6335, 4)

In [16]:
# Check for null values in the all_news_2 dataframe

ds2_news.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [17]:
# Drop the Unnamed: 0 column from the news dataframe

ds2_news.drop(columns=["Unnamed: 0"], inplace=True)
ds2_news.columns

Index(['title', 'text', 'label'], dtype='object')

In [18]:
# Merge the first dataset with the second dataset

all_news = ds1_news.append(ds2_news, ignore_index=True)
all_news.head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,REAL


In [19]:
# Show the dimensions of the merged dataset
all_news.shape

(51233, 3)

In [20]:
# Combine the title column and text column

all_news["article"] = all_news.apply(lambda x:x["title"] + x["text"], axis=1)
all_news.head()

Unnamed: 0,title,text,label,article
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,REAL,"As U.S. budget fight looms, Republicans flip t..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,REAL,U.S. military to accept transgender recruits o...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,REAL,Senior U.S. Republican senator: 'Let Mr. Muell...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,REAL,FBI Russia probe helped by Australian diplomat...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,REAL,Trump wants Postal Service to charge 'much mor...


In [21]:
# Drop the title column and text column

all_news.drop(["title","text"], axis=1, inplace=True)
all_news.head()

Unnamed: 0,label,article
0,REAL,"As U.S. budget fight looms, Republicans flip t..."
1,REAL,U.S. military to accept transgender recruits o...
2,REAL,Senior U.S. Republican senator: 'Let Mr. Muell...
3,REAL,FBI Russia probe helped by Australian diplomat...
4,REAL,Trump wants Postal Service to charge 'much mor...


In [22]:
# shuffle the rows of the all_news dataframe 10 times

for i in range(10):
    all_news = shuffle(all_news)

In [23]:
all_news.head()

Unnamed: 0,label,article
22803,FAKE,"Trump Defies Courts, Refuses To Release Guili..."
29241,FAKE,Biden In Mexico: ‘I Almost Feel Obliged To Ap...
36448,FAKE,TRUMP SUPPORTERS TAKE OUT THE TRASH: Radicals ...
7621,REAL,Democrats sue Trump for alleged voter intimida...
51095,REAL,"Dozens dead, including one American, as hostag..."


### Machine Learning

First, the two datasets need to be merged. This would be done in two steps:
- Add a **label** column to each dataset. The column will contain FAKE in the fake dataset and REAL in the real dataset.
- Vertically merge the dataframes, adding the true dataset to the end of the fake dataset.

In order to determine which words and sentences to use in the machine learning algorithm, the title and text columns have to be parsed into their component words. The words are then transformed into a simpler form, either by stemming, which involves truncating words (more or less), or lemmatization, which involves mapping each word to its grammatical source, eg/ bigger and biggest would be transformed to big, and see and saw would be transformed to see. The remaining words are then vectorized and then the vectorized dataset split up into a training set, to train a classification machine learning algorithm, and a test set, to test the predictions of the generated model.



### Processing text column

In [24]:
def to_lower_case(df,column,start,end):
    lc_df = df.loc[start:end]
    lc_df["lowercase"] = df.iloc[start:end][column].apply(lambda x: x.lower())
    
    return lc_df

In [25]:
def remove_punctuation(df,column,start,end):
    rp_df = df.loc[start:end]
    rp_df["lc_rp"] = df.iloc[start:end][column].apply(lambda x: "".join([char for char in x if char not in string.punctuation]))
    
    return rp_df

In [26]:
def remove_stopwords(df,column,start,end):
    nsw_df = df.loc[start:end]
    
    nsw_df["text_no_sw"] = df.iloc[start:end][column].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords.words("english")]))
    
    return nsw_df

In [27]:
#def clean_articles(df,column,start,end):
def clean_articles(args):
    df, column, start, end = args[0:4]
    
    # print(f"Cleaning articles: {start} to {end}")
    df = to_lower_case(df,"article",start,end)
    # print(df.head())
    df = remove_punctuation(df,"lowercase",start,end)
    # print(df.head())
    df = remove_stopwords(df,"lc_rp",start,end)
    # print(df.head())
    
    return df

In [28]:
all_news.reset_index(inplace=True)
all_news.index

RangeIndex(start=0, stop=51233, step=1)

In [29]:
all_news.shape[0]

51233

In [30]:
all_news = all_news.iloc[:113]

In [31]:
# Parallellize processing - Use 5 cores

batch_size = 100
rows = all_news.shape[0]
num_proc = rows // batch_size

args_list = []

for n in range(num_proc):
    start,end = n*batch_size, (n+1)*batch_size
    args = ( all_news, "article", start, end )
    args_list.append(args)

# print(f"rows_left = {rows % batch_size}")
    
if rows % batch_size > 0:
    start,end = (num_proc) * batch_size, rows
    args = ( all_news, "article", start, end )
    args_list.append(args)
    num_proc += 1
    
p = Pool(processes=num_proc)

data = p.map(clean_articles, args_list)
p.close()

cleaned = pd.DataFrame()

for df in data:
    cleaned = cleaned.append(df)
    
cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lc_df["lowercase"] = df.iloc[start:end][column].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nsw_df["text_no_sw"] = df.iloc[start:end][column].apply(lambda x: " ".join([word for word in x.split() if word not in stopwords.words("english")]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vie

Unnamed: 0,index,label,article,lowercase,lc_rp,text_no_sw
0,22803,FAKE,"Trump Defies Courts, Refuses To Release Guili...","trump defies courts, refuses to release guili...",trump defies courts refuses to release guilia...,trump defies courts refuses release guiliani m...
1,29241,FAKE,Biden In Mexico: ‘I Almost Feel Obliged To Ap...,biden in mexico: ‘i almost feel obliged to ap...,biden in mexico ‘i almost feel obliged to apo...,biden mexico ‘i almost feel obliged apologize’...
2,36448,FAKE,TRUMP SUPPORTERS TAKE OUT THE TRASH: Radicals ...,trump supporters take out the trash: radicals ...,trump supporters take out the trash radicals d...,trump supporters take trash radicals disrupt r...
3,7621,REAL,Democrats sue Trump for alleged voter intimida...,democrats sue trump for alleged voter intimida...,democrats sue trump for alleged voter intimida...,democrats sue trump alleged voter intimidation...
4,51095,REAL,"Dozens dead, including one American, as hostag...","dozens dead, including one american, as hostag...",dozens dead including one american as hostage ...,dozens dead including one american hostage sit...
...,...,...,...,...,...,...
108,26330,FAKE,Trump’s Shady Medical Note Was Written By A ‘...,trump’s shady medical note was written by a ‘...,,
109,37371,FAKE,Ten Things We’ve Learned from Hurricane Harvey...,ten things we’ve learned from hurricane harvey...,,
110,18921,REAL,Merkel welcomes 'a lot of material' from Macro...,merkel welcomes 'a lot of material' from macro...,,
111,31976,FAKE,MSNBC HOST Compares Getting Close to Trump to ...,msnbc host compares getting close to trump to ...,,


In [None]:
num_articles = all_news.shape[0]

step = 100
low = 0

for i in range(0,num_articles,step):
    high = i
    
    print(f"from: {low} to {high}")

    low = high
    
print(f"from: {low} to {num_articles}")

### Machine Learning

Since this dataset contains FAKE and REAL news articles, no merge step is required.

The remainging steps, including parsing, stemming or lemmatization, vectorization, and then classification machine learning, that were mentioned above, are all steps that would be applied to process this dataset. 

## APIs

The following three APIs will be used to stream news articles:

* Mediastack API (https://api.mediastack.com)
* Newsapi API (https://newsapi.org)
* NY Times API (https://api.nytimes.com)

For each of the APIs, there is a link (URL) which  is used to retrieve articles. To insert the articles into an SQL database, the response, which comprises the retrieved articles, has to be split up into individual articles which, using prepared statements, are inserted into the database. The process is automated by creating a continuously-running Python app to periodically (hourly/daily/weekly) retrieve apps from the news sites and populate the database. 

In [2]:
10 // 4

2