In [148]:
import csv
import os
import sys
import random
import string
import numpy as np
import pandas as pd
import pyspark
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.rdd import RDD
from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, size, max, abs

# Initialize a spark session
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

## Data Preprocessing 
- Data sampling 
- Data cleaning 
- Data preparation and formatting
- Feature extraction

#### Imbalanced Sampling of Data
Label the datasets, remove unncessary fields, sample and save the samples

In [149]:
# Label the data sets, remove unncessary fields, sample and save the samples

# This imbalanced sample will favor the fake label with a ratio of 2 to 1
fake = pd.read_csv("../data/Fake.csv")
fake['label'] = 'Fake'
fake = fake.drop(columns=["date", "subject"])
fake = fake.sample(21000)
true = pd.read_csv("../data/True.csv")
true['label'] = 'True'
true = true.drop(columns=["date", "subject"])
true = true.sample(10000)

# Uncomment and run the cell to save the sample
# fake.to_csv("../data/Imbalanced_Sample1_Fake.csv", index=False, header=None)
# true.to_csv("../data/Imbalanced_Sample1_True.csv", index=False, header=None)

# This imbalanced sample favors the true label with ratio of 2 to 1
fake = pd.read_csv("../data/Fake.csv")
fake['label'] = 'Fake'
fake = fake.drop(columns=["date", "subject"])
fake = fake.sample(11000)
true = pd.read_csv("../data/True.csv")
true['label'] = 'True'
true = true.drop(columns=["date", "subject"])
true = true.sample(20000)

# Uncomment and run the cell to save the sample
# fake.to_csv("../data/Imbalanced_Sample2_Fake.csv", index=False, header=None)
# true.to_csv("../data/Imbalanced_Sample2_True.csv", index=False, header=None)

#### Balanced Sampling of Data
Label the datasets, remove unncessary fields, sample and save the samples

In [150]:
# 10% balanced sample
fake = pd.read_csv("../data/Fake.csv")
fake['label'] = 'Fake'
fake = fake.drop(columns=["date", "subject"])
fake = fake.sample(2300)
true = pd.read_csv("../data/True.csv")
true['label'] = 'True'
true = true.drop(columns=["date", "subject"])
true = true.sample(2200)

# Uncomment and run the cell to save the sample
# fake.to_csv("../data/Balanced_Sample1_Fake.csv", index=False, header=None)
# true.to_csv("../data/Balanced_Sample1_True.csv", index=False, header=None)

# Largest(~ 40000) balanced sample
fake = pd.read_csv("../data/Fake.csv")
fake['label'] = 'Fake'
fake = fake.drop(columns=["date", "subject"])
fake = fake.sample(22000)
true = pd.read_csv("../data/True.csv")
true['label'] = 'True'
true = true.drop(columns=["date", "subject"])
true = true.sample(21000)

# Uncomment and run the cell to save the sample
# fake.to_csv("../data/Balanced_Sample2_Fake.csv", index=False, header=None)
# true.to_csv("../data/Balanced_Sample2_True.csv", index=False, header=None)

#### Collecting Stop Words
Stop words were extracted from nltk, python string module and the articles themselves

In [151]:
with open('../data/stop_punc.txt', 'r') as file:
    stop_punc = file.read()
stop_punc = stop_punc.split(',')
stop_punc = list(set(stop_punc))
stop_punc.append(',')

#### Data Cleaning and Preparation

In [164]:
fake_file = '../data/Balanced_Sample2_Fake.csv'
true_file = '../data/Balanced_Sample2_True.csv'

# Filtering datapoints with missing features
fake_rdd = spark.read.csv(fake_file).rdd
fake_rdd = fake_rdd.filter(lambda x: x[0] is not None and x[1] is not None).map(lambda x: (x[0] + ' ' + x[1], x[2])).filter(lambda x: x[1]=='Fake')
num_fake = fake_rdd.count()
true_rdd = spark.read.csv(true_file).rdd
true_rdd = true_rdd.filter(lambda x: x[0] is not None and x[1] is not None).map(lambda x: (x[0] + ' ' + x[1], x[2])).filter(lambda x: x[1]=='True')
num_true = true_rdd.count()

# Tokenizing articles and removing stop words from the article
tokenizer = Tokenizer(inputCol="article", outputCol="words")
fake_rdd = fake_rdd.map(lambda x: Row(article=x[0], label=x[1]))
fake_df = spark.createDataFrame(fake_rdd)
fake_df = tokenizer.transform(fake_df)
fake_rdd = fake_df.rdd.map(lambda x: (x[0], [i for i in x[2] if i not in stop_punc], x[1]))
true_rdd = true_rdd.map(lambda x:Row(article=x[0], label=x[1]))
true_df = spark.createDataFrame(true_rdd)
true_df = tokenizer.transform(true_df)
true_rdd = true_df.rdd.map(lambda x: (x[0], [i for i in x[2] if i not in stop_punc], x[1]))

#### Feature Extraction with TFIDF

In [165]:
num_features = 10

fake_rdd = fake_rdd.map(lambda x: (x[1], x[2])).map(lambda x: Row(words=x[0], label=x[1]))
fake_df = spark.createDataFrame(fake_rdd)
hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=num_features)
fake_df = hashingTF.transform(fake_df)
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(fake_df)
fake_df = idfModel.transform(fake_df)
fake_rdd = fake_df.rdd.map(lambda x: (x[3], x[1])).map(lambda x: ([np.take(x[0], i) for i in range(np.size(x[0]))], x[1]))

true_rdd = true_rdd.map(lambda x: (x[1], x[2])).map(lambda x: Row(words=x[0], label=x[1]))
true_df = spark.createDataFrame(true_rdd)
hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=num_features)
true_df = hashingTF.transform(true_df)
idf = IDF(inputCol='rawFeatures', outputCol='features')
idfModel = idf.fit(true_df)
true_df = idfModel.transform(true_df)
true_rdd = true_df.rdd.map(lambda x: (x[3], x[1])).map(lambda x: ([np.take(x[0], i) for i in range(np.size(x[0]))], x[1]))

fake = fake_rdd.collect()
true = true_rdd.collect()

### Defining Training and Test Sets Using KFold Cross Validation

In [166]:
kf = KFold(n_splits=5)
fake_data = []
for train_index, test_index in kf.split(fake):
    train = []
    test = []
    for i in train_index:
        train.append(fake[i])
    for i in test_index:
        test.append(fake[i])
    fake_data.append((train, test))
true_data = []
for train_index, test_index in kf.split(true):
    train = []
    test = []
    for i in train_index:
        train.append(true[i])
    for i in test_index:
        test.append(true[i])
    true_data.append((train, test))
data = []
for i in range(len(fake_data)):
    true_data[i][0].extend(fake_data[i][0])
    true_data[i][1].extend(fake_data[i][1])
data = true_data

### Classification Using KNN Classifier

In [167]:
knn = KNeighborsClassifier(n_neighbors=5)
accuracy = []
predict = []
f1 = []
for i in range(len(data)):
    knn.fit([j[0] for j in data[i][0]], [j[1] for j in data[i][0]])
    accuracy.append(knn.score([j[0] for j in data[i][1]], [j[1] for j in data[i][1]]))
    predict.append(knn.predict([j[0] for j in data[i][1]]))
for i in range(len(predict)):
    f1.append(f1_score([j[1] for j in data[i][1]], predict[i].tolist(), pos_label="True"))
average_f1 = sum(f1)/len(f1)

# KNN Result
print('Classification of', num_fake + num_true, 'datapoints containing')
print(num_fake, 'fake datapoints')
print(num_true, 'true datapoints')
print('Using 5NN classifier and 5fold cross validation resulted in average f1 score of', average_f1)

Classification of 41804 datapoints containing
20896 fake datapoints
20908 true datapoints
Using 5NN classifier and 5fold cross validation resulted in average f1 score of 0.9932783433773587


### Classification Using Random Forest Classifier

In [168]:
num_trees = 40
max_depth = 20
rf = RandomForestClassifier(n_estimators=num_trees, max_depth=max_depth)
accuracy = []
predict = []
f1 = []
for i in range(len(data)):
    rf.fit([j[0] for j in data[i][0]], [j[1] for j in data[i][0]])
    accuracy.append(rf.score([j[0] for j in data[i][1]], [j[1] for j in data[i][1]]))
    predict.append(rf.predict([j[0] for j in data[i][1]]))
for i in range(len(predict)):
    f1.append(f1_score([j[1] for j in data[i][1]], predict[i].tolist(), pos_label="True"))
average_f1 = sum(f1)/len(f1)

# Result
print('Classification of', num_fake + num_true, 'datapoints containing: ')
print(num_fake, 'fake datapoints')
print(num_true, 'true datapoints')
print('Using random forest classifier with', num_trees, 'trees, max depth of', max_depth, 'and 5fold cross validation resulted in average f1 score of', average_f1)

Classification of 41804 datapoints containing: 
20896 fake datapoints
20908 true datapoints
Using random forest classifier with 40 trees, max depth of 20 and 5fold cross validation resulted in average f1 score of 0.9998086925268777
