In [14]:
import pandas as pd
import numpy as np
import nltk
import re
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import train_test_split

# Data Preparation and Cleaning

In [15]:
df_fake = pd.read_csv('Fake.csv')
df_true = pd.read_csv('True.csv')
df_fake['true'] = 0
df_true['true'] = 1
df = pd.concat([df_fake, df_true])
df

Unnamed: 0,title,text,subject,date,true
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


#### First we need to identify all of the potential words in out data set. We throw out all stop words, numbers, and non-alphabetic charaters. The top 500 most frequent words become the columns for out dataframe.

In [2]:
wrds = {}
stop_words = set(stopwords.words('english'))
#Set up tokenizer to only grab ASCII characters
tokenizer = nltk.RegexpTokenizer(r"\w+")

for index, row in df.iterrows():
    text = row['text'].lower()
    #Filter for stop words
    text = re.sub(r'\d+', '', text)
    word_tokens = tokenizer.tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    
    #Go through each sig word in text and add frequency to dictionary of words
    for word in filtered_text:
        if '_' in word:
            continue
        if word in wrds:
            wrds[word] = wrds[word] + 1
        else:
            wrds[word] = 1 

wrds = sorted(wrds, key=wrds.get, reverse=True)

In [3]:
master_df = pd.DataFrame(0, index= df.index, columns = wrds[:500])
master_df

Unnamed: 0,trump,said,president,would,u,people,one,state,also,new,...,paul,authorities,bush,front,candidates,december,position,rule,army,capital
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21412,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21413,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21415,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Now we add the frequency of each word to its associated document.

In [None]:
for index, row in df.iterrows():
    text = row['text'].lower()
    #Filter for stop words
    text = re.sub(r'\d+', '', text)
    word_tokens = tokenizer.tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    
    for word in filtered_text:
        if word in wrds[:500]:
            master_df.loc[index, word] = master_df.loc[index, word] + 1 

In [4]:
#master_df.to_csv('500matrix.csv')
master_df = pd.read_csv('500matrix.csv', index_col=0)

Now we format the data as y and X and split between train and test sets

In [5]:
master_df
#Move truth column to be the first column
truth = master_df.pop('true')
master_df.insert(0,'true',truth)

In [11]:
data = master_df.to_numpy()
y = data[:,0]
X = data[:,1:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.93)
print(X_train.shape)

(3142, 500)


# Feature Engineering (Kernel Function)

#### Now we define our kerneling method. We chose to use the document similary kernal as a opposed to the cosine similarity because we're dealing with documents and this kernel will give better results. 

In [12]:
def tf(x):
    return np.log(1+x)

def idf(j):
    return np.log(j.size/(1+np.count_nonzero(j)))

def phi(x, idf):
    result = np.zeros(len(idf))
    for i in range(0,len(idf)):
        result[i] = x[i]*idf[i]
    return result
    
    
# M = np.matrix([[1,2,3,4],
#                [3,4,5,6],
#                [1,2,3,4],
#                [2,2,2,2],
#                [5,6,8,2]])

def doc_sim(M):
    #Apply tf to each element in M
    M = np.vectorize(tf)(M)
    #Create the inverse document frequency array- one value for each word
    idf_v = np.zeros(M.shape[1])
    for j in range (0,M.shape[1]):
        idf_v[j] = idf(M[:,j]) 

    #Create kernel matrix and fill it 
    K = np.zeros((M.shape[0], M.shape[0]))
    for i in range(0, K.shape[0]):
        #print(i)
        for j in range(0, K.shape[1]):
            phi_x = phi(M[i,:], idf_v)
            phi_y = phi(M[j,:], idf_v)
            K[i,j] = (np.transpose(phi_x)@phi_y)/(np.linalg.norm(phi_x)*np.linalg.norm(phi_y))

    return K


In [13]:
K = doc_sim(X_train)

#### and now our data is ready for clustering:

In [16]:
print(K)

[[1.         0.21140809 0.17215689 ... 0.19024861 0.18631429 0.17877855]
 [0.21140809 1.         0.09981792 ... 0.15476411 0.15472406 0.12132032]
 [0.17215689 0.09981792 1.         ... 0.12752245 0.13187313 0.21256131]
 ...
 [0.19024861 0.15476411 0.12752245 ... 1.         0.092153   0.05852533]
 [0.18631429 0.15472406 0.13187313 ... 0.092153   1.         0.0291086 ]
 [0.17877855 0.12132032 0.21256131 ... 0.05852533 0.0291086  1.        ]]
