# Text Mining Project

In [1]:
import pandas as pd
import numpy as np
import math

true_csv = pd.read_csv('./dataset/True.csv')
fake_csv = pd.read_csv('./dataset/Fake.csv')

In [2]:
true_csv.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [3]:
fake_csv.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
N_true = len(true_csv)
print(N_true)

21417


In [5]:
N_fake = len(fake_csv)
print(N_fake)

23481


## Split Data into Training and Testing Data

In [6]:
def getSubset(ind, L, leng): # L: dataframe
    cnt = 0
    K = [[] for i in range(leng)]
    for i in range(len(L)): 
        if i in ind:
            K[cnt] = L[i]
            cnt = cnt + 1
    return K;

In [7]:
import random
rand_N = random.randint(1000, 5000) # total size of test data
rand_true = random.randint(int(rand_N * 0.4), int(rand_N * 0.6)) # size of true
rand_fake = rand_N - rand_true # size of fake
ind_true_test = random.sample(list(range(N_true)), rand_true) # indices of true test
ind_fake_test = random.sample(list(range(N_fake)), rand_fake) # indices of fake test
ind_true_train = list(set(list(range(N_true))) - set(ind_true_test)) # indices of true train
ind_fake_train = list(set(list(range(N_fake))) - set(ind_fake_test)) # indices of fake train

true_train = true_csv[true_csv.index.isin(ind_true_train)]['text']
fake_train = fake_csv[fake_csv.index.isin(ind_fake_train)]['text']
true_test = true_csv[true_csv.index.isin(ind_true_test)]['text']
fake_test = true_csv[true_csv.index.isin(ind_fake_test)]['text']

In [8]:
# testing
R = [[], [], [], []]

for i in range(2000):
    R[0].append(true_csv.iloc[i])
    R[1].append(fake_csv.iloc[i])
    
for i in range(500):
    R[2].append(true_csv.iloc[i])
    R[3].append(fake_csv.iloc[i])
    
# for i in range(4):
#     print(len(R[i]))

2000
2000
500
500


In [9]:
for i in range(2000):
    R[0][i] = R[0][i][1]
    R[1][i] = R[1][i][1]
    
for i in range(500):
    R[2][i] = R[2][i][1]
    R[3][i] = R[3][i][1]

In [10]:
all_doc = []
for i in range(2000):
    all_doc.append(R[0][i])
for i in range(2000):
    all_doc.append(R[1][i])
for i in range(500):
    all_doc.append(R[2][i])
for i in range(500):
    all_doc.append(R[3][i])

In [11]:
true_train.head()

0    WASHINGTON (Reuters) - The head of a conservat...
1    WASHINGTON (Reuters) - Transgender people will...
2    WASHINGTON (Reuters) - The special counsel inv...
4    SEATTLE/WASHINGTON (Reuters) - President Donal...
5    WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...
Name: text, dtype: object

In [12]:
true_list = true_train.tolist()
fake_list = fake_train.tolist()

In [13]:
N_true_train = len(true_list)
print("# true = ", N_true_train)
N_fake_train = len(fake_list)
print("# fake = ", N_fake_train)

# true =  18872
# fake =  21503


## Training

### Processing Training Data

#### Lowercase & split

In [14]:
def lower_split(L): # L: list
    n = len(L)
    
    # lower
    for i in range(n):
        L[i] = L[i].lower()
    
    # split
    punctuation_list = [".,!?\'\"-{}[]():;`@#$%^&*~+=/|<>_"]
    token_list = [[] for i in range(n)]
    for i in range(n):
        for char in punctuation_list:
            L[i] = L[i].replace(char, " ")
        token_list[i] = L[i].split()
        k = len(token_list[i])
        for j in range(k):
            if len(token_list[i][k - j - 1]) <= 3 or token_list[i][k - j - 1].isalpha() == False:
                token_list[i].pop(k - j - 1)
        
    return token_list;

In [15]:
t_token_list = lower_split(true_list)
f_token_list = lower_split(fake_list)

#### Delete Stop words

In [16]:
swFile = open('stoplist.txt', 'r')
stopWords = swFile.read().splitlines()

In [17]:
stopwords_list = list(set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 
                  'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 
                  'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 
                  'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 
                  'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 
                  'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 
                  'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 
                  'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 
                  'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn',
                  "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
                  "weren't", 'won', "won't", 'wouldn', "wouldn't"]))
stopwords_list = set(stopWords + stopwords_list)

In [18]:
def stopwords(L, stopwords_list):
    n = len(L)
    stop_list = []

    for i in range(n):
        temp_list = []
        for j in range(len(L[i])):
            if(L[i][j] not in stopwords_list):
                temp_list.append(L[i][j])
        stop_list.append(temp_list)
    
    return stop_list;

In [19]:
t_stop_list = stopwords(t_token_list, stopwords_list)
f_stop_list = stopwords(f_token_list, stopwords_list)

#### PorterStemmer

In [20]:
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [21]:
def stem(L):
    n = len(L)
    S = [[] for i in range(n)]
    
    for i in (range(n)):
        S[i] = [ps.stem(L[i][j]) for j in range(len(L[i]))]

    return S;

In [22]:
t_list_of_list = stem(t_stop_list)
f_list_of_list = stem(f_stop_list)

#### Delete Stop words

In [23]:
t_train = stopwords(t_list_of_list, stopwords_list)
f_train = stopwords(f_list_of_list, stopwords_list)

In [24]:
train = [] # true and fake mixed together
train.extend(t_train)
train.extend(f_train)

In [25]:
def Labels(D, N_true_train) :
    labels = [[] for i in range(len(D))]
    for i in range(len(D)):
        if i < N_true_train:
            labels[i] = 1
        else:
            labels[i] = 0
    return labels

### Feature Selection

#### Select Top k Features

In [28]:
def FeatLargestVal(L, k):
    L = L[L.index < k]
    return L;

#### Contingency Table

In [29]:
n_t = N_true_train + N_fake_train
n_c = 2

In [30]:
def Contingency(df_all, df_class, n_d): # df of a term in collection/ in the class/ docs num in class
    cont = np.zeros((3, 3))
    '''
    cont:             a term t
                present      absent      total
    on-topic    [0][0]       [0][1]       n_d
    off-topic   [1][0]       [1][1]     n_t - n_d
    total         S          n_t - S      n_t
    '''
    cont[0][2] = n_d
    cont[0][0] = df_class
    cont[0][1] = cont[0][2] - cont[0][0]
    
    cont[1][2] = n_t - n_d 
    cont[1][0] = df_all - cont[0][0]
    cont[1][1] = cont[1][2] - cont[1][0]
    
    cont[2][0] = df_all # df_all
    cont[2][1] = cont[0][1] + cont[1][1]
    cont[2][2] = cont[0][2] + cont[1][2]
    
    return cont;

#### Methods of Feature Selection

#### 1. Chi-square Feature Selection

In [31]:
def ChiSquare(df_all, df_class, n_d): # df of D / df of class C / docs num in class
    N = Contingency(df_all, df_class, n_d)
    E = np.zeros((2, 2))
    '''
    cont:             a term t
                present      absent      total
    on-topic    [0][0]       [0][1]     n_d = 
    off-topic   [1][0]       [1][1]     n_t - n_d
    total         S          n_t - S       n_t
    '''
    for i in range(2):
        for j in range(2):
            E[i][j] = n_t * (N[i][2] / n_t) * (N[2][j] / n_t)
    
    chi = float(0)
    for i in range(2):
        for j in range(2):
            chi += (N[i][j] - E[i][j]) ** 2 / E[i][j]
    
    return chi;

#### 2. Expected Mutual Information Feature Selection

In [32]:
def EMI(df_all, df_class, n_d): # df of D / df of class C / docs num in class
    N = Contingency(df_all, df_class, n_d)
    I = float(0)
    for i in range(2):
        for j in range(2):
            # smoothing!
            P_tc = (N[i][j] + 0.5) / (N[2][2] + 2)
            P_t = (N[2][j] + 1) / (N[2][2] + 2)
            P_c = (N[i][2] + 1) / (N[2][2] + 2)
            I = I + P_tc * np.log(P_tc / (P_t * P_c))
    
    return I;

#### 3. Log Likilihood Rate Feature Selection

In [33]:
def LLR(df_all, df_class, n_d): # df of D / df of class C / docs num in class
    N = Contingency(df_all, df_class, n_d)
    E = np.zeros((2, 2))
    '''
    cont:             a term t
                present      absent      total
    on-topic    [0][0]       [0][1]       n_d
    off-topic   [1][0]       [1][1]     n_t - n_d
    total         S          n_t - S      n_t
    '''
    N00 = N[1][1]
    N01 = N[0][1]
    N10 = N[1][0]
    N11 = N[0][0]
    N = N11+N10+N01+N00
    
    likelihood = float(0)
    likelihood = -2 * np.log ((((N11+N01)/N)**N11 * (1- (N11+N01)/(N))**N10 * ((N11+N01)/N)**N01 * (1- (N11+N01)/(N))**N00) / ((N11/(N11+N10))**N11 * (1-N11/(N11+N10))**N10 * (N01/(N01+N00))**N01 * (1-N01/(N01+N00))**N00))
    
    return likelihood;

#### Document Frequency

In [34]:
def DocFreqInClass(V, c): # V: train_df/ c = -1: for whole collection, else: c = 1 -> true, c = 0 -> false
    if c == 1:
        txt_C = V[V['label'] == 1]['terms'].tolist()
    elif c == 0:
        txt_C = V[V['label'] == 0]['terms'].tolist()
    else:
        txt_C = V['terms'].tolist()
    n = len(txt_C)
    termsUniq = []

    # make words unique in each document
    for i in (range(n)):
        tUniq = list(set(txt_C[i]))
        termsUniq.extend(tUniq)
    
    # compute df for the whole collection
    
    termsDF = pd.DataFrame(pd.Series(termsUniq).value_counts().to_frame('df')).reset_index()
    termsDF.columns = ['term', 'df']
    termsDF = termsDF.sort_values(by = 'df', ascending = False).reset_index().drop(columns = ['index'])
    return termsDF; # a dataframe

#### Single Feature Selection

In [35]:
def SelectFeatures(V, c, k, typ):
    # V: train_df / c: int (class)/ k: int (# of features)/ typ: int (type of selection)
    # c: 1 -> true, 0 -> fake
    # typ: 1 -> chi, 2 -> EMI, 3 -> LLR # single
    V_uniq = DocFreqInClass(V, -1) # unique terms in the whole training set
    txt_C = DocFreqInClass(V, c) # unique terms in class c
    n_v = len(txt_C) # number of unique terms in class c
    n_d = len(V[V['label'] == c])
    term = txt_C['term'].tolist()
    df_class = txt_C['df'].tolist()
    df_all = V_uniq['df'][V_uniq['term'].isin(txt_C['term'])].tolist()
    A = np.zeros(n_v)
    cArr = [c for i in range(k)]
    ind = [i for i in range(k)]
    L = pd.DataFrame()
    if typ == 1:
        for t in range(n_v):
            A[t] = ChiSquare(df_all[t], df_class[t], n_d) # n_d?
        L_cand = pd.DataFrame({'term': term, 'chi-sq': A}).sort_values(by = 'chi-sq', ascending = False).reset_index().drop(columns = ['index', 'chi-sq'])
        L = FeatLargestVal(L_cand, k);
    elif typ == 2:
        for t in range(n_v):
            A[t] = EMI(df_all[t], df_class[t], n_d)
        L_cand = pd.DataFrame({'term': term, 'EMI': A}).sort_values(by = 'EMI', ascending = False).reset_index().drop(columns = ['index', 'EMI'])
        L = FeatLargestVal(L_cand, k);
    elif typ == 3:
        for t in range(n_v):
            A[t] = LLR(df_all[t], df_class[t], n_d)
        L_cand = pd.DataFrame({'term': term, 'LLR': A}).sort_values(by = 'LLR', ascending = False).reset_index().drop(columns = ['index', 'LLR'])
        L = FeatLargestVal(L_cand, k);
    
    L.insert(1, 'class', cArr, True)
    return L;

#### Double Feature Selection

In [36]:
def SelectDoubleFeatures(V, c, k, typ): # typ: 1 -> chi, 2 -> EMI, 3 -> LLR
    V_uniq = DocFreqInClass(V, -1) # unique terms in the whole training set
    txt_C = DocFreqInClass(V, c) # unique terms in class c
    n_v = len(txt_C)
    term = txt_C['term']
    A = np.zeros((2, n_v)) # 2d: chi-square, EMI
    cArr = [c for i in range(k)]
    ind = [i for i in range(k)]
    for t in range(n_v):
        A[0][t] = ChiSquare(V_uniq, txt_C, term[t]) # A[0]: chi-sq
        A[1][t] = EMI(V_uniq, txt_C, term[t]) # A[1]: EMI
    
    L_cand = [[] for i in range(2)] # chi-sq, EMI
    L_cand[0] = pd.DataFrame({'term': term, 'chi-sq': A[0]}).sort_values(by = 'chi-sq', ascending = False).reset_index().drop(columns = ['index'])
    L_cand[1] = pd.DataFrame({'term': term, 'EMI': A[1]}).sort_values(by = 'EMI', ascending = False).reset_index().drop(columns = ['index'])
    
    L_top = pd.DataFrame()
    temp = [[] for i in range(2)]
    for i in range(2):
        temp[i] = L_cand[i].loc[:int(n_v * 0.2)] # top 20%
    L_top = pd.merge(temp[0], temp[1], how = 'outer', on = ['term']).sort_values(by = 'chi-sq', ascending = False).reset_index().drop(columns = ['index'])
    # union
    
    L_last = pd.DataFrame()
    for i in range(2):
        temp[i] = L_cand[i].loc[int(n_v * 0.2):]
    L_last = pd.merge(temp[0], temp[1], how = 'inner', on = ['term']).sort_values(by = 'EMI', ascending = False).reset_index().drop(columns = ['index'])
    # intersection
    
    L = pd.DataFrame()
    L_top = L_top.drop(columns = ['chi-sq', 'EMI'])
    if len(L_top) >= k: # L_top >= k terms
        L = FeatLargestVal(L_top, k)
    else:
        L_last = L_last.drop(columns = ['chi-sq', 'EMI'])
        L = pd.merge(L_top, L_last, how = 'outer', on = ['term']).reset_index().drop(columns = ['index'])
        L = FeatLargestVal(L, k)

    L['class'] = cArr
    return L;

#### Perform Feature Selection

In [37]:
def FeatureSelection(train_df, k, typ) :
    final = [[] for i in range(2)] # a list of dataframe
    for i in range(2):
        L = SelectFeatures(train_df, i, k, typ)
        final[i] = L
    final_all = pd.DataFrame()
    for i in range(2):
        k_1 = (k/2)-1
        final_all = final_all.append(final[i].loc[:k_1])
    final_all = final_all.reset_index().drop(columns = ['index'])
    ind = [i for i in range(k)]
    final_all.insert(2, 'i', ind, True)
    return final_all

### Represent Documents as Vectors

In [39]:
def DocToVector(D, f, is_training): #D: list of doc/ f: final feature/ is_training: bool
    #temp = []
    #if is_training == True:
        #temp = train
    #else:
        #temp = test
        
    df = []
    for i in range(len(f)):
        term = f.at[i,'term']
        CNT = 0
        for j in range(len(D)):
            if term in D[j]:
                CNT += 1
        df.append(CNT)
    
    vecDoc = [[] for i in range(len(D))]
    for i in range(len(D)):
        vecDoc[i] = [float(0) for j in range(len(f))]
        for j in range(len(f)):
            tf = D[i].count(f.at[j,'term'])
            ### smoothing? df = 0, division by zero
            if df[j] != 0:
                tfIdf = tf * np.log(len(D) / df[j])
            else:
                tfIdf = tf * np.log(len(D)) # df = 1
            vecDoc[i][j] = tfIdf
        # normalize
        v = np.array(vecDoc[i])
        l = float(1)
        if np.sum(v) != 0:
            l = np.sqrt(np.sum(v ** 2))
        vecDoc[i] = v * (1/l)
    return(vecDoc)

### Training Methods

#### 1. kNN

In [40]:
def ComputeNN(D, k, d):
    # D: list of numpy TF-IDF unit vectors of each training doc (dimension = # of features)
    # d: numpy TF-IDF unit vector of a testing doc (dimension = # of features)
    n = len(D)
    vecs = D['vec'].tolist()
    # assume vec and d are normalized
    cos = np.array([np.dot(vecs[i], d) for i in range(n)])
    ind = [i for i in range(n)]
    df = pd.DataFrame({'ind': ind, 'cos': cos}).sort_values(by = 'cos', ascending = False).reset_index().drop(columns = ['index'])
    kNN = FeatLargestVal(df, k)['ind'].tolist()
    return kNN

In [41]:
def ArgMaxScore(score): # np array of scores
    m = np.amax(score)
    argmax = list(score).index(m) # index
    return argmax

In [42]:
def ApplyKNN(D, k, d): # k = 3 or 5: common
    # D: dataframe of TF-IDF vectors + labels of training docs/ k: parameter for kNN/ d: TF-IDF vector of a doc
    Sk = ComputeNN(D, k, d) # list of k neighbors (index)
    p = np.zeros(2)
    for i in range(len(Sk)):
        if D.at[Sk[i], 'label'] == 1: # true
            p[1] = p[1] + 1
        else: # fake
            p[0] = p[0] + 1 
    p = p * (1/k)
    c = ArgMaxScore(p)
    return c # class: 1 or 0

#### 2. Naive Base Classification

In [43]:
def count_tct(text_c, t) :
    num = 0
    for term in text_c :
        if t == term :
            num += 1
    return num
def concatenate_text_of_all_docs(D) :
    text_c = []
    for i in range(len(D)) :
        text_c.extend(D[i])
    return text_c 

In [44]:
# D : training documents
# f : features
def TrainNB(t_train, f_train, f, N_true_train, N_fake_train) :
    prior = []
    condprob = []
    #print(N_fake_train)
    #print(N_true_train)
    for i in range(2) :
        if i == 0 :
            class_doc_num = N_fake_train
            D = f_train
            feature = f["term"].loc[:399]
        else :
            class_doc_num = N_true_train
            D = t_train
            feature = f["term"].loc[400:799]
        prior.append(class_doc_num / (N_fake_train + N_true_train))
        temp_tct = []
        temp_con = []
        total = 0
        text_c = concatenate_text_of_all_docs(D)
        for t in feature :
            tct = count_tct(text_c, t)
            temp_tct.append(tct)
            total += tct
        for t in feature :
            tct = count_tct(text_c, t)
            temp_con.append((tct+1) / (total + len(feature)))
        d = {"term" : feature , "condprob" : temp_con}
        condprob.append(pd.DataFrame(data = d))
    return prior, condprob

In [45]:
# W : tokens in document
# f : feature
def ApplyNB(W, prior, condprob, f) :
    score = []
    
    for i in range(2) :
        if i == 0 :
            feature = f["term"].loc[:399]
        else :
            feature = f["term"].loc[400:799]
        cur = math.log10(prior[i])
        for t in W :
            if t in feature :
                cur += math.log10(condprob[i]["condprob"].loc[condprob[i]["term"] == t])
            else :
                cur += math.log10(1/5000)
        score.append(cur)
    max_s = score[0]
    c = 0
    for i in range(len(score)) :
        if score[i] > max_s :
            max_s = score[i]
            c = i
    return c

In [46]:
#prior, condprob = TrainNB(t_train, f_train, final_all, N_true, N_fake)

In [47]:
# ans = []
# for i in range(len(test)) :
#    ans.append(ApplyNB(test[i], prior, condprob, final_all))
# print(ans)

#### 3. Rocchio

In [48]:
def TrainRocchio(D, doclen, N_true_train): #D: training doc, doclen: dimension of doc, index < N_true_train -> true
    u = []
    u.append([]) # for class true
    u.append([]) # for class fake
    for i in range(doclen): # i-dimension centroid
        sum_true = 0
        sum_fake = 0
        for j in range(N_true_train):
            sum_true += D[j][i]
        for j in range(N_true_train, len(D)):
            sum_fake += D[j][i]
        u[1].append(sum_true / N_true_train)
        u[0].append(sum_fake / (len(D) - N_true_train + 1))      
    return(u)

In [49]:
def ApplyRocchio(u, d): #u: centroid, d: a doc vector
    distance = [] # len(distance) = len(u) = 2(true / fake)
    for i in range(len(u)):
        sum = 0
        for j in range(len(d)): # j dimension
            sum += (u[i][j] - d[j])**2 
        distance.append(sum)
    return(np.argmin(distance)) 

In [50]:
#centroid = TrainRocchio(train_vec, len(final_all), N_true_train) # 1 for true, 0 for fake

In [51]:
# 1 for true, 0 for fake
# ans2 = []
# for i in range(len(test_vec)):
#    ans2.append(ApplyRocchio(centroid, test_vec[i]))
# print(ans)

## Cross Validation

In [52]:
ind_train = train_df.index.tolist()

### Split Data into 10 Folds

In [53]:
# I : document indicies
# D : training doc
# k : # of folds
def SplitData(I, D, k) :
    random.shuffle(I)
    ind_fold = []
    ind_fold.append([])
    fold = []
    fold.append([])
    fold_amount = int(len(I) / k)
    
    for i in range(k) :
        ind_cur = []
        cur = []
        for j in range(fold_amount) :
            ind_cur.append(I[i*fold_amount + j])
            cur.append(D[I[i*fold_amount + j]])
        ind_fold[0].append(ind_cur)
        fold[0].append(cur)

    # evenly assign the remaining document into folds
    remain = len(I) - fold_amount*k
    for i in range(remain) :
        ind_fold[0][i].append(I[fold_amount*k + i])
        fold[0][i].append(D[I[fold_amount*k + i]])
    return ind_fold[0], fold[0]

### Split True and Fake data

In [54]:
# I : document indicies
# D : training doc
def SeperateTrueFake(I, D, N_true_train) :
    new_t_train = []
    new_f_train = []
    labels = []
    for i in range(len(D)) :
        if I[i] < N_true_train :
            new_t_train.append(D[i])
            labels.append(1)
        else :
            new_f_train.append(D[i])
            labels.append(0)
    return new_t_train, new_f_train, labels

### Validation Performance

In [55]:
def CountF1(pre, recall) :
    if (pre + recall) == 0 :
        return 0
    F1 = (2*pre*recall) / (pre + recall)
    return F1

# fake : class 0 
# true : class 1 
# df : dataframe with two attributes, doc Id and doc Value (predict class)
def CountPreRecall(df, N_true_train) :
    tp = 0
    fn = 0 
    fp = 0
    tn = 0
    for i in range(len(df["Id"])) :
        # true value = class 1 and prediction = class 1
        if (df["Id"][i] - 1) < N_true_train and df["Value"][i] == 1 :
            tp += 1
        # true value = class 1 and prediction = class 0
        elif df["Id"][i] - 1 < N_true_train and df["Value"][i] == 0 :
            fn += 1
        # true value = class 0 and prediction = class 1
        elif df["Id"][i] - 1 >= N_true_train and df["Value"][i] == 1 :
            fp += 1
        else :
            tn += 1
    if tp == 0 :
        return 0, 0
    else :
        pre = tp / (tp + fp) # precision
        recall = tp / (tp + fn)
        return pre, recall

### Apply Training Methods

In [56]:
# D : training doc
# train_df : training dataframe
# val : test doc
# ind_val : test indicies
# new_t_train : true class training doc
# new_f_train : fake class training doc
# final_all : features
# typ : training type 1-> NB, 2 -> Rocchio, 3 -> kNN
def TrainingMethod(new_train, train_df, val, ind_val, new_t_train, new_f_train, final_all, typ) :
    doc_id = []
    value = []
    # training : NB
    if typ == 1 :
        prior, condprob = TrainNB(new_t_train, new_f_train, final_all, len(new_t_train), len(new_f_train))
        for i in range(len(val)) :
            doc_id.append(ind_val[i] + 1)
            value.append(ApplyNB(val[i], prior, condprob, final_all))
            
    # training : Rocchio
    elif typ == 2 :
        train_vec = DocToVector(new_train, final_all, True)
        test_vec = DocToVector(val, final_all, False)
        centroid = TrainRocchio(train_vec, len(final_all), N_true)
        for i in range(len(test_vec)):
            doc_id.append(ind_val[i] + 1)
            value.append(ApplyRocchio(centroid, test_vec[i]))
            
    # training : kNN
    elif typ == 3 :    
        train_vec = DocToVector(new_train, final_all, True)
        test_vec = DocToVector(val, final_all, False)
        train_vec_df = pd.DataFrame({'vec': train_vec, 'label': train_df['label']})
        doc_id = []
        value = []
        for i in range(len(test_vec)):
            doc_id.append(ind_val[i] + 1)
            value.append(ApplyKNN(train_vec_df, 5, test_vec[i]))
            
    d = {"Id" : doc_id , "Value" : value}
    df = pd.DataFrame(data = d)
    return df

In [57]:
# I : document indicies
# D : training doc
# k : # of folds
def CrossValidation(I, D, k, N_true_train) :    
    # Split the data into 10 folds
    ind_fold = []
    fold = []
    ind_fold, fold = SplitData(I, D, k)
    ave = 0
    new_train = []
    new_train.append([])
    val = []
    val.append([])
    ind_new_train = []
    ind_new_train.append([])
    ind_val = []
    ind_val.append([])
    train_t = []
    train_t.append([])
    train_f = []
    train_f.append([])
    all_labels = []
    all_labels.append([])
    train_df = []
    for i in range(k) :
        temp_train = []
        temp_val = []
        temp_ind = []
        
        # val : take one fold for testing
        ind_val[0].append(ind_fold[i])
        val[0].append(fold[i])
        
        # new_train : merge the other nine folds
        for j in range(k) :
            if i != j :
                temp_ind.extend(ind_fold[j])
                temp_train.extend(fold[j])
        new_train[0].append(temp_train)  
        ind_new_train[0].append(temp_ind)
    for i in range(k) :
        temp_t, temp_f, labels = SeperateTrueFake(ind_new_train[0][i], new_train[0][i], N_true_train)
        train_t[0].append(temp_t)
        train_f[0].append(train_f)
        all_labels[0].append(labels)
        train_df.append(pd.DataFrame({'terms': new_train[0][i], 'label': labels}))
    return new_train[0], val[0], ind_new_train[0], ind_val[0], train_t[0], train_f[0], all_labels[0], train_df

### Approaches of Cross Validation

#### 1. 300 Features

In [58]:
k = 10
fe_select = ["Chi-square", "EMI", "LLR"]
training_method = ["NB", "Rocchio", "kNN"]
final_all = []
train_df = []
train_t = []
train_f = []
all_labels = []
new_train = []
val = []
ind_new_train = []
ind_val = []
new_train, val, ind_new_train, ind_val, train_t, train_f, all_labels, train_df = CrossValidation(ind_train, train, k, N_true_train)
# feature selection
for i in range(10) :
    final_all.append(FeatureSelection(train_df[i], 300, 1))

#### NB

In [59]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 1)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[0])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : NB
Feature Selection: Chi-square
F1 : 0.6370467439755327


#### Rocchio

In [60]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 2)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[1])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : Rocchio
Feature Selection: Chi-square
F1 : 0.4316579616572813


#### kNN

In [61]:
F1 = 0
maxF1 = 0
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 3)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[2])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : kNN
Feature Selection: Chi-square
F1 : 0.8625073745663484


#### 2. 500 Features

In [62]:
k = 10
fe_select = ["Chi-square", "EMI", "LLR"]
training_method = ["NB", "Rocchio", "kNN"]
final_all = []
train_df = []
train_t = []
train_f = []
all_labels = []
new_train = []
val = []
ind_new_train = []
ind_val = []
new_train, val, ind_new_train, ind_val, train_t, train_f, all_labels, train_df = CrossValidation(ind_train, train, k, N_true_train)
# feature selection
for i in range(10) :
    final_all.append(FeatureSelection(train_df[i], 500, 1))

#### NB

In [63]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 1)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[0])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : NB
Feature Selection: Chi-square
F1 : 0.6370322236662853


#### Rocchio

In [64]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 2)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[1])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : Rocchio
Feature Selection: Chi-square
F1 : 0.5978647003387705


#### kNN

In [65]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 3)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[2])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : kNN
Feature Selection: Chi-square
F1 : 0.8660578405201083


#### 3. 800 Features

In [66]:
k = 10
fe_select = ["Chi-square", "EMI", "LLR"]
training_method = ["NB", "Rocchio", "kNN"]
final_all = []
train_df = []
train_t = []
train_f = []
all_labels = []
new_train = []
val = []
ind_new_train = []
ind_val = []
new_train, val, ind_new_train, ind_val, train_t, train_f, all_labels, train_df = CrossValidation(ind_train, train, k, N_true_train)
# feature selection
for i in range(10) :
    final_all.append(FeatureSelection(train_df[i], 800, 1))

#### NB

In [67]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 1)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[0])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : NB
Feature Selection: Chi-square
F1 : 0.6370381079779972


#### Rocchio

In [68]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 2)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[1])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : Rocchio
Feature Selection: Chi-square
F1 : 0.36016670375920334


#### kNN

In [69]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 3)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[2])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : kNN
Feature Selection: Chi-square
F1 : 0.8660794511309172


#### 4. 1000 Features

In [70]:
k = 10
fe_select = ["Chi-square", "EMI", "LLR"]
training_method = ["NB", "Rocchio", "kNN"]
final_all = []
train_df = []
train_t = []
train_f = []
all_labels = []
new_train = []
val = []
ind_new_train = []
ind_val = []
new_train, val, ind_new_train, ind_val, train_t, train_f, all_labels, train_df = CrossValidation(ind_train, train, k, N_true_train)
# feature selection
for i in range(10) :
    final_all.append(FeatureSelection(train_df[i], 1000, 1))

#### NB

In [71]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 1)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[0])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : NB
Feature Selection: Chi-square
F1 : 0.6370303699016749


#### Rocchio

In [72]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 2)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[1])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : Rocchio
Feature Selection: Chi-square
F1 : 0.5538185959806378


#### kNN

In [73]:
ave = 0
# 10-fold cross validation
for j in range(10) :
    df = TrainingMethod(new_train[j], train_df[j], val[j], ind_val[j], train_t[j], train_f[j], final_all[j], 3)
    pre, recall = CountPreRecall(df, N_true_train)
    F1 = CountF1(pre, recall) * ((len(val[j])) / len(ind_train))
    ave += F1
print("Training Method :", training_method[2])
print("Feature Selection:", fe_select[0])
print("F1 :", ave)

Training Method : kNN
Feature Selection: Chi-square
F1 : 0.8645996522380022


## Testing

In [100]:
train_df = pd.DataFrame({'terms': train, 'label': Labels(train, N_true_train)})

### Processing Testing Data

In [101]:
true_list2 = true_test.tolist()
fake_list2 = fake_test.tolist()

In [102]:
N_true_test = len(true_list2)
N_fake_test = len(fake_list2)

In [103]:
N_fake_test

1801

In [104]:
N_true_test

2545

#### Lowercase & split

In [105]:
t_token_list = lower_split(true_list)
f_token_list = lower_split(fake_list)

#### Delete Stop words

In [106]:
t_stop_list = stopwords(t_token_list, stopwords_list)
f_stop_list = stopwords(f_token_list, stopwords_list)

#### PorterStemmer

In [107]:
t_list_of_list = stem(t_stop_list)
f_list_of_list = stem(f_stop_list)

#### Delete Stop words

In [108]:
t_test = stopwords(t_token_list, stopwords_list)
f_test = stopwords(f_token_list, stopwords_list)

In [109]:
test = []
test.extend(t_test)
test.extend(f_test)

In [110]:
labels = [[] for i in range(len(test))]
for i in range(len(test)):
    if i < N_true_test:
        labels[i] = 1
    else:
        labels[i] = 0

In [111]:
test_df = pd.DataFrame({'terms': test, 'label': labels})

### Feature Selection

In [112]:
final_all = pd.DataFrame()
final_all = FeatureSelection(train_df, 800, 1)

### Represent Documents as Vectors

In [113]:
train_vec = DocToVector(train, final_all, True)

In [114]:
train_vec_df = pd.DataFrame({'vec': train_vec, 'label': train_df['label']})

In [115]:
test_vec = DocToVector(test, final_all, False)

In [116]:
test_vec_df = pd.DataFrame({'vec': test_vec, 'label': test_df['label']})

### Testing Performance

In [117]:
def testing_performance(answer, predict):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(true_answer)):
        if(predict[i] == answer[i] == 1):
            tp = tp + 1
        elif(predict[i] == answer[i] == 0):
            tn = tn + 1
        elif(predict[i] == 1 and answer[i] == 0):
            fp = fp + 1
        elif(predict[i] == 0 and answer[i] == 1):
            fn = fn + 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1 = 2 * precision * recall / (precision + recall)

    print("precision = " + str(precision))
    print("recall = " + str(recall))
    print("F1 = " + str(F1))

### Testing KNN (with k = 3)

In [120]:
true_answer = []
for i in tqdm(range(len(test_vec))):
    true_answer.append(test_vec_df['label'][i])

100%|██████████| 40375/40375 [00:00<00:00, 124001.33it/s]


In [122]:
from tqdm import tqdm
ans = []
for i in tqdm(range(len(test_vec))):
    ans.append(ApplyKNN(train_vec_df, 3, test_vec[i]))

100%|██████████| 40375/40375 [1:01:17<00:00, 10.98it/s]


In [123]:
testing_performance(true_answer, ans)

precision = 0.12588664235996022
recall = 0.7461689587426326
F1 = 0.21542824730572885
