#  Detection of Fake News

In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import math
import operator
import re 
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
data_dir = ""
train_news_df = pd.read_csv(data_dir + 'fake_news_train.csv')

train_index = [5547, 16639]
test_index = [0 , 5546]

# Train Data
headlines_labels_arr = train_news_df[['title', 'text','label']][train_index[0]: train_index[1]].values.astype('U').tolist()
headlines_arr_real =[]
headlines_arr_fake =[]
for line in headlines_labels_arr:
    if(int(line[2]) == 0):
        headlines_arr_real.append(str(line[0] + " " + line[1]))
    if(int(line[2]) == 1):
        headlines_arr_fake.append(str(line[0] + " " + line[1]))
        

#print(len(headlines_labels_arr)) #11092
#print(len(headlines_arr_fake)) #5552
#print(len(headlines_arr_real)) # 5540


# Test Data
test_headlines = []
test_headlines.extend(headlines_labels_arr[test_index[0]: test_index[1]])
    

In [None]:
prob_fake = len(headlines_arr_fake) /len(headlines_labels_arr)
prob_real = len(headlines_arr_real) /len(headlines_labels_arr)

print("Probability of being fake = ", prob_fake)
print("Probability of being real = ", prob_real)

Probability of being fake =  0.5005409304002885
Probability of being real =  0.4994590695997115


create_vectorizer() function returns a list and dictionary. Size of the list equal to number of word in fake or real and it has information about how many times these word repeated. Dictionary has which word indicated which index of my array. 

In [None]:
def create_vectorizer(arr, ngram):
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(ngram, ngram))
    X = vectorizer.fit_transform(arr).toarray()
    X = np.array(np.sum(X, axis=0))
    d1 = vectorizer.fit(arr).vocabulary_
    return X, d1

I extracted words that non-English and has non alphabetic character from my train dictionary to obtain more clear data.

In [None]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [None]:
# unigram fake data
fake_arr_unigram, fake_d_unigram = create_vectorizer(headlines_arr_fake, 1)
# unigram real data
real_arr_unigram, real_d_unigram = create_vectorizer(headlines_arr_real, 1)


# bigram fake data
fake_arr_bigram, fake_d_bigram = create_vectorizer(headlines_arr_fake, 2)
# bigram real data
real_arr_bigram, real_d_bigram = create_vectorizer(headlines_arr_real, 2)

count_of_unique_words() function finds words that only appear in fake or real news

In [None]:
def count_of_unique_words(dict1,dict2):
    common_value = dict1.keys() & dict2.keys()
    return len(dict1.keys())+len(dict2.keys())-len(common_value)

number_of_unique_word = count_of_unique_words(fake_dic_withcounts_unigram, real_dic_withcounts_unigram)


from sklearn.feature_extraction import stop_words
stop_words = list(stop_words.ENGLISH_STOP_WORDS)

Dictionaries named fake_dic_withcounts_unigram and real_dic_withcounts_unigram has words and count informations of them. There is not any words that have non-English character and non-alphabetic character. Also stop-words are extracted.

In [None]:
fake_dic_withcounts_unigram = {}
real_dic_withcounts_unigram = {}
for k , v in fake_d_unigram.items():
    if(isEnglish(k) == True and k.isalpha() and (k not in stop_words)):
        fake_dic_withcounts_unigram[k]=fake_arr_unigram[v]
    
for k , v in real_d_unigram.items():
    if(isEnglish(k) == True and k.isalpha() and (k not in stop_words)):
        real_dic_withcounts_unigram[k]=real_arr_unigram[v]

## Part 1: Understanding the data

The data-set consists of real and fake news headlines with 8315 real headline and about 8325 fake headlines. By looking at the number of word occurrences, we can calculate probability of being real or fake. Also there many of stop-words like (”of”, ”for”, ”on” ... etc). 

I listed three examples below that I believe that has huge affect for classifying the headlines.

Function named find_uniques is for finding words that appear in fake news a lot when apperar in real news too little and vice versa.

In [None]:
def find_uniques(dic_fake, dic_real):
    print("USED WORDS IN FAKE NEWS: ")
    for k, v in dic_fake.items():
        if(v > 500):
            if (k not in dic_real.keys()):
                print ("Word: ", k )
                print("Fake Count: " , v)
                print("Real Count: " , 0)
            elif ( dic_real.get(k) < 50):
                print ("Word: ", k )
                print("Fake Count: " , v)
                print("Real Count: " , dic_real.get(k,0))
    print("\nUSED WORDS IN REAL NEWS: ")
    for k, v in dic_real.items():
        if(v > 400):
            if (k not in dic_fake.keys()):
                print ("Word: ", k )
                print("Real Count: " , v)
                print("Fake Count: " , 0)
            elif ( dic_fake.get(k) < 20):
                print ("Word: ", k )
                print("Real Count: " , v)
                print("Fake Count: " , dic_fake.get(k))
        
                
def display_dic(dic):
    for k,v in dic.items():
        print (k , " : " , v)

In [None]:
find_uniques(fake_dic_withcounts_unigram, real_dic_withcounts_unigram)

USED WORDS IN FAKE NEWS: 
Word:  www
Fake Count:  614
Real Count:  6
Word:  http
Fake Count:  706
Real Count:  0
Word:  der
Fake Count:  620
Real Count:  47
Word:  que
Fake Count:  1111
Real Count:  15
Word:  non
Fake Count:  892
Real Count:  14

USED WORDS IN REAL NEWS: 
Word:  spicer
Real Count:  485
Fake Count:  3
Word:  kushner
Real Count:  414
Fake Count:  8
Word:  gorsuch
Real Count:  442
Fake Count:  0
Word:  _____
Real Count:  911
Fake Count:  0


#### List the 10 non-stopwords that most strongly predict that the news is fake

In [None]:
print("---------------------------\nMOST USED FAKE NONSTOP-WORDS: \n---------------------------\n")
res = dict(sorted(fake_dic_withcounts_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])
display_dic(res)

---------------------------
MOST USED FAKE NONSTOP-WORDS: 
---------------------------

trump  :  11299
clinton  :  10449
people  :  8794
hillary  :  7235
said  :  6190
just  :  5735
new  :  5692
like  :  5487
time  :  5040
world  :  5027


#### List the 10 non-stopwords that most strongly predict that the news is real

In [None]:
print("---------------------------\nMOST USED REAL NONSTOP-WORDS: \n---------------------------\n")
res = dict(sorted(real_dic_withcounts_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])
display_dic(res)

---------------------------
MOST USED REAL NONSTOP-WORDS: 
---------------------------

said  :  36768
mr  :  34888
trump  :  20825
new  :  14145
people  :  10817
president  :  9149
like  :  8593
york  :  6913
times  :  6841
ms  :  6720


As we have seen above, some words are seen many times in their cluster but we encounter to them in their opposite cluster very few.

#### Fake Headlines' Words
* que    : 
    * Fake Count: 1111  
    * Real Count: 15
    * Rate: 0.9975


* non      : 
    * Fake Count: 892  
    * Real Count: 14
    * Rate: 0.9958


* http   : 
    * Fake Count: 706  
    * Real Count: 0 
    * Rate: 0.9973

#### Real Headlines' Words

* kushner    : 
    * Real Count: 414  
    * Fake Count: 8
    * Rate: 0.9986


* gorsuch      : 
    * Real Count: 442  
    * Fake Count: 0
    * Rate: 0.9998


* spicer   : 
    * Real Count: 485 
    * Fake Count: 3
    * Rate: 0.9984




## Part 2 : Implementing Naive Bayes

In [None]:
#Probability that a word being fake

def fake_probability(word, arr_real, arr_fake, d_fake, d_real):
    real_index = d_real.get(word, 0.0)
    fake_index = d_fake.get(word, 0.0)
    
    if fake_index == 0 :
        return (0 + 1) / ( len(d_fake) + number_of_unique_word)
    else:
        return (arr_fake[fake_index] + 1) / ( len(d_fake) )

#Probability that a word being real

def real_probability(word, arr_real, arr_fake, d_fake, d_real):
    real_index = d_real.get(word, 0.0)
    fake_index = d_fake.get(word, 0.0)
    
    if real_index == 0:
        return (0 + 1) / ( len(d_real) + number_of_unique_word)
    else:
        return (arr_real[real_index] + 1) / (len(d_real) )



In [None]:
# decide if naive bayes extacts stop-words by consideration
extract_stopwords = False 

def naive_bayes(sentences_tuple, arr_real, arr_fake, d_real, d_fake,ngram):
    print('Naive Bayes process started.\n------------------------------------')
    
    true, false = 0, 0
    real_prob, fake_prob = 0, 0
    
    for i,pair in enumerate(sentences_tuple):
        real_prob = 0
        fake_prob = 0
        prediction = 0
        
        current_title = pair[0]
        current_text = pair[1]
        curent_body = current_title + " " + current_text
        label = int(pair[2])
        

        for word in curent_title.split(' '):
            word = word.lower()
            word = re.sub(r'[^\w\s]','',word)
            if (extract_stopwords):
                if((word not in stop_words) and word.isnumeric() == False): 
                    fake_prob += math.log(fake_probability(word, arr_real, arr_fake, d_fake, d_real))
                    real_prob += math.log(real_probability(word, arr_real, arr_fake, d_fake, d_real))
            else:
                fake_prob += math.log(fake_probability(word, arr_real, arr_fake, d_fake, d_real))
                real_prob += math.log(real_probability(word, arr_real, arr_fake, d_fake, d_real))
                

        fake_prob += math.log(prob_fake)
        real_prob += math.log(prob_real)

        if real_prob >= fake_prob:
            prediction = 0
            #print("Prediction: Actual: " + pair[1])
            #print("Prediction: Found: " + str(prediction))

        else:
            prediction = 1
            #print("Prediction: Actual: " + pair[1])
            #print("Prediction: Found: " +str(prediction))

        if prediction == label:
            true += 1
        else:
            false += 1
            
    accuracy = 100 * (true / (true + false))
    
    if ngram==1:
        print("Unigram accuarcy -> ",accuracy)
    else:
        print("Bigram accuarcy -> ",accuracy)

    print('Naive Bayes process finished\n------------------------------------')

In [None]:
def test( arr_real, arr_fake, d_fake, d_real, ngram):
    if ngram == 1:
        naive_bayes(test_headlines, arr_real, arr_fake, d_real, d_fake,  1)
    elif ngram == 2:
        naive_bayes(test_headlines, arr_real, arr_fake, d_real, d_fake,  2)

### a) Analyzing effect of the words on prediction

###### PRESENCE

P(class|word) = P(word|class) * P(class) / (P(word|fake)*P(fake) + P(word|real)*P(real))

In [None]:
def presence(fake_dic,real_dic):
    fake_presence={}
    real_presence={}
    for key,value in fake_dic.items():
        #print("key: ", key)
        payda=(fake_dic.get(key)/len(headlines_arr_fake))+(real_dic.get(key,0.00001)/len(headlines_arr_real))
        probability=(fake_dic.get(key)/len(headlines_arr_fake))*prob_fake/payda
        #print("probability: ", probability)
        fake_presence[str(key)]=probability
    for key,value in real_dic.items():
        payda=(real_dic.get(key)/len(headlines_arr_real))+(fake_dic.get(key,0.00001)/len(headlines_arr_fake))
        probability=((real_dic.get(key)/len(headlines_arr_real))*prob_real)/payda
        real_presence[str(key)]=probability
    return fake_presence,real_presence

In [None]:
fake_presences_unigram, real_presences_unigram = presence(fake_dic_withcounts_unigram, real_dic_withcounts_unigram)

#### List the 10 words whose presence most strongly predicts that the news is fake.

In [None]:
dict(sorted(fake_presences_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'http': 0.500540929387145,
 'zu': 0.5005409266941779,
 'html': 0.5005409265339137,
 'kadzik': 0.5005409264701822,
 'como': 0.5005409263362014,
 'ist': 0.5005409257252602,
 'trunews': 0.5005409255011151,
 'auf': 0.5005409253631101,
 'sich': 0.5005409251792715,
 'nicht': 0.500540924940141}

#### List the 10 words whose presence most strongly predicts that the news is real.

In [None]:
dict(sorted(real_presences_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'gorsuch': 0.49945899052429493,
 'tillerson': 0.499458965886559,
 'gorka': 0.4994588564818778,
 'oesterlund': 0.49945883965679344,
 'durst': 0.49945882855610607,
 'haley': 0.49945882688219295,
 'macron': 0.49945879865924103,
 'awr': 0.49945878998915083,
 'awrhawkins': 0.49945878773422825,
 'pamkeynen': 0.49945878773422825}


###### ABSENCE

P(~word|class) = 1- P(word|class)

P(~word) = headlines without word / # headlines

P(class|~word) = P(~word|class)*P(class)/P(~word)



In [None]:
def absence_count(word):
    c=0
    for body in headlines_arr_fake:
        if word not in body.split(" "):
            c+=1
    for body in headlines_arr_real:
        if word not in body.split(" "):
            c+=1
    return c / (len(headlines_arr_real) + len(headlines_arr_fake)
#whole_words_unigram = list(set(list(fake_d_unigram.keys())+list(real_d_unigram.keys())))

def absence(fake_dic,real_dic):
    fake_absence={}
    real_absence={}
    f = dict(sorted(fake_presences_unigram.items(), key=operator.itemgetter(1), reverse=False)[:20])
    r = dict(sorted(real_presences_unigram.items(), key=operator.itemgetter(1), reverse=False)[:20])
    for word in f:
        payda = absence_count(word)
        prob=( 1- fake_dic.get(word,0.1) / len(headlines_arr_fake) )*prob_fake/payda
        fake_absence[word]=prob
    for word  in r:
        payda = absence_count(word)
        prob = (1 - (real_dic.get(word, 0.1) / len(headlines_arr_real))) * prob_real / payda
        real_absence[word] = prob

    return fake_absence,real_absence



In [None]:
fake_absences_unigram, real_absences_unigram = absence(fake_dic_withcounts_unigram, real_dic_withcounts_unigram)

#### List the 10 words whose absence most strongly predicts that the news is fake.

In [None]:
dict(sorted(fake_absences_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'weekdays': 0.5021587294006014,
 'onstage': 0.5010036349433437,
 'teammates': 0.5010036349433437,
 'lineup': 0.5009245802243364,
 'vouchers': 0.5007778304881579,
 'playoffs': 0.5006762848754508,
 'devos': 0.5004507753335737,
 'redstone': 0.5004507753335737,
 'mattis': 0.5004507753335737,
 'streep': 0.5004507753335737}

#### List the 10 words whose absence most strongly predicts that the news is real.

In [None]:
dict(sorted(real_absences_unigram.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'para': 0.5056624680598154,
 'th': 0.5054219488616982,
 'por': 0.5044662055389851,
 'su': 0.5039862215730946,
 'una': 0.5039747686422555,
 'und': 0.5024659883473656,
 'als': 0.5020694035611349,
 'neocon': 0.5020694035611349,
 'oligarchy': 0.5014337002374616,
 'te': 0.5013543504940082}

### b) Stopwords

In [None]:
fake_dic_withcounts_stopwords = {}
real_dic_withcounts_stopwords = {}
for k , v in fake_d_unigram.items():
    if(isEnglish(k) == True and k.isalpha() and (k in stop_words)):
        fake_dic_withcounts_stopwords[k] = fake_arr_unigram[v]
    
for k , v in real_d_unigram.items():
    if(isEnglish(k) == True and k.isalpha() and (k in stop_words)):
        real_dic_withcounts_stopwords[k] = real_arr_unigram[v]
        
def find_unique_stopwords(dic_fake, dic_real):
    print("USED STOP WORDS IN FAKE NEWS: \n--------------\n")
    
    for k, v in dic_fake.items():
        if(( (v / (v + dic_real.get(k,0))) > 0.8) and v >100):
            print ("Word: ", k )
            print("Fake Count: " , v)
            print("Real Count: " , dic_real.get(k,0))
            print("Rate: " , (v / (v + dic_real.get(k,0))) )
            print()
            
    print("\nUSED STOP WORDS IN REAL NEWS: \n--------------\n")
    for k, v in dic_real.items():
        if(( (v / (v + dic_fake.get(k,0))) > 0.7) and v >100):
            print ("Word: ", k )
            print("Real Count: " , v)
            print("Fake Count: " , dic_fake.get(k,0))
            print("Rate: ", (v / (v + dic_fake.get(k,0))))
            print()

In [None]:
dict(sorted(fake_dic_withcounts_stopwords.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'the': 200668,
 'to': 96287,
 'of': 95409,
 'and': 88135,
 'in': 64512,
 'that': 47356,
 'is': 43509,
 'for': 31228,
 'it': 28796,
 'on': 25928}

In [None]:
dict(sorted(real_dic_withcounts_stopwords.items(), key=operator.itemgetter(1), reverse=True)[:10])

{'the': 293054,
 'to': 133638,
 'of': 128600,
 'and': 117304,
 'in': 103705,
 'that': 69114,
 'for': 47298,
 'on': 45354,
 'he': 40916,
 'is': 40311}

As we seen above the most used stop words are generally common for both real and fake news. This means that whether or not taking into consideration these words is just waste of time and space. For not gaze upon just these most used stop words, I observed some stopwords that proportion of being fake or real is high.

In [None]:
find_unique_stopwords(fake_dic_withcounts_stopwords, real_dic_withcounts_stopwords)

USED STOP WORDS IN FAKE NEWS: 
--------------

Word:  etc
Fake Count:  368
Real Count:  51
Rate:  0.8782816229116945

Word:  co
Fake Count:  695
Real Count:  28
Rate:  0.9612724757952974

Word:  de
Fake Count:  3578
Real Count:  851
Rate:  0.8078573041318582

Word:  un
Fake Count:  1101
Real Count:  117
Rate:  0.9039408866995073

Word:  con
Fake Count:  368
Real Count:  28
Rate:  0.9292929292929293


USED STOP WORDS IN REAL NEWS: 
--------------

Word:  his
Real Count:  27651
Fake Count:  10162
Rate:  0.7312564461957528

Word:  had
Real Count:  18218
Fake Count:  6154
Rate:  0.747497127851633

Word:  he
Real Count:  40916
Fake Count:  13867
Rate:  0.7468740302648632

Word:  him
Real Count:  7625
Fake Count:  3106
Rate:  0.7105581958810921

Word:  last
Real Count:  5991
Fake Count:  2325
Rate:  0.7204184704184704

Word:  whether
Real Count:  2346
Fake Count:  999
Rate:  0.7013452914798206

Word:  seemed
Real Count:  1019
Fake Count:  194
Rate:  0.8400659521846661

Word:  she
Real Count:

#### Stopwords that most strongly predict that the news is fake

* con    : 
    * Fake Count: 368  
    * Real Count: 15
    * Rate: 0.9292


* un      : 
    * Fake Count: 1101  
    * Real Count: 117
    * Rate: 0.9039


* co   : 
    * Fake Count: 695  
    * Real Count: 28 
    * Rate: 0.9612

#### Stopwords that most strongly predict that the news is real

* seemed    : 
    * Real Count: 1019  
    * Fake Count: 194
    * Rate: 0.8400

* whose      : 
    * Real Count: 1447  
    * Fake Count: 580
    * Rate: 0.7110

* nine   : 
    * Real Count: 444  
    * Fake Count: 170
    * Rate: 0.7231

According to these results above, stopwords that most strongly predict that the news is real or fake are not too much by comparison with mostly used stopwords. By considering this inference I decided not to use stop-words in my model. 

### c) Analyzing effect of the stopwords

In [None]:
#Test unigram data with stopwords 
test(real_arr_unigram,fake_arr_unigram, fake_d_unigram, real_d_unigram,  1)

Naive Bayes process started.
------------------------------------
Unigram accuarcy ->  83.88027407140282
Naive Bayes process finished
------------------------------------


When I deactive affects of some stopwords and numeric characters over test accuracy, I clearly see it enhances my model. This process is increased my accuracy proportion of 10 percent. This is because of they don’t help us to find the context or the true meaning of a sentence. We are reducing the data set size is without any doubt and by doing this we can increase performance of our model. 

An example could be the following sentence: “Yes, There Are Paid Government Trolls On Social Media, Blogs, Forums And Websites”. When I remove the stop words, new sentence would be "yes paid government trolls social media blogs forums websites".

In [None]:
#Test unigram data after extract stopwords 
extract_stopwords =True
test(real_arr_unigram,fake_arr_unigram, fake_d_unigram, real_d_unigram,  1)

Naive Bayes process started.
------------------------------------
Unigram accuarcy ->  93.32852506310854
Naive Bayes process finished
------------------------------------


In [None]:
#Test bigram data after extract stopwords  
test(real_arr_bigram, fake_arr_bigram, fake_d_bigram, real_d_bigram, 2)

Naive Bayes process started.
------------------------------------
Bigram accuarcy ->  49.98196898665705
Naive Bayes process finished
------------------------------------


In my implementation unigram accuracy is very high by comparison with bigram accuracy. I think using bigram data is not suitable for our problem. Because of that I decided to use unigram data in by problem.