In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

### Reading Data

In [2]:
data = pd.read_csv(r'Clean Data.csv')
data.head()

Unnamed: 0,Tweets,target(fake=0)
0,good news covafirst vaccine get approval human...,1
1,country first indigenous corona vaccine covade...,1
2,india first corona vaccine candidate cova set ...,1
3,anildeshmukhncp pypayurved bought untested hom...,1
4,mohap announces new corona case recovery,1


### Classes

In [3]:
class Preprocess:
    
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
        
    def clean(self):
        
        for i in range(len(self.df)):

            text = self.df.loc[i,"Tweets"]
            
            # Removing special syntax
            text = re.sub(r"(b')+" , "" , text)

            # Removing URls
            text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(https?://[^\s]+))' , "" , text)
            text = re.sub(r'http\S+' , "" , text)

            # Removing Usernames
            text = re.sub(r'@[^\s]+' , "" , text)

            # Removing Hashtags
            text = re.sub(r'#([^\s]+)' , r'\1' , text)

            # Removing HTML Tags
            text = re.sub(r'<.*?>' , "" , text)

            # Removing Emogis
            emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
            text = emoji_pattern.sub(r'', text) 

            # Removing special Emogis
            text = (text.
                    replace('\\xe2\\x80\\x99', "'").
                    replace('\\xc3\\xa9', 'e').
                    replace('\\xe2\\x80\\x90', '-').
                    replace('\\xe2\\x80\\x91', '-').
                    replace('\\xe2\\x80\\x92', '-').
                    replace('\\xe2\\x80\\x93', '-').
                    replace('\\xe2\\x80\\x94', '-').
                    replace('\\xe2\\x80\\x94', '-').
                    replace('\\xe2\\x80\\x98', "'").
                    replace('\\xe2\\x80\\x9b', "'").
                    replace('\\xe2\\x80\\x9c', '"').
                    replace('\\xe2\\x80\\x9c', '"').
                    replace('\\xe2\\x80\\x9d', '"').
                    replace('\\xe2\\x80\\x9e', '"').
                    replace('\\xe2\\x80\\x9f', '"').
                    replace('\\xe2\\x80\\xa6', '...').
                    replace('\\xe2\\x80\\xb2', "'").
                    replace('\\xe2\\x80\\xb3', "'").
                    replace('\\xe2\\x80\\xb4', "'").
                    replace('\\xe2\\x80\\xb5', "'").
                    replace('\\xe2\\x80\\xb6', "'").
                    replace('\\xe2\\x80\\xb7', "'").
                    replace('\\xe2\\x81\\xba', "+").
                    replace('\\xe2\\x81\\xbb', "-").
                    replace('\\xe2\\x81\\xbc', "=").
                    replace('\\xe2\\x81\\xbd', "(").
                    replace('\\xe2\\x81\\xbe', ")")
                         )    

            # Lower and stopwords removal
            text = " ".join([word for word in text.lower().split() if not word in set(stopwords.words('english'))])

            # Punctuation Removal
            text = "".join([char if char not in string.punctuation else ' ' for char in text])

            # Number Removal
            text = re.sub(r"[^A-Z a-z]" , "" , text)

            # Emogi Residual Removal
            text = re.sub(r"x[a-z]+ " , "" , text)

            # Removing words or length less than 2
            text = " ".join([word for word in text.split() if len(word)>2])

            # Removing double or trailing spaces
            text = " ".join(text.split())
            
            # Similar word removal
            corona_similar = ['novelcoronavirus' , 'covid19' , 'covid' , 'corona' , 'coronavirus']
            india_similar = ['indian']
            for word in corona_similar:
                text = re.sub(word , "corona" , text)
            for word in india_similar:
                text = re.sub(word , "india" , text)

            # Lemmatize
            lemmatizer = WordNetLemmatizer()
            text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
            
            self.df.loc[i,"Tweets"] = text

In [4]:
class Word_Count:
    
    def __init__(self , data):
        self.data = data.reset_index(drop=True)
        self.true_count = []
        self.false_count = []
        self.true_num = 0
        self.false_num = 0
        
    def count(self, prob):
                
        ##  ----------------------------  TRUE COUNT  ----------------------------  ##

        df = self.data[self.data['target(fake=0)']==1].reset_index(drop=True)

        if (prob == "word"):
    
            unique = []
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    if word not in unique:
                        unique.append(word)
            self.true_count = dict(zip(unique , np.zeros(len(unique))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    self.true_count[word] += 1            
            
            for word in self.true_count.keys():
                self.true_num += self.true_count[word]

        else:
            
            unique_word = []
            for i in range(len(df)):
                unique = []
                text = df.loc[i,"Tweets"].split(", ")
                for word in text:
                    if (text.count(word)>1 and (word not in unique) or text.count(word)==1): 
                        unique.append(word)
                        if word not in unique_word:
                            unique_word.append(word)
                df.loc[i,"Tweets"] = ",".join(unique)
            self.true_count = dict(zip(unique_word , np.zeros(len(unique_word))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(","):
                    self.true_count[word] += 1
            
            self.true_num = len(df)
                
        ##  ----------------------------  FALSE COUNT  ----------------------------  ##

        df = self.data[self.data['target(fake=0)']==0].reset_index(drop=True)

        if (prob == "word"):
    
            unique = []
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    if word not in unique:
                        unique.append(word)
            self.false_count = dict(zip(unique , np.zeros(len(unique))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    self.false_count[word] += 1            
            
            for word in self.false_count.keys():
                self.false_num += self.false_count[word]

        else:
            
            unique_word = []
            for i in range(len(df)):
                unique = []
                text = df.loc[i,"Tweets"].split(", ")
                for word in text:
                    if (text.count(word)>1 and (word not in unique) or text.count(word)==1): 
                        unique.append(word)
                        if word not in unique_word:
                            unique_word.append(word)
                df.loc[i,"Tweets"] = ",".join(unique)
            self.false_count = dict(zip(unique_word , np.zeros(len(unique_word))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(","):
                    self.false_count[word] += 1
            
            self.false_num = len(df)

In [5]:
class Word_Count:
    
    def __init__(self , data):
        self.data = data.reset_index(drop=True)
        self.true_count = []
        self.false_count = []
        self.true_num = 0
        self.false_num = 0
        
    def count(self, prob):
                
        if (prob == "word"):
            
            ##  ----------------------------  TRUE COUNT  ----------------------------  ##
            
            unique = []
            
            df = self.data[self.data['target(fake=0)']==1].reset_index(drop=True)
    
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    if word not in unique:
                        unique.append(word)
            self.true_count = dict(zip(unique , np.zeros(len(unique))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    self.true_count[word] += 1            
            
            for word in self.true_count.keys():
                self.true_num += self.true_count[word]
                
            ##  ----------------------------  FALSE COUNT  ----------------------------  ##
            
            unique = []
            
            df = self.data[self.data['target(fake=0)']==0].reset_index(drop=True)

            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    if word not in unique:
                        unique.append(word)
                        
            self.false_count = dict(zip(unique , np.zeros(len(unique))))

            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(", "):
                    self.false_count[word] += 1            

            for word in self.false_count.keys():
                self.false_num += self.false_count[word]
                
        else:
            
            ##  ----------------------------  TRUE COUNT  ----------------------------  ##
            
            unique_word = []
            
            df = self.data[self.data['target(fake=0)']==1].reset_index(drop=True)
            
            for i in range(len(df)):
                unique = []
                text = df.loc[i,"Tweets"].split(", ")
                for word in text:
                    if (text.count(word)>1 and (word not in unique) or text.count(word)==1): 
                        unique.append(word)
                        if word not in unique_word:
                            unique_word.append(word)
                df.loc[i,"Tweets"] = ",".join(unique)
                
            self.true_count = dict(zip(unique_word , np.zeros(len(unique_word))))
            
            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(","):
                    self.true_count[word] += 1
            
            self.true_num = len(df)
                
            ##  ----------------------------  FALSE COUNT  ----------------------------  ##
            
            unique_word = []
            
            df = self.data[self.data['target(fake=0)']==0].reset_index(drop=True)

            for i in range(len(df)):
                unique = []
                text = df.loc[i,"Tweets"].split(", ")
                for word in text:
                    if (text.count(word)>1 and (word not in unique) or text.count(word)==1): 
                        unique.append(word)
                        if word not in unique_word:
                            unique_word.append(word)
                df.loc[i,"Tweets"] = ",".join(unique)
                
            self.false_count = dict(zip(unique_word , np.zeros(len(unique_word))))

            for i in range(len(df)):
                for word in df.loc[i,"Tweets"].split(","):
                    self.false_count[word] += 1

            self.false_num = len(df)

In [6]:
class N_Grams:
    
    def __init__(self, data):
        
        self.ngram = data.reset_index(drop=True)
        
    def ngrams(self, n_grams):
        
        #### ------------------------------- UNIGRAMS ------------------------------- ####
        
        if ((1 == n_grams[0]) and (1 == n_grams[1])):
            
            for i in range(len(self.ngram)):
                self.ngram.loc[i,"Tweets"] = ", ".join(self.ngram.loc[i,"Tweets"].split())
        
        #### ------------------------------- BIGRAMS ------------------------------- ####
        
        elif ((2 == n_grams[0]) and (2 == n_grams[1])):
            
            for i in range(len(self.ngram)):
                bigram = []
                text = self.ngram.loc[i,"Tweets"].split()
                for j in range(len(text)-1):
                    bigram.append(text[j]+" "+text[j+1])
                self.ngram.loc[i,"Tweets"] = re.sub(r"[\[\]]" , "" , str(bigram))
        
        #### ------------------------------- UNIGRAMS + BIGRAMS ------------------------------- ####
        
        else:
            
            for i in range(len(self.ngram)):
            
                bigram = []
                text = self.ngram.loc[i,"Tweets"].split()

                for j in range(len(text)-1):
                    bigram.append(text[j]+" "+text[j+1])

                bigram = re.sub(r"[\[\]]" , "" , str(bigram))
                unigram = re.sub(r"[\[\]]" , "" , str(self.ngram.loc[i,"Tweets"].split()))

                self.ngram.loc[i,"Tweets"] = unigram + ", " + bigram    

In [7]:
class Naive_Bayes(Preprocess , Word_Count, N_Grams):
        
    def __init__(self, prob_type='word', n_grams = (1,2)):
        self.true_dict = []
        self.false_dict = []
        self.predicted = []
        self.n_grams = n_grams
        self.prob_type = prob_type
        
    def fit(self, train_data):    
        
        N_Grams.__init__(self, train_data)
        self.ngrams(self.n_grams)            
        Word_Count.__init__(self, self.ngram)
        self.count(self.prob_type)
           
    def predict(self, test_data):
        
        Preprocess.__init__(self, test_data)
        self.clean()

        N_Grams.__init__(self, self.df)
        self.ngrams(self.n_grams)
        data = self.ngram.copy()

        for i in range(len(data)):
            
            alpha = 0
            true_count, false_count = 0 , 0
            
            self.true_dict = self.true_count.copy()
            self.false_dict = self.false_count.copy()
            
            text = data.loc[i,"Tweets"].split(", ")

            for word in text:
                if ((word not in self.true_dict.keys()) and (word not in self.false_dict.keys())):
                    self.true_dict[word] , self.false_dict[word] = 0 , 0
                    alpha = 1
                    true_count += 1
                    false_count += 1
        
                elif ((word not in self.true_dict.keys()) and (word in self.false_dict.keys())):
                    self.true_dict[word] = 0
                    alpha = 1
                    true_count += 1
                    
                elif ((word in self.true_dict.keys()) and (word not in self.false_dict.keys())):
                    self.false_dict[word] = 0
                    alpha = 1
                    false_count += 1

            if self.prob_type == 'word':            
                true_prob = (np.array(list(self.true_dict.values())) + alpha) / (self.true_num + ((len(self.true_dict) + true_count) * alpha) + 5)
                false_prob = (np.array(list(self.false_dict.values())) + alpha) / (self.false_num + ((len(self.false_dict) + false_count) * alpha) + 5)
            else:
                true_prob = (np.array(list(self.true_dict.values())) + alpha) / (self.true_num + true_count + 5)
                false_prob = (np.array(list(self.false_dict.values())) + alpha) / (self.false_num + false_count + 5)

            true_prob = dict(zip(self.true_dict.keys() , true_prob))
            false_prob = dict(zip(self.false_dict.keys() , false_prob))

            True_Prob = len(self.true_dict) / (len(self.true_dict) + len(self.false_dict))
            False_Prob = 1 - True_Prob

            for word in text:
                True_Prob *= true_prob[word]
                False_Prob *= false_prob[word]

            if (np.log(True_Prob) > np.log(False_Prob)):
                self.predicted.append(1)
            else:
                self.predicted.append(0)
                
        return self.predicted

### Testing

In [8]:
train , test = train_test_split(data , test_size = 200 , random_state=42 , stratify=data.iloc[:,-1].values)

In [9]:
clf = Naive_Bayes(n_grams=(1,2), prob_type='word')
clf.fit(train)
pred = clf.predict(test)

In [11]:
confusion_matrix(test.iloc[:,-1].values , pred)

array([[105,  21],
       [ 15,  59]], dtype=int64)

In [12]:
print(classification_report(test.iloc[:,-1].values , pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85       126
           1       0.74      0.80      0.77        74

    accuracy                           0.82       200
   macro avg       0.81      0.82      0.81       200
weighted avg       0.82      0.82      0.82       200

