In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from enum import Enum
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix

class VectorizerType(Enum):
    """Selects which scikit-learn text vectorizer `Vectorize` constructs."""
    # Raw term counts: `Vectorize` returns a CountVectorizer for this member.
    COUNT = 1
    # TF-IDF weighting: `Vectorize` returns a TfidfVectorizer for this member.
    # NOTE(review): the name looks like a typo for "TFIDF"; kept because callers use it.
    TFID = 2
    
class DatasetType(Enum):
    """Identifies which dataset a piece of text comes from.

    `data_processing` uses this to pick source-specific cleanup rules; in the
    visible code only TWITTER and WIKI have dedicated rules.
    """
    # Member names presumably correspond to the source platform of each
    # dataset -- TODO confirm against the data files.
    TWITTER = 1
    WIKI = 2
    FOX = 3
    REDDIT = 4
    STORMFRONT = 5
    HATECHECK = 6
    CONV_AI = 7
    # The members below look like combined (multi-platform) datasets -- TODO confirm.
    # NOTE(review): "FB_TY" may be a typo for "FB_YT" (compare YT_REDDIT); verify.
    FB_TY = 8
    TWI_FB = 9
    YT_REDDIT = 10

# Helper Methods

#### Load Dataset

In [2]:
def LoadDataset(FileName):
    """Read a CSV file and give a quick visual/textual overview of it.

    Besides loading the data, this draws a small heatmap of the null mask
    (one cell per value, row tick labels hidden) and prints the frame's
    ``info()`` summary.

    Parameters
    ----------
    FileName : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded, unmodified data.
    """
    dataset = pd.read_csv(FileName)

    # Fresh 3x3 inch figure so the heatmap does not draw over an earlier plot.
    plt.figure(figsize=(3, 3))
    null_mask = dataset.isnull()
    sns.heatmap(null_mask, yticklabels=False)

    dataset.info()
    return dataset

#### Data Cleaning and Preprocessing

In [3]:
def basic_data_cleaning(df):
    """Return a cleaned copy of the raw dataset, ready for text preprocessing.

    Steps, applied in this order:

    1. drop every row that contains a null value in any column,
    2. drop the columns ``id``, ``file_platform``, ``file_language``,
       ``file_name`` and the original ``labels`` column,
    3. keep only the first row for each distinct ``text`` value,
    4. cast ``binary_labels`` to ``int``,
    5. rename ``binary_labels`` to ``labels``.

    The whole pipeline is one chain of non-mutating calls, so the caller's
    frame is never modified and no column is assigned on a derived frame
    (which is what triggers pandas' SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns listed above plus ``text`` and
        ``binary_labels``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; its ``info()`` summary is printed before returning.

    Raises
    ------
    KeyError
        If any of the columns named above is missing.
    ValueError
        If ``binary_labels`` holds values that cannot be cast to ``int``.
    """
    cleaned = (
        df.dropna()
        .drop(columns=['id', 'file_platform', 'file_language', 'file_name', 'labels'])
        .drop_duplicates('text')
        # dict form casts only this column and returns a new frame
        .astype({'binary_labels': 'int'})
        .rename(columns={"binary_labels": "labels"})
    )

    # show the resulting schema / row count
    cleaned.info()

    return cleaned

In [4]:
def data_processing(df, datasetType):
    """Normalise one text sample and return it as a space-joined token string.

    Despite its name, ``df`` is a single string (one document), not a
    DataFrame: it is lower-cased, cleaned with regular expressions chosen by
    ``datasetType``, tokenised, stripped of English stop words and lemmatised.

    Cleanup rules:

    * ``DatasetType.TWITTER``: remove links, ``@username`` mentions, words
      mixing letters and digits, remaining digits, punctuation, and the
      stand-alone noise tokens ``amp``, ``rt`` and ``mkr``.
    * ``DatasetType.WIKI``: remove links, digits and punctuation.
    * any other value: no regex cleanup (lower-casing only).

    Parameters
    ----------
    df : str
        The raw text of one sample.
    datasetType : DatasetType
        Source of the sample; selects the cleanup rules above.

    Returns
    -------
    str
        Lemmatised, stop-word-free tokens joined by single spaces.
    """
    text = df.lower()  # patterns below are lowercase-only

    if datasetType == DatasetType.TWITTER:
        text = re.sub(r"http\S+", "", text)  # links
        text = re.sub(r"\@\S+", "", text)  # usernames @...
        # Presumably the "amp" noise token comes from the HTML entity "&amp;";
        # turn it into a separator so it cannot glue its neighbours together
        # once punctuation is stripped -- TODO confirm against the raw data.
        text = re.sub(r"&amp;", " ", text)
        # Mixed letter/digit words must be removed BEFORE the bare-digit strip:
        # once all digits are gone this pattern can no longer match anything.
        text = re.sub(r"\b([a-z]+[0-9]+|[0-9]+[a-z]+)[a-z0-9]*\b", "", text)
        text = re.sub(r"[0-9]+", "", text)  # remaining numbers
        text = re.sub(r"[^\w\s]", "", text)  # punctuation
        # Noise tokens spotted in the word cloud. Matched as whole words only,
        # otherwise e.g. "party" -> "pay" and "example" -> "exle".
        text = re.sub(r"\b(?:amp|rt|mkr)\b", "", text)
    elif datasetType == DatasetType.WIKI:
        text = re.sub(r"http\S+", "", text)  # links
        text = re.sub(r"[0-9]+", "", text)  # numbers
        text = re.sub(r"[^\w\s]", "", text)  # punctuation

    # tokenization
    tokens = word_tokenize(text)

    # removing stopwords
    content_tokens = [w for w in tokens if w not in stop_words]

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(w) for w in content_tokens)

#### Visualization

In [5]:
def visualize_label_distribution(df, title='Label Distribution in Twitter Dataset'):
    """Plot how the values of the ``labels`` column are distributed.

    Two 3x3 inch figures are produced: a count plot (one bar per label value)
    and a pie chart of the same counts with percentage annotations.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``labels`` column.
    title : str, optional
        Title of the pie chart. Defaults to the original hard-coded Twitter
        title so existing calls are unaffected; pass a dataset-specific title
        for other datasets.
    """
    # Figure 1: absolute counts per label
    _, count_ax = plt.subplots(figsize=(3, 3))
    sns.countplot(x='labels', data=df, ax=count_ax)

    # Figure 2: relative share per label
    _, pie_ax = plt.subplots(figsize=(3, 3))
    tags = df['labels'].value_counts()
    colors = ("red", "gold")  # matplotlib cycles through these if there are more wedges
    wp = {'linewidth': 2, 'edgecolor': "black"}
    # `explode` must have exactly one entry per wedge; derive it from the data
    # instead of assuming two label values.
    explode = (0.1,) * len(tags)
    tags.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', shadow=True, colors=colors,
              startangle=90, wedgeprops=wp, explode=explode, label='')
    pie_ax.set_title(title)

In [6]:
def visualize_word_cloud(Data, Title):
    """Render a word cloud for a collection of texts and show it.

    Parameters
    ----------
    Data : iterable of str
        Texts to visualise; they are joined with single spaces into one corpus.
    Title : str
        Heading displayed above the image.
    """
    corpus = ' '.join(Data)

    # Square white canvas; the cloud itself is rendered at 1600x800 pixels
    # and limited to 500 words.
    plt.figure(figsize=(10, 10), facecolor='w')
    cloud = WordCloud(max_words=500, width=1600, height=800)
    cloud_image = cloud.generate(corpus)

    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')  # axes carry no meaning for an image
    plt.title(Title, fontsize=19)
    plt.show()

#### Vectorize

In [7]:
def Vectorize(mode,minRange,maxRange):
    """Build an unfitted word-level vectorizer of the requested kind.

    Parameters
    ----------
    mode : VectorizerType
        ``COUNT`` yields a ``CountVectorizer``, ``TFID`` a ``TfidfVectorizer``.
    minRange, maxRange : int
        Lower and upper n-gram sizes, passed as ``ngram_range=(minRange, maxRange)``.

    Returns
    -------
    CountVectorizer or TfidfVectorizer

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported members. Previously the
        function fell through and returned ``None``, which only surfaced
        later as an unrelated AttributeError at fit time.
    """
    ngram_range = (minRange, maxRange)
    if mode == VectorizerType.COUNT:
        return CountVectorizer(analyzer='word', ngram_range=ngram_range)
    if mode == VectorizerType.TFID:
        return TfidfVectorizer(analyzer='word', ngram_range=ngram_range)
    raise ValueError(
        "Unsupported vectorizer mode: {!r}; expected VectorizerType.COUNT "
        "or VectorizerType.TFID".format(mode)
    )

#### Train Test Validation Split

In [8]:
def train_validate_test_split(vectorizer, XData, YData, test_size=0.2,
                              validate_size=0.1, random_state=42):
    """Split documents into train/validation/test sets and vectorize them.

    The raw documents are split first and the vectorizer is fitted on the
    training documents only; validation and test documents are merely
    transformed. Fitting on the full corpus (as this function used to do)
    lets vocabulary and IDF statistics from held-out documents leak into
    training and makes the evaluation scores optimistic.

    Parameters
    ----------
    vectorizer : object
        Unfitted vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. the result of ``Vectorize``). It is fitted in place.
    XData : array-like of str
        Documents; must expose ``.shape`` (it is printed for reference).
    YData : array-like
        Labels aligned with ``XData``.
    test_size : float, optional
        Share of all samples held out for testing (default 0.2).
    validate_size : float, optional
        Share of the remaining (non-test) samples held out for validation
        (default 0.1).
    random_state : int, optional
        Seed used for both splits (default 42).

    Returns
    -------
    tuple
        ``(x_train, x_validate, x_test, y_train, y_validate, y_test)``.
    """
    print("======== SHAPE =======")
    print(XData.shape)

    # First carve out the test set, then split the rest into train/validation.
    docs_rest, docs_test, y_rest, y_test = train_test_split(
        XData, YData, test_size=test_size, random_state=random_state)
    docs_train, docs_validate, y_train, y_validate = train_test_split(
        docs_rest, y_rest, test_size=validate_size, random_state=random_state)

    # Learn the vocabulary from training documents only.
    x_train = vectorizer.fit_transform(docs_train)
    x_validate = vectorizer.transform(docs_validate)
    x_test = vectorizer.transform(docs_test)

    print(x_train.shape, x_validate.shape, x_test.shape)
    return x_train, x_validate, x_test, y_train, y_validate, y_test

#### Get Best Model

In [9]:
def get_best_model(x_train, y_train, x_validate, y_validate, params):
    """Pick the MultinomialNB smoothing value that scores best on validation.

    One classifier is trained per entry of ``params`` (used as ``alpha``) and
    scored with macro-averaged F1 on the validation set. The first classifier
    reaching the highest score is returned; ``None`` is returned when
    ``params`` is empty.
    """
    # Start just below zero so that any score of 0 or more replaces it.
    winner, top_f1 = None, -0.0000000001

    # hyperparameter tuning
    for alpha in params:
        candidate = MultinomialNB(alpha=alpha)
        candidate.fit(x_train, y_train)
        macro_f1 = f1_score(y_validate, candidate.predict(x_validate), average='macro')

        # Strict comparison: ties keep the earlier candidate.
        if not macro_f1 > top_f1:
            continue
        winner, top_f1 = candidate, macro_f1

    return winner

#### Apply Naive Bayes

In [10]:
def GenericNaiveBayes(vect, df_X, df_Y, params=None):
    """Train, tune and evaluate a Multinomial Naive Bayes text classifier.

    The data is split and vectorized with ``train_validate_test_split``, the
    best ``alpha`` is chosen on the validation set with ``get_best_model``,
    and the selected model is then scored on the test set with
    ``EvaluatePerformance``.

    Parameters
    ----------
    vect : object
        Unfitted vectorizer, forwarded to ``train_validate_test_split``.
    df_X : array-like of str
        Documents.
    df_Y : array-like
        Labels aligned with ``df_X``.
    params : sequence of float, optional
        Candidate ``alpha`` values to try. Defaults to the grid that used to
        be hard-coded here, so existing calls behave as before.

    Returns
    -------
    tuple
        ``(model, f1, y_test, y_predict)`` where ``f1`` is whatever
        ``EvaluatePerformance(y_test, y_predict)`` returns -- presumably an
        F1 score; that helper is defined outside this section, TODO confirm.
    """
    if params is None:
        params = [0.001, 0.01, 0.1, 0.2, 0.4, 0.6, 0.8, 1, 2, 3]

    x_train, x_validate, x_test, y_train, y_validate, y_test = train_validate_test_split(vect, df_X, df_Y)
    model = get_best_model(x_train, y_train, x_validate, y_validate, params)

    # Final, unbiased check on data never seen during fitting or tuning.
    y_predict = model.predict(x_test)
    f1 = EvaluatePerformance(y_test, y_predict)
    return model, f1, y_test, y_predict

In [None]:
#### Eva