In [61]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import numpy as np
import statsmodels.api as sm
from io import StringIO

import preprocessor as pp
import emoji
from nltk.tokenize import TweetTokenizer
from sklearn.neighbors import KNeighborsClassifier

import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score,f1_score, recall_score, classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Column names grouped by kind. convert_to_numeric dispatches on cat_feat,
# date_feat and text_feat; numeric_feat is not referenced by the code below.
numeric_feat = np.array([
    'retweet_count', 'reply_count', 'like_count', 'quote_count',
    'followers count', 'following count', 'tweet count', 'listed_count',
])

cat_feat = np.array(['user is verified', 'user has url'])

date_feat = np.array(['created_at', 'user created at'])

text_feat = np.array(['text', 'user description'])


# The two label columns available in the dataset.
class1 = 'binary_class'
class2 = 'ternary_class'

# Label column used as the prediction target.
cls = class1

# Columns dropped before preprocessing. The label that is NOT the target is
# dropped as well, so switching `cls` never leaves the other label behind as
# an input feature (and never drops the target itself).
unused_feat = ['number', 'user created at', 'created_at', 'source', 'user location',
               class2 if cls == class1 else class1]

In [9]:
def preprocess_cat(df, c):
    """One-hot encode the categorical column `c`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing column `c`.
    c : str
        Name of the categorical column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame in which `c` is replaced by one `c_<value>` indicator
        column per distinct value.
    """
    # The indicator dtype is pinned to an integer type: pandas >= 2.0 emits
    # bool indicators by default, and bool columns cannot be subtracted in
    # the min-max scaling step that runs after this.
    return pd.get_dummies(df, columns=[c], dtype=np.uint8)

In [10]:
# TODO
def preprocess_date(df, c):
    """Placeholder for date-column preprocessing.

    Not implemented yet: `c` is ignored and `df` is handed back untouched
    (the very same object, not a copy).
    """
    return df

In [46]:
# Patterns stripped from the raw text, applied in this order.
_TEXT_NOISE_PATTERNS = (
    re.compile(r"#[A-Za-z0-9]+"),   # hashtags
    re.compile(r"@[A-Za-z0-9]+"),   # @user mentions
    re.compile(r"_URL_"),           # literal URL placeholder token
    re.compile(r"\w+://\S+"),       # any remaining scheme://... URLs
)


def filter_text(df, c):
    """Clean the free-text column `c` and reduce it to stemmed keywords.

    Each value is converted with ``str()``, stripped of hashtags, @mentions,
    the ``_URL_`` placeholder and URLs, reduced to ASCII letters/digits,
    lower-cased, filtered against the NLTK English stopword list and
    Porter-stemmed. The surviving tokens are joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing column `c`. The column is overwritten in place.
    c : str
        Name of the text column to clean.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `c` replaced by the cleaned strings.

    Notes
    -----
    Because values go through ``str()``, a missing value (NaN) is turned
    into the token ``nan`` rather than an empty string.
    """
    # Built once per call instead of once per word / once per row.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned = []
    # Iterate over the values themselves (positional), so the frame does not
    # need a 0..n-1 index for this to work.
    for raw in df[c].tolist():
        text = str(raw)
        for pattern in _TEXT_NOISE_PATTERNS:
            # Replace each match with a space, then collapse whitespace runs.
            text = ' '.join(pattern.sub(' ', text).split())
        # Keep only alphanumeric characters.
        text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
        tokens = text.lower().split()
        # Stopwords are checked before stemming, on the un-stemmed word.
        stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
        cleaned.append(' '.join(stems))

    df[c] = np.array(cleaned)
    return df
                    
def preprocess_text(df, c, max_features=1000):
    """Replace text column `c` with bag-of-words count columns.

    The column is first cleaned with `filter_text`, then vectorised with a
    `CountVectorizer` fitted on this frame only. Each vocabulary term
    becomes a column named ``<c>_<term>`` holding its per-row count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing column `c`.
    c : str
        Name of the text column to vectorise.
    max_features : int, optional
        Vocabulary size limit passed to `CountVectorizer` (default 1000).

    Returns
    -------
    pandas.DataFrame
        `df` without `c`, with the count columns appended on the right.
        Row order and index are preserved.
    """
    df = filter_text(df, c)
    # Take the documents positionally; no assumption about the index labels.
    corpus = df[c].tolist()

    cv = CountVectorizer(max_features=max_features)
    counts = cv.fit_transform(corpus).toarray()
    c_names = [c + '_' + term for term in cv.get_feature_names_out()]

    # Give the count frame the caller's index so the column-wise concat lines
    # rows up one-to-one instead of aligning on mismatched labels.
    bag_of_words = pd.DataFrame(counts, columns=c_names, index=df.index)
    return pd.concat([df.drop(c, axis=1), bag_of_words], axis=1)

In [47]:
# TODO
def preprocess_loc(df, c):
    """Placeholder for location-column preprocessing.

    Not implemented yet: `c` is ignored and `df` is handed back untouched
    (the very same object, not a copy).
    """
    return df

In [48]:
def convert_to_numeric(df):
    """Run each column of `df` through the preprocessor for its feature group.

    Columns listed in `cat_feat`, `date_feat` or `text_feat` are passed to
    `preprocess_cat`, `preprocess_date` or `preprocess_text` respectively;
    every other column is left alone. Returns the resulting frame.
    """
    # (feature group, handler) pairs, tried in this order; first match wins.
    dispatch = (
        (cat_feat, preprocess_cat),
        (date_feat, preprocess_date),
        (text_feat, preprocess_text),
    )
    # Snapshot the names up front: handlers add and remove columns as we go.
    original_columns = df.columns
    for name in original_columns:
        for group, handler in dispatch:
            if name in group:
                df = handler(df, name)
                break
    return df

In [49]:
def normalize(df, exclude=None):
    """Min-max scale every feature column of `df` into the range [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric or boolean. Modified in place.
    exclude : iterable of str, optional
        Column names to leave untouched. Defaults to the two label columns
        `class1` and `class2`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with each scaled column stored as float.

    Notes
    -----
    A column whose values are all equal has zero range; it is mapped to 0.0
    instead of being divided by zero (which would fill it with NaN).
    """
    skip = {class1, class2} if exclude is None else set(exclude)
    for feature in df.columns:
        if feature in skip:
            continue
        # Cast first: boolean columns do not support subtraction.
        values = df[feature].astype(float)
        feature_min = values.min()
        feature_range = values.max() - feature_min
        if feature_range == 0:
            # Constant column: centre it only, so existing NaNs stay NaN.
            df[feature] = values - feature_min
        else:
            df[feature] = (values - feature_min) / feature_range

    return df

In [50]:
def preprocess(df):
    """Full preprocessing pipeline for one frame.

    Applies `convert_to_numeric` and then `normalize` to the result,
    returning whatever `normalize` returns.
    """
    return normalize(convert_to_numeric(df))

In [51]:
# Load the training tweets and the follow-up set.
ds_train = pd.read_csv('monkeypox.csv')
ds_test = pd.read_csv('monkeypox-followup.csv')

# Drop the columns that are not used as features; the follow-up file has one
# extra column ('beto_flag') that the training file is not expected to have.
ds_train = ds_train.drop(unused_feat, axis=1)
ds_test = ds_test.drop(unused_feat, axis=1)
ds_test = ds_test.drop('beto_flag', axis=1)

# NOTE(review): each frame is preprocessed on its own, so the text vocabulary
# and the min-max ranges of the follow-up set are fitted separately from the
# training set — confirm this is intended.
ds_train = preprocess(ds_train)
ds_test = preprocess(ds_test)

# Kept for inspection: columns only the follow-up set has / only training has.
cols_to_remove = np.setdiff1d(ds_test.columns, ds_train.columns)
cols_to_add = np.setdiff1d(ds_train.columns, ds_test.columns)

# Give the follow-up frame exactly the training columns, in the training
# order: extra columns are dropped and missing ones are added filled with 0.
# Matching the order matters because the model is fitted on the training
# column layout.
ds_test = ds_test.reindex(columns=ds_train.columns, fill_value=0)


In [52]:
# Separate the feature columns from the target column `cls` in both frames.
x_train, y_train = ds_train.drop(cls, axis=1), ds_train[cls]
x_test, y_test = ds_test.drop(cls, axis=1), ds_test[cls]

In [53]:
from sklearn.model_selection import train_test_split

# NOTE(review): this rebinds x_test / y_test to a 20% hold-out carved out of
# the training data, replacing the follow-up-set values those names held in
# the previous cell — confirm this is intended. The cell also overwrites its
# own inputs, so re-running it shrinks x_train / y_train again.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

In [59]:
from sklearn.svm import SVC
from sklearn import svm

# Linear-kernel support vector classifier with a fixed random_state; fit()
# returns the estimator itself, so construction and fitting are chained.
SVC2 = SVC(kernel='linear', random_state=0).fit(x_train, y_train)
y_pred = SVC2.predict(x_test)

In [64]:
# Scores whatever is currently bound to y_pred against y_test; run this cell
# right after the SVC cell so the numbers belong to that model.
# Labels say "Classifier": sklearn's SVC is a classifier, not a clustering method.
print("The Accuracy using Support Vector Classifier", accuracy_score(y_test, y_pred) * 100, "%")
print("The Precision using Support Vector Classifier", precision_score(y_test, y_pred))
print("The Recall using Support Vector Classifier", recall_score(y_test, y_pred))
print("The F1 score using Support Vector Classifier", f1_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)




The Accuracy using Support Vector Clustering 88.77374784110536 %
The Precision using Supprt Vector Clustering 0.7621951219512195
The Recall using Support Vector Clustering 0.5787037037037037
The F1 score using Supprt Vector Clustering 0.6578947368421053
Confusion Matrix: 
 [[903  39]
 [ 91 125]]


In [68]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 150 entropy-criterion trees with a fixed random_state; fit()
# returns the estimator itself, so construction and fitting are chained.
classifierrf = RandomForestClassifier(
    n_estimators=150,
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)
y_pred = classifierrf.predict(x_test)

In [70]:
# Scores whatever is currently bound to y_pred against y_test; run this cell
# right after the Random Forest cell so the numbers belong to that model.
print(
    "The Accuracy using Random Forest",
    accuracy_score(y_true=y_test, y_pred=y_pred) * 100,
    "%",
)
print("The Precision using Random Forest", precision_score(y_true=y_test, y_pred=y_pred))
print("The Recall using Random Forest", recall_score(y_true=y_test, y_pred=y_pred))
print("The F1 score using Random Forest", f1_score(y_true=y_test, y_pred=y_pred))
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix: \n", cm)




The Accuracy using Random Forest 89.2055267702936 %
The Precision using Random Forest 0.9504950495049505
The Recall using Random Forest 0.4444444444444444
The F1 score using Random Forest 0.6056782334384858
Confusion Matrix: 
 [[937   5]
 [120  96]]


In [73]:
from sklearn.tree import DecisionTreeClassifier

# Single entropy-criterion tree with a fixed random_state; fit() returns the
# estimator itself, so construction and fitting are chained.
classifierdt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(x_train, y_train)
y_pred = classifierdt.predict(x_test)

In [74]:
# Scores whatever is currently bound to y_pred against y_test; run this cell
# right after the Decision Tree cell so the numbers belong to that model.
print(
    "The Accuracy using Decision Trees",
    accuracy_score(y_true=y_test, y_pred=y_pred) * 100,
    "%",
)
print("The Precision using Decision Trees", precision_score(y_true=y_test, y_pred=y_pred))
print("The Recall using Decision Trees", recall_score(y_true=y_test, y_pred=y_pred))
print("The F1 score using Decision Trees", f1_score(y_true=y_test, y_pred=y_pred))
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix: \n", cm)


The Accuracy using Decision Trees 87.13298791018998 %
The Precision using Decision Trees 0.675392670157068
The Recall using Decision Trees 0.5972222222222222
The F1 score using Decision Trees 0.6339066339066338
Confusion Matrix: 
 [[880  62]
 [ 87 129]]


In [79]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the 'minkowski' metric; fit() returns
# the estimator itself, so construction and fitting are chained.
classifierknn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
).fit(x_train, y_train)
y_pred = classifierknn.predict(x_test)

In [80]:
# Scores whatever is currently bound to y_pred against y_test; run this cell
# right after the K-Nearest Neighbors cell so the numbers belong to that model.
# The labels name the KNN model (they previously said "Decision Trees").
print("The Accuracy using K-Nearest Neighbors", accuracy_score(y_test, y_pred) * 100, "%")
print("The Precision using K-Nearest Neighbors", precision_score(y_test, y_pred))
print("The Recall using K-Nearest Neighbors", recall_score(y_test, y_pred))
print("The F1 score using K-Nearest Neighbors", f1_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", cm)


The Accuracy using Decision Trees 84.6286701208981 %
The Precision using Decision Trees 1.0
The Recall using Decision Trees 0.17592592592592593
The F1 score using Decision Trees 0.2992125984251968
Confusion Matrix: 
 [[942   0]
 [178  38]]


In [82]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator
# itself, so construction and fitting are chained.
classifierGB = GaussianNB().fit(x_train, y_train)
y_pred = classifierGB.predict(x_test)

In [83]:
# Scores whatever is currently bound to y_pred against y_test; run this cell
# right after the Naive Bayes cell so the numbers belong to that model.
print(
    "The Accuracy using Naive Bayes",
    accuracy_score(y_true=y_test, y_pred=y_pred) * 100,
    "%",
)
print("The Precision using Naive Bayes", precision_score(y_true=y_test, y_pred=y_pred))
print("The Recall using Naive Bayes", recall_score(y_true=y_test, y_pred=y_pred))
print("The F1 score using Naive Bayes", f1_score(y_true=y_test, y_pred=y_pred))
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix: \n", cm)


The Accuracy using Naive Bayes 58.89464594127807 %
The Precision using Naive Bayes 0.28114478114478114
The Recall using Naive Bayes 0.7731481481481481
The F1 score using Naive Bayes 0.41234567901234565
Confusion Matrix: 
 [[515 427]
 [ 49 167]]
