## Imports and Constants

In [194]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from nltk.stem import PorterStemmer
import re
import string

TWEET_URL_REGEX = r'https:\/\/t\.co\/[a-zA-Z0-9]*((\ ){1}|$)'
PUNCTUATION_REGEX = re.compile('[%s]' % re.escape(string.punctuation))

## Text Preprocessing

In [195]:
def preprocess_tweet(tweet):
    # lowercase everything and strip
    tweet = tweet.lower().strip()
    # stemming
    stemmer = PorterStemmer()
    tweet = ' '.join(stemmer.stem(word) for word in tweet.split(' '))
    return tweet


def preprocess_data(csv_file):
    df = pd.read_csv(csv_file, header=0)
    df['tweet'] = df['tweet'].apply(preprocess_tweet)
    return df


# function to load data
def data_loader():
    train_df = preprocess_data('./data/Constraint_Train.csv')
    validation_df = preprocess_data('./data/Constraint_Val.csv')
    test_df = preprocess_data('./data/english_test_with_labels.csv')
    return train_df, validation_df, test_df

# P1: Naive Bayes

In [204]:
def naive_bayes(alphas, validation=True, test=False):
    train_df, validation_df, test_df = data_loader()
    count_vec = CountVectorizer()
    X_train = count_vec.fit_transform(train_df['tweet'])
    y_train = train_df['label']

    X_val = count_vec.transform(validation_df['tweet'])
    y_val = validation_df['label']

    X_test = count_vec.transform(test_df['tweet'])
    y_test = test_df['label']

    for alpha in alphas:
        mnb = MultinomialNB(alpha=alpha).fit(X_train, y_train)

        print('alpha = ', alpha)

        if validation:
            y_pred = mnb.predict(X_val)
            print('On validation fold')
            print('Accuracy:', accuracy_score(y_val, y_pred))
            print('F1 score:', f1_score(y_val, y_pred, average="macro"))

        if test:
            y_pred = mnb.predict(X_test)
            print('On test fold')
            print('Accuracy:', accuracy_score(y_test, y_pred))
            print('F1 score:', f1_score(y_test, y_pred, average="macro"))

naive_bayes([1, 0.5, 0.1, 0.2, 0.15, 0.12, 0.17])

naive_bayes([0.15], False, True)

#####################################################################
#                                                                   #
#                             P1                                    #
#                                                                   #
#       ======================================================      #
#               alpha =  1                                          #
#               On validation fold                                  #
#               Accuracy: 0.9205607476635514                        #
#               F1 score: 0.9203346732057345                        #
#               alpha =  0.5                                        #
#               On validation fold                                  #
#               Accuracy: 0.922429906542056                         #
#               F1 score: 0.9223102063096604                        #
#               alpha =  0.1                                        #
#               On validation fold                                  #
#               Accuracy: 0.9228971962616822                        #
#               F1 score: 0.9228023764887215                        #
#               alpha =  0.2                                        #
#               On validation fold                                  #
#               Accuracy: 0.922429906542056                         #
#               F1 score: 0.9223319480592169                        #
#               alpha =  0.15                                       #
#               On validation fold                                  #
#               Accuracy: 0.9228971962616822                        #
#               F1 score: 0.9228023764887215                        #
#               alpha =  0.12                                       #
#               On validation fold                                  #
#               Accuracy: 0.922429906542056                         #
#               F1 score: 0.9223419995033246                        #
#               alpha =  0.17                                       #
#               On validation fold                                  #
#               Accuracy: 0.922429906542056                         #
#               F1 score: 0.9223319480592169                        #
#               alpha =  0.15                                       #
#               On test fold                                        #
#               Accuracy: 0.927570093457944                         #
#               F1 score: 0.9274556439487138                        #
#       ======================================================      #                          
#       Best validation fold accuracy for alpha = 0.15              #
#       ======================================================      #
#       Accuracy and F1 on test fold for alpha = 0.15               #
#                                                                   #
#               Accuracy: 0.927570093457944                         #
#               F1 score: 0.9274556439487138                        #
#####################################################################



alpha =  1
On validation fold
Accuracy: 0.9205607476635514
F1 score: 0.9203346732057345
alpha =  0.5
On validation fold
Accuracy: 0.922429906542056
F1 score: 0.9223102063096604
alpha =  0.1
On validation fold
Accuracy: 0.9228971962616822
F1 score: 0.9228023764887215
alpha =  0.2
On validation fold
Accuracy: 0.922429906542056
F1 score: 0.9223319480592169
alpha =  0.15
On validation fold
Accuracy: 0.9228971962616822
F1 score: 0.9228023764887215
alpha =  0.12
On validation fold
Accuracy: 0.922429906542056
F1 score: 0.9223419995033246
alpha =  0.17
On validation fold
Accuracy: 0.922429906542056
F1 score: 0.9223319480592169
alpha =  0.15
On test fold
Accuracy: 0.927570093457944
F1 score: 0.9274556439487138


## P2-A: Random Forest

In [213]:
def random_forest(n_trees, min_leaf_samples, validation=True, test=False):
    train_df, validation_df, test_df = data_loader()

    count_vec = CountVectorizer()
    X_train = count_vec.fit_transform(train_df['tweet'])
    y_train = train_df['label']

    X_val = count_vec.transform(validation_df['tweet'])
    y_val = validation_df['label']

    X_test = count_vec.transform(test_df['tweet'])
    y_test = test_df['label']

    for n_tree, min_leaf_sample in (
        (t, l) for t in n_trees for l in min_leaf_samples
    ):
        print('number of trees = %3d\t min_samples_leaf = %d' % (n_tree, min_leaf_sample))

        rf = RandomForestClassifier(
            n_estimators=n_tree, min_samples_leaf=min_leaf_sample
        )
        rf.fit(X_train, y_train)

        if validation:
            y_pred = rf.predict(X_val)
            print('On validation fold')
            print('Accuracy:', accuracy_score(y_val, y_pred))
            print('F1 score:', f1_score(y_val, y_pred, average="macro"))
            print('=============================================')

        if test:
            y_pred = rf.predict(X_test)
            print('On test fold')
            print('Accuracy:', accuracy_score(y_test, y_pred))
            print('F1 score:', f1_score(y_test, y_pred, average="macro"))
            print('=============================================')


random_forest([60, 100, 150, 200], [1, 2, 3])

random_forest([150], [1], False, True)

#####################################################################
#                                                                   #
#                           P2-A                                    #
#                                                                   #
#       ======================================================      #
#           number of trees =  60	 min_samples_leaf = 1           #
#           On validation fold                                      #
#           Accuracy: 0.927570093457944                             #
#           F1 score: 0.9273603241082915                            #
#           =============================================           #
#           number of trees =  60	 min_samples_leaf = 2           #
#           On validation fold                                      #
#           Accuracy: 0.9130841121495327                            #
#           F1 score: 0.9127105263157895                            #
#           =============================================           #
#           number of trees =  60	 min_samples_leaf = 3           #
#           On validation fold                                      #
#           Accuracy: 0.902803738317757                             #
#           F1 score: 0.9024325234274171                            #
#           =============================================           #
#           number of trees = 100	 min_samples_leaf = 1           #
#           On validation fold                                      #
#           Accuracy: 0.9238317757009346                            #
#           F1 score: 0.9236034167113639                            #
#           =============================================           #
#           number of trees = 100	 min_samples_leaf = 2           #
#           On validation fold                                      #
#           Accuracy: 0.9172897196261682                            #
#           F1 score: 0.917006556784382                             #
#           =============================================           #
#           number of trees = 100	 min_samples_leaf = 3           #
#           On validation fold                                      #
#           Accuracy: 0.9107476635514019                            #
#           F1 score: 0.9104221624608068                            #
#           =============================================           #
#           number of trees = 150	 min_samples_leaf = 1           #
#           On validation fold                                      #
#           Accuracy: 0.9280373831775701                            #
#           F1 score: 0.9278027373444482                            #
#           =============================================           #
#           number of trees = 150	 min_samples_leaf = 2           #
#           On validation fold                                      #
#           Accuracy: 0.9130841121495327                            #
#           F1 score: 0.9127316544415562                            #
#           =============================================           #
#           number of trees = 150	 min_samples_leaf = 3           #
#           On validation fold                                      #
#           Accuracy: 0.9070093457943925                            #
#           F1 score: 0.9066909875711413                            #
#           =============================================           #
#           number of trees = 200	 min_samples_leaf = 1           #
#           On validation fold                                      #
#           Accuracy: 0.9238317757009346                            #
#           F1 score: 0.9235793151057372                            #
#           =============================================           #
#           number of trees = 200	 min_samples_leaf = 2           #
#           On validation fold                                      #
#           Accuracy: 0.9196261682242991                            #
#           F1 score: 0.9193554041168883                            #
#           =============================================           #
#           number of trees = 200	 min_samples_leaf = 3           #
#           On validation fold                                      #
#           Accuracy: 0.9088785046728972                            #
#           F1 score: 0.908576481261465                             #
#       ======================================================      #
#       Best validation fold accuracy for                           #
#       number of trees = 150	 min_samples_leaf = 1               #
#       ======================================================      #
#       Accuracy and F1 on test fold for                            #
#       number of trees = 150	 min_samples_leaf = 1               #
#                                                                   #
#               Accuracy: 0.9252336448598131                        #
#               F1 score: 0.9250355774493706                        #
#####################################################################


number of trees =  60	 min_samples_leaf = 1
On validation fold
Accuracy: 0.9285046728971963
F1 score: 0.9283386638319051
number of trees =  60	 min_samples_leaf = 2
On validation fold
Accuracy: 0.9130841121495327
F1 score: 0.9127419851264149
number of trees =  60	 min_samples_leaf = 3
On validation fold
Accuracy: 0.9065420560747663
F1 score: 0.9062373212265561
number of trees = 100	 min_samples_leaf = 1
On validation fold
Accuracy: 0.9252336448598131
F1 score: 0.9250282895589632
number of trees = 100	 min_samples_leaf = 2
On validation fold
Accuracy: 0.9168224299065421
F1 score: 0.9165600630859545
number of trees = 100	 min_samples_leaf = 3
On validation fold
Accuracy: 0.9046728971962616
F1 score: 0.9043306299767533
number of trees = 150	 min_samples_leaf = 1
On validation fold
Accuracy: 0.9299065420560748
F1 score: 0.9296548749829068
number of trees = 150	 min_samples_leaf = 2
On validation fold
Accuracy: 0.9168224299065421
F1 score: 0.9164851316698764
number of trees = 150	 min_sampl

## P2-B: Gradient Boosted Decision Tree

In [218]:
def gradient_boosted_decision_tree(n_trees, min_leaf_samples, learning_rates, validation=True, test=False):
    train_df, validation_df, test_df = data_loader()

    count_vec = CountVectorizer()
    X_train = count_vec.fit_transform(train_df['tweet'])
    y_train = train_df['label']

    X_val = count_vec.transform(validation_df['tweet'])
    y_val = validation_df['label']

    X_test = count_vec.transform(test_df['tweet'])
    y_test = test_df['label']

    for n_tree, min_leaf_sample, learning_rate in (
        (t, l, r) for t in n_trees for l in min_leaf_samples for r in learning_rates
    ):
        print('number of trees = %3d\tmin_samples_leaf = %d\tlearning_rate = %1.1f' % (n_tree, min_leaf_sample, learning_rate))

        gbc = GradientBoostingClassifier(
            n_estimators=n_tree, min_samples_leaf=min_leaf_sample, learning_rate=learning_rate
        )
        gbc.fit(X_train, y_train)

        if validation:
            y_pred = gbc.predict(X_val)
            print('On validation fold')
            print('Accuracy:', accuracy_score(y_val, y_pred))
            print('F1 score:', f1_score(y_val, y_pred, average="macro"))
            print('=============================================')

        if test:
            y_pred = gbc.predict(X_test)
            print('On test fold')
            print('Accuracy:', accuracy_score(y_test, y_pred))
            print('F1 score:', f1_score(y_test, y_pred, average="macro"))
            print('=============================================')

gradient_boosted_decision_tree([100, 150, 200], [1, 2, 3], [1, 0.1, 0.5])

gradient_boosted_decision_tree([200], [3], [0.5], False, True)


#####################################################################
#                                                                   #
#                           P2-A                                    #
#                                                                   #
#       ======================================================      #
# number of trees = 100	min_samples_leaf = 1	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9144859813084112                                      #
# F1 score: 0.9142025439530672                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 1	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9023364485981309                                      #
# F1 score: 0.901969100299204                                       #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 1	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9191588785046729                                      #
# F1 score: 0.9189165097611409                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 2	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9112149532710281                                      #
# F1 score: 0.9109348987996146                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 2	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9014018691588785                                      #
# F1 score: 0.9010079197610369                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 2	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.922429906542056                                       #
# F1 score: 0.9221336485045732                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 3	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9172897196261682                                      #
# F1 score: 0.9169786160428666                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 3	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9004672897196262                                      #
# F1 score: 0.9000696062042695                                      #
# =============================================                     #
# number of trees = 100	min_samples_leaf = 3	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9252336448598131                                      #
# F1 score: 0.9249978095154648                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 1	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9102803738317757                                      #
# F1 score: 0.9099781255258288                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 1	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9088785046728972                                      #
# F1 score: 0.9085564474935854                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 1	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9214953271028037                                      #
# F1 score: 0.9212719123444906                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 2	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9158878504672897                                      #
# F1 score: 0.9156225357048979                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 2	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.908411214953271                                       #
# F1 score: 0.9081026698076169                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 2	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9233644859813084                                      #
# F1 score: 0.923106315553312                                       #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 3	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9186915887850468                                      #
# F1 score: 0.9184087381270207                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 3	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9088785046728972                                      #
# F1 score: 0.9085461868055356                                      #
# =============================================                     #
# number of trees = 150	min_samples_leaf = 3	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9313084112149532                                      #
# F1 score: 0.9310953448288324                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 1	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9163551401869159                                      #
# F1 score: 0.9160595082120606                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 1	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9144859813084112                                      #
# F1 score: 0.9141837430324419                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 1	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9257009345794392                                      #
# F1 score: 0.9254781794914531                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 2	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9177570093457944                                      #
# F1 score: 0.9175311185148596                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 2	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9172897196261682                                      #
# F1 score: 0.9170244464408114                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 2	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9280373831775701                                      #
# F1 score: 0.9277949548488418                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 3	learning_rate = 1.0 #
# On validation fold                                                #
# Accuracy: 0.9219626168224299                                      #
# F1 score: 0.9216868037509167                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 3	learning_rate = 0.1 #
# On validation fold                                                #
# Accuracy: 0.9158878504672897                                      #
# F1 score: 0.9156225357048979                                      #
# =============================================                     #
# number of trees = 200	min_samples_leaf = 3	learning_rate = 0.5 #
# On validation fold                                                #
# Accuracy: 0.9345794392523364                                      #
# F1 score: 0.9344245646106273                                      #
#       ======================================================      #
#       Best validation fold accuracy for                           #
# number of trees = 200	min_samples_leaf = 3	learning_rate = 0.5 #
#       ======================================================      #
#       Accuracy and F1 on test fold for                            #
# number of trees = 200	min_samples_leaf = 3	learning_rate = 0.5 #
#                                                                   #
#               Accuracy: 0.9411214953271028                        #
#               F1 score: 0.9409926470588235                        #
#####################################################################


number of trees = 100	min_samples_leaf = 1	learning_rate = 1.0
On validation fold
Accuracy: 0.9144859813084112
F1 score: 0.9142025439530672
number of trees = 100	min_samples_leaf = 1	learning_rate = 0.1
On validation fold
Accuracy: 0.9023364485981309
F1 score: 0.901969100299204
number of trees = 100	min_samples_leaf = 1	learning_rate = 0.5
On validation fold
Accuracy: 0.9191588785046729
F1 score: 0.9189165097611409
number of trees = 100	min_samples_leaf = 2	learning_rate = 1.0
On validation fold
Accuracy: 0.9112149532710281
F1 score: 0.9109348987996146
number of trees = 100	min_samples_leaf = 2	learning_rate = 0.1
On validation fold
Accuracy: 0.9014018691588785
F1 score: 0.9010079197610369
number of trees = 100	min_samples_leaf = 2	learning_rate = 0.5
On validation fold
Accuracy: 0.922429906542056
F1 score: 0.9221336485045732
number of trees = 100	min_samples_leaf = 3	learning_rate = 1.0
On validation fold
Accuracy: 0.9172897196261682
F1 score: 0.9169786160428666
number of trees = 100	

## P3: SVM

In [224]:
def support_vector_machine(slacks, validation=True, test=False):
    train_df, validation_df, test_df = data_loader()

    count_vec = CountVectorizer()
    X_train = count_vec.fit_transform(train_df['tweet'])
    y_train = train_df['label']

    X_val = count_vec.transform(validation_df['tweet'])
    y_val = validation_df['label']

    X_test = count_vec.transform(test_df['tweet'])
    y_test = test_df['label']

    for slack in slacks:
        print('slack = %.1f' % slack)

        svc = SVC(C=slack)
        svc.fit(X_train, y_train)

        if validation:
            y_pred = svc.predict(X_val)
            print('On validation fold')
            print('Accuracy:', accuracy_score(y_val, y_pred))
            print('F1 score:', f1_score(y_val, y_pred, average="macro"))
            print('=============================================')

        if test:
            y_pred = svc.predict(X_test)
            print('On test fold')
            print('Accuracy:', accuracy_score(y_test, y_pred))
            print('F1 score:', f1_score(y_test, y_pred, average="macro"))
            print('=============================================')

support_vector_machine([0.1, 0.5, 1, 0.8, 2, 3, 5])

support_vector_machine([3], False, True)


#####################################################################
#                                                                   #
#                             P3                                    #
#                                                                   #
#       ======================================================      #
#           slack = 0.1                                             #
#           On validation fold                                      #
#           Accuracy: 0.8607476635514019                            #
#           F1 score: 0.8597775332701836                            #
#           ============================                            #
#           slack = 0.5                                             #
#           On validation fold                                      #
#           Accuracy: 0.9214953271028037                            #
#           F1 score: 0.921247699991238                             #
#           ============================                            #
#           slack = 1.0                                             #
#           On validation fold                                      #
#           Accuracy: 0.9322429906542056                            #
#           F1 score: 0.9320602047740023                            #
#           ============================                            #
#           slack = 0.8                                             #
#           On validation fold                                      #
#           Accuracy: 0.9280373831775701                            #
#           F1 score: 0.927832586315783                             #
#           ============================                            #
#           slack = 2.0                                             #
#           On validation fold                                      #
#           Accuracy: 0.9415887850467289                            #
#           F1 score: 0.941436852836175                             #
#           ============================                            #
#           slack = 3.0                                             #
#           On validation fold                                      #
#           Accuracy: 0.9415887850467289                            #
#           F1 score: 0.9414136631222114                            #
#           ============================                            #
#           slack = 5.0                                             #
#           On validation fold                                      #
#           Accuracy: 0.9397196261682244                            #
#           F1 score: 0.9395510803881291                            #
#       ======================================================      #
#       Best validation fold accuracy for                           #
#                   slack = 3                                       #
#       ======================================================      #
#       Accuracy and F1 on test fold for                            #
#                   slack = 3                                       #
#                                                                   #
#               Accuracy: 0.9401869158878504                        #
#               F1 score: 0.9400226316471705                        #
#####################################################################


slack = 0.1
On validation fold
Accuracy: 0.8607476635514019
F1 score: 0.8597775332701836
slack = 0.5
On validation fold
Accuracy: 0.9214953271028037
F1 score: 0.921247699991238
slack = 1.0
On validation fold
Accuracy: 0.9322429906542056
F1 score: 0.9320602047740023
slack = 0.8
On validation fold
Accuracy: 0.9280373831775701
F1 score: 0.927832586315783
slack = 2.0
On validation fold
Accuracy: 0.9415887850467289
F1 score: 0.941436852836175
slack = 3.0
On validation fold
Accuracy: 0.9415887850467289
F1 score: 0.9414136631222114
slack = 5.0
On validation fold
Accuracy: 0.9397196261682244
F1 score: 0.9395510803881291
slack = 3.0
On test fold
Accuracy: 0.9401869158878504
F1 score: 0.9400226316471705
