# **`CLASSIC SVM BASELINE MODEL WITH PREPROCESSED TWEETS`**

In [24]:
# Colab-only setup: mount Google Drive at /content/gdrive, the location the
# CSV paths used later in this notebook point into.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np
import spacy

# Load spaCy's small English model once at import time; tokenizer() below
# uses this shared object to split tweets into tokens.
nlp = spacy.load("en_core_web_sm")
import scipy.sparse as sp
import warnings
# Silence all warnings raised from here on (this also hides deprecation
# notices from the libraries above).
warnings.filterwarnings("ignore")

In [46]:
def identity(x):
    """Pass-through callable: hand the argument back untouched.

    It is supplied as the ``preprocessor`` of the vectorizers built in this
    notebook, so the text reaches ``tokenizer`` exactly as it was stored.

    Args:
        x: Any object.

    Returns:
        The very same object that was passed in.
    """
    return x


def tokenizer(tweet):
    """Split one tweet into a list of token strings with spaCy.

    Only the token texts are needed, so the tweet is passed through
    ``nlp.make_doc`` (spaCy's tokenizer alone) instead of ``nlp(tweet)``,
    which would additionally run every other pipeline component on each
    tweet just to have the annotations thrown away.

    NOTE(review): this assumes no component in the loaded pipeline
    re-tokenizes (merges/splits tokens); that holds for the stock
    ``en_core_web_sm`` pipeline -- confirm if custom components are added.

    Args:
        tweet: Text of a single tweet.

    Returns:
        list[str]: The ``text`` of every spaCy token, in document order.
    """
    return [token.text for token in nlp.make_doc(tweet)]


def get_score(classifier, X_test, Y_test, output_file, preprocessed=None):
    """Predict on an evaluation set, print the metrics and return the accuracy.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_test: Evaluation inputs handed to ``classifier.predict``.
        Y_test: Gold labels matching ``X_test``.
        output_file: When truthy, the predictions are also written to a CSV
            file on the mounted Drive.
        preprocessed: Chooses the CSV name when ``output_file`` is truthy:
            truthy -> ``outputts_svm.csv``, falsy -> ``outputpossvm.csv``.
            If left as ``None`` the notebook-level variable ``preprocessed``
            is used, which is what this function read implicitly before the
            parameter existed, so existing four-argument calls behave as
            before.

    Returns:
        The accuracy of the predictions against ``Y_test``.

    Raises:
        NameError: If predictions have to be written, ``preprocessed`` was
            not passed and no notebook-level ``preprocessed`` is defined.
    """
    # Given a trained model, predict the label of a new set of data.
    Y_pred = classifier.predict(X_test)

    if output_file:
        if preprocessed is None:
            # Backward-compatible fallback to the notebook global, made
            # explicit so a fresh kernel fails with a helpful message.
            if "preprocessed" not in globals():
                raise NameError(
                    "get_score needs `preprocessed`: pass it as an argument "
                    "or define a notebook-level `preprocessed` variable"
                )
            preprocessed = globals()["preprocessed"]

        if preprocessed:
            out_path = '/content/gdrive/MyDrive/Data/outputts_svm.csv'
        else:
            out_path = '/content/gdrive/MyDrive/Data/outputpossvm.csv'
        pd.DataFrame(Y_pred).to_csv(out_path)

    # Calculate the accuracy of the prediction.
    acc = accuracy_score(Y_test, Y_pred)
    print(f"Final accuracy: {acc}")
    # Print the per-class precision / recall / F1 report.
    print(classification_report(Y_test, Y_pred))

    return acc


def optimize_svm(vec, X_train, Y_train, seed, preprocessed):
    """Fit a linear-kernel SVC (C=1.14), optionally behind a vectorizer.

    Args:
        vec: Vectorizer placed in front of the SVC inside a ``Pipeline``;
            pass ``None`` to fit the bare SVC on ``X_train`` directly.
        X_train: Training inputs.
        Y_train: Training labels.
        seed: Forwarded to the SVC as ``random_state``.
        preprocessed: Only selects which banner line is printed.

    Returns:
        The fitted SVC, or the fitted ``Pipeline`` when ``vec`` is given.
    """
    banner = ("SVM classification with preprocessed tweet" if preprocessed
              else "SVM classification with POStagged tweet")
    print(banner)

    linear_svc = svm.SVC(kernel='linear', C=1.14, random_state=seed)
    if vec is None:
        model = linear_svc
    else:
        model = Pipeline([('vec', vec), ('cls', linear_svc)])

    model.fit(X_train, Y_train)
    return model


def ensemble(vec, X_train, Y_train, seed):
    """Fit a hard-voting ensemble of Naive Bayes, Random Forest and linear SVM.

    Each member is a two-step ``Pipeline`` made of the given vectorizer and
    one classifier; the same ``vec`` object is passed to all three pipelines.

    Args:
        vec: Vectorizer used as the first step of every member pipeline.
        X_train: Training inputs.
        Y_train: Training labels.
        seed: ``random_state`` for the Random Forest and the SVC.

    Returns:
        The fitted ``VotingClassifier``.
    """
    print("Ensemble of Naive Bayes, Random Forest and SVM")

    forest = RandomForestClassifier(criterion='gini', n_estimators=233, max_depth=10,
                                    max_features=0.064, n_jobs=-1, random_state=seed)
    linear_svc = svm.SVC(kernel='linear', C=1.14, random_state=seed)

    # (name, pipeline) pairs in the order the voters are registered.
    voters = [
        ('nb', Pipeline([('vec_cn', vec), ('cls', MultinomialNB())])),
        ('rf', Pipeline([('vec_tf', vec), ('cls', forest)])),
        ('svm', Pipeline([('vec_tf', vec), ('cls', linear_svc)])),
    ]

    return VotingClassifier(voters, voting='hard').fit(X_train, Y_train)

In [40]:
if __name__ == "__main__":
    # Experiment 1: linear SVM trained on the 'preprocessed' text column.

    # Read the three splits from the mounted Drive.
    train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
    val = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_val.csv')
    test = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_test.csv')
    X_train, Y_train = train['preprocessed'], train['task']

    # Run switches.
    val_set = False       # True -> score on the validation split, False -> test split
    output_file = False   # True -> get_score also writes the predictions to CSV
    preprocessed = True   # picks the banner in optimize_svm; get_score reads this global

    if val_set:
        X_dev, Y_dev = val['preprocessed'], val['task']
    else:
        X_test, Y_test = test['preprocessed'], test['task']

    # TF-IDF weights when tfidf is True, plain bag-of-words counts otherwise.
    tfidf = True
    vec = (TfidfVectorizer if tfidf else CountVectorizer)(preprocessor=identity, tokenizer=tokenizer)

    # Swap the two lines below to evaluate the voting ensemble instead of the single SVM.
    # classifier = ensemble(vec, X_train, Y_train,32)
    classifier = optimize_svm(vec, X_train, Y_train, 32, preprocessed)  # SVM

    if val_set:
        get_score(classifier, X_dev, Y_dev, output_file)
    else:
        get_score(classifier, X_test, Y_test, output_file)
   

SVM classification with preprocessed tweet
Final accuracy: 0.7345750873108265
              precision    recall  f1-score   support

         NOT       0.87      0.75      0.80       620
         OFF       0.52      0.70      0.59       239

    accuracy                           0.73       859
   macro avg       0.69      0.72      0.70       859
weighted avg       0.77      0.73      0.74       859



# **`Research Question 2: Training on POS-Tagged Tweets and Evaluation of SVM Classification`**

In [47]:
if __name__ == "__main__":
    # Experiment 2: linear SVM trained on the 'pos_tagged' text column.

    # Read the three splits from the mounted Drive.
    train = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_train.csv')
    val = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_val.csv')
    test = pd.read_csv('/content/gdrive/MyDrive/Data/preprocessed data/processed_test.csv')
    X_train, Y_train = train['pos_tagged'], train['task']

    # Run switches.
    val_set = False        # True -> score on the validation split, False -> test split
    output_file = True     # True -> get_score also writes the predictions to CSV
    preprocessed = False   # picks the banner in optimize_svm; get_score reads this global

    if val_set:
        X_dev, Y_dev = val['pos_tagged'], val['task']
    else:
        X_test, Y_test = test['pos_tagged'], test['task']

    # TF-IDF weights when tfidf is True, plain bag-of-words counts otherwise.
    tfidf = False
    vec = (TfidfVectorizer if tfidf else CountVectorizer)(preprocessor=identity, tokenizer=tokenizer)

    # Vectorizer and SVM are combined into one pipeline and fitted here.
    classifier = optimize_svm(vec, X_train, Y_train, 32, preprocessed)  # SVM

    if val_set:
        get_score(classifier, X_dev, Y_dev, output_file)
    else:
        get_score(classifier, X_test, Y_test, output_file)
   

SVM classification with POStagged tweet
Final accuracy: 0.7578579743888242
              precision    recall  f1-score   support

         NOT       0.86      0.80      0.83       620
         OFF       0.56      0.65      0.60       239

    accuracy                           0.76       859
   macro avg       0.71      0.73      0.71       859
weighted avg       0.77      0.76      0.76       859

