## Getting Started

In [1]:
from google.colab import drive

# The dataset CSVs read later in this notebook live under this mount point
# (/content/gdrive/MyDrive/...), so Drive must be mounted before any other cell.
mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=True)

Mounted at /content/gdrive


In [2]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

## Data and Modeling

In [3]:
def data_process(
    name,
    data_path='/content/gdrive/MyDrive/Master_Thesis/vaccine_detection/dataset/train_data.csv',
    test_size=0.2,
    random_state=42,
):
  """Load the training CSV and return a stratified train/test split.

  The CSV is expected to provide the columns ``platform``, ``comments`` and
  ``sentiment``; these are the only columns this function reads.

  Args:
    name: Platform to keep (compared against the ``platform`` column), or
      ``'MultiPlatform'`` to use every row regardless of platform.
    data_path: Location of the training CSV.
    test_size: Fraction of rows placed in the held-out split.
    random_state: Seed used for both the row shuffle and the split, so that
      repeated runs produce the same partitions and therefore the same metrics.

  Returns:
    ``(trainx, testx, trainy, testy)`` where the ``x`` values come from the
    ``comments`` column and the ``y`` values from the ``sentiment`` column.

  Raises:
    ValueError: If ``name`` is not ``'MultiPlatform'`` and no row has that
      platform.
  """
  dataset = pd.read_csv(data_path)

  if name != 'MultiPlatform':
    subset = dataset[dataset.platform == name]
    if subset.empty:
      # Fail early with a readable message instead of letting train_test_split
      # complain about an empty sample set.
      known = sorted(dataset.platform.dropna().astype(str).unique())
      raise ValueError(
          f"No rows found for platform {name!r}; available platforms: {known}")
    dataset = subset

  # Shuffle deterministically and drop the old index so row labels are 0..n-1.
  data = dataset.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # Stratify on the label so both splits keep the same class balance; the seed
  # makes the split (and every reported score) reproducible across runs.
  trainx, testx, trainy, testy = train_test_split(
      data['comments'],
      data['sentiment'],
      stratify=data['sentiment'],
      test_size=test_size,
      random_state=random_state,
  )
  return trainx, testx, trainy, testy

In [4]:
def train_evaluate_model(
    name,
    test_data_path='/content/gdrive/MyDrive/Master_Thesis/vaccine_detection/dataset/test_data.csv',
):
  """Train a TF-IDF + linear SVC classifier and print its evaluation.

  The model is fitted on the training split returned by ``data_process(name)``
  and evaluated twice: on the matching held-out split, and on the separate CSV
  at ``test_data_path`` (its ``comments`` and ``sentiment`` columns). For each
  evaluation the accuracy and a classification report are printed.

  Args:
    name: Platform name forwarded to ``data_process`` (or ``'MultiPlatform'``).
    test_data_path: Location of the CSV used for the second evaluation.

  Returns:
    None. Results are reported through ``print`` only.
  """
  # load dataset
  trainx, testx, trainy, testy = data_process(name)

  # Fit the vocabulary/IDF weights on the training text only; every other text
  # set is transformed with this same fitted vectorizer.
  tfidf = TfidfVectorizer(stop_words='english')
  train_x_vector = tfidf.fit_transform(trainx)
  svc = SVC(kernel='linear')
  svc.fit(train_x_vector, trainy)

  # Predict once and derive every metric from those predictions. Calling
  # svc.score() as well would repeat the whole prediction pass for the same
  # accuracy value.
  pred = svc.predict(tfidf.transform(testx))
  print('Accuracy Score')
  print(accuracy_score(testy, pred))
  print("\nClassification Report")
  print(classification_report(testy, pred))

  test_data = pd.read_csv(test_data_path)
  cross_x = test_data['comments'].to_numpy()
  cross_y = test_data['sentiment'].to_list()
  cross_pred = svc.predict(tfidf.transform(cross_x))
  print('Cross platform Accuracy Score')
  print(accuracy_score(cross_y, cross_pred))
  print("\nClassification Report for Cross Platform")
  print(classification_report(cross_y, cross_pred))

## Facebook Model

In [5]:
train_evaluate_model('Facebook')

Accuracy Score
0.8991673605328893

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      6553
           1       0.91      0.86      0.89      5457

    accuracy                           0.90     12010
   macro avg       0.90      0.90      0.90     12010
weighted avg       0.90      0.90      0.90     12010

Cross platform Accuracy Score
0.6784437652178418

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.82      0.55      0.66     10822
           1       0.59      0.84      0.70      8481

    accuracy                           0.68     19303
   macro avg       0.71      0.70      0.68     19303
weighted avg       0.72      0.68      0.67     19303



## Twitter Model

In [6]:
train_evaluate_model('Twitter')

Accuracy Score
0.9133786848072563

Classification Report
              precision    recall  f1-score   support

           0       0.92      0.97      0.94      4770
           1       0.90      0.77      0.83      1845

    accuracy                           0.91      6615
   macro avg       0.91      0.87      0.89      6615
weighted avg       0.91      0.91      0.91      6615

Cross platform Accuracy Score
0.6757498834378076

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.66      0.87      0.75     10822
           1       0.72      0.43      0.54      8481

    accuracy                           0.68     19303
   macro avg       0.69      0.65      0.64     19303
weighted avg       0.69      0.68      0.66     19303



## Reddit Model

In [7]:
train_evaluate_model('Reddit')

Accuracy Score
0.973388747596303

Classification Report
              precision    recall  f1-score   support

           0       0.98      0.97      0.97      7955
           1       0.97      0.98      0.97      8166

    accuracy                           0.97     16121
   macro avg       0.97      0.97      0.97     16121
weighted avg       0.97      0.97      0.97     16121

Cross platform Accuracy Score
0.8072838418898617

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.78      0.91      0.84     10822
           1       0.85      0.68      0.76      8481

    accuracy                           0.81     19303
   macro avg       0.82      0.79      0.80     19303
weighted avg       0.81      0.81      0.80     19303



## Multi-Platform Model

In [8]:
train_evaluate_model('MultiPlatform')

Accuracy Score
0.9080128943126871

Classification Report
              precision    recall  f1-score   support

           0       0.91      0.93      0.92     19277
           1       0.91      0.88      0.90     15467

    accuracy                           0.91     34744
   macro avg       0.91      0.91      0.91     34744
weighted avg       0.91      0.91      0.91     34744

Cross platform Accuracy Score
0.9016215096099052

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     10822
           1       0.90      0.87      0.89      8481

    accuracy                           0.90     19303
   macro avg       0.90      0.90      0.90     19303
weighted avg       0.90      0.90      0.90     19303

