## Getting Started

In [14]:
# Mount Google Drive so the CSV files read later from /content/gdrive/MyDrive
# are available. force_remount=True requests a fresh mount even when the drive
# is already mounted (per the flag name — confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [15]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


## Data and Modeling

In [16]:
def data_process(name, test_size=0.2, random_state=42):
  """Load the training CSV, keep one platform (or all), and split it.

  Args:
    name: Value of the ``platform`` column to keep, or ``'MultiPlatform'``
      to keep the rows of every platform.
    test_size: Fraction of the selected rows held out as the test split.
    random_state: Seed used for both the shuffle and the stratified split,
      so that repeated runs yield identical partitions and metrics.

  Returns:
    A tuple ``(trainx, testx, trainy, testy)``: the ``comments`` column for
    the train and test splits, followed by the ``sentiment`` column for the
    train and test splits.

  Raises:
    ValueError: If no rows are available for ``name``.
  """
  dataset = pd.read_csv('/content/gdrive/MyDrive/Master_Thesis/vaccine_detection/dataset/train_data.csv')

  if name == 'MultiPlatform':
    selected = dataset
  else:
    selected = dataset[dataset.platform == name]

  # Fail early with a readable message instead of the splitter's generic
  # "n_samples=0" error when the platform name is misspelled or absent.
  if selected.empty:
    raise ValueError(f"No rows found in the training data for platform {name!r}")

  # Shuffle once with a fixed seed and renumber rows from 0.
  data = selected.sample(frac=1, random_state=random_state).reset_index(drop=True)

  # Stratify on the label so both splits keep the class balance; the seed
  # makes the split (and therefore every reported score) reproducible.
  trainx, testx, trainy, testy = train_test_split(
      data['comments'],
      data['sentiment'],
      stratify=data['sentiment'],
      test_size=test_size,
      random_state=random_state,
  )
  return trainx, testx, trainy, testy

In [17]:
def train_evaluate_model(name):
  """Fit a TF-IDF + logistic-regression classifier and print its metrics.

  The model is trained on the train split produced by ``data_process(name)``
  and evaluated twice: on the matching held-out split, and on the rows of
  ``test_data.csv`` (the output labels this second evaluation "Cross
  platform"). Accuracy and a classification report are printed for each.

  Args:
    name: Platform selector forwarded unchanged to ``data_process``.

  Returns:
    None. Results are written to standard output only.
  """

  def _print_metrics(score_title, report_title, y_true, y_pred):
    # Accuracy of the given predictions, followed by the per-class report.
    print(score_title)
    print(accuracy_score(y_true, y_pred))
    print(report_title)
    print(classification_report(y_true, y_pred))

  # Split the requested platform's data.
  x_train, x_holdout, y_train, y_holdout = data_process(name)

  # The vocabulary and IDF weights are learned from the training text only.
  vectorizer = TfidfVectorizer(stop_words='english')
  classifier = LogisticRegression(max_iter=500)
  classifier.fit(vectorizer.fit_transform(x_train), y_train)

  # Evaluation on the held-out split of the same selection.
  holdout_pred = classifier.predict(vectorizer.transform(x_holdout))
  _print_metrics('Accuracy Score', "\nClassification Report", y_holdout, holdout_pred)

  # Evaluation on the separate test file, reusing the fitted vectorizer.
  external = pd.read_csv('/content/gdrive/MyDrive/Master_Thesis/vaccine_detection/dataset/test_data.csv')
  external_text = external['comments'].to_numpy()
  external_labels = external['sentiment'].to_list()
  external_pred = classifier.predict(vectorizer.transform(external_text))
  _print_metrics(
      'Cross platform Accuracy Score',
      "\nClassification Report for Cross Platform",
      external_labels,
      external_pred,
  )

## Facebook Model

In [18]:
# Train only on rows whose platform column equals 'Facebook'; metrics are
# printed for the held-out Facebook split and for test_data.csv.
train_evaluate_model('Facebook')

Accuracy Score
0.8926727726894255

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      6553
           1       0.92      0.84      0.88      5457

    accuracy                           0.89     12010
   macro avg       0.90      0.89      0.89     12010
weighted avg       0.89      0.89      0.89     12010

Cross platform Accuracy Score
0.6726933637258458

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.81      0.54      0.65     10822
           1       0.59      0.84      0.69      8481

    accuracy                           0.67     19303
   macro avg       0.70      0.69      0.67     19303
weighted avg       0.71      0.67      0.67     19303



## Twitter Model

In [19]:
# Train only on rows whose platform column equals 'Twitter'; metrics are
# printed for the held-out Twitter split and for test_data.csv.
train_evaluate_model('Twitter')

Accuracy Score
0.909448223733938

Classification Report
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      4770
           1       0.91      0.75      0.82      1845

    accuracy                           0.91      6615
   macro avg       0.91      0.86      0.88      6615
weighted avg       0.91      0.91      0.91      6615

Cross platform Accuracy Score
0.6866808268144848

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.67      0.89      0.76     10822
           1       0.75      0.43      0.55      8481

    accuracy                           0.69     19303
   macro avg       0.71      0.66      0.65     19303
weighted avg       0.70      0.69      0.67     19303



## Reddit Model

In [20]:
# Train only on rows whose platform column equals 'Reddit'; metrics are
# printed for the held-out Reddit split and for test_data.csv.
train_evaluate_model('Reddit')

Accuracy Score
0.9686123689597419

Classification Report
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      7955
           1       0.97      0.97      0.97      8166

    accuracy                           0.97     16121
   macro avg       0.97      0.97      0.97     16121
weighted avg       0.97      0.97      0.97     16121

Cross platform Accuracy Score
0.8067657877013935

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.78      0.92      0.84     10822
           1       0.87      0.66      0.75      8481

    accuracy                           0.81     19303
   macro avg       0.82      0.79      0.80     19303
weighted avg       0.82      0.81      0.80     19303



## Multi-Platform Model

In [21]:
# 'MultiPlatform' skips the platform filter in data_process, so a single
# model is trained on the rows of every platform in train_data.csv.
train_evaluate_model('MultiPlatform')

Accuracy Score
0.9015657379691457

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     19277
           1       0.91      0.87      0.89     15467

    accuracy                           0.90     34744
   macro avg       0.90      0.90      0.90     34744
weighted avg       0.90      0.90      0.90     34744

Cross platform Accuracy Score
0.8991348495052582

Classification Report for Cross Platform
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     10822
           1       0.90      0.86      0.88      8481

    accuracy                           0.90     19303
   macro avg       0.90      0.90      0.90     19303
weighted avg       0.90      0.90      0.90     19303

