# Classification results for three models
# Image embedding - VGG16
# Text embedding - 1. Word2Vec,  2. TFIDF-LSA,  3. BOW-LSA

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


In [None]:
# Mount Google Drive at /content/drive (Colab-only API). The .npy arrays read
# later in this notebook live under /content/drive/MyDrive/HMD_project/new/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Load image (VGG16, autoencoder output) and text vectors (GloVe-twitter-200)


#  training data

In [None]:
# Read the two training arrays from Drive. The image array keeps the class
# label in its last column; the text array holds the text embedding
# (presumably row-aligned with the image array -- confirm against the export).
img_label = np.load('/content/drive/MyDrive/HMD_project/new/embedding_train_img_norm.npy')
txt_input = np.load('/content/drive/MyDrive/HMD_project/new/twitter_embedding_train_text.npy')

# Everything except the final column is image features; the final column is the label.
img, label = img_label[:, :-1], img_label[:, -1]

# Fuse the two modalities column-wise so each row is one image+text sample.
img_txt = np.concatenate([img, txt_input], axis=1)
x, y = img_txt, label

# Quick sanity check of the shapes (the last expression is echoed by the notebook).
print(img.shape)
y.shape


# Train/validation split (20% held out)

In [None]:
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

# Hold out 20% of the fused samples for evaluation; the fixed seed keeps the
# partition identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)


# Decision tree classifier

In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the fused image+text features; the seed pins
# the tree's internal randomness so the fitted model is repeatable.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Print the per-class precision/recall/F1 table first, then the Matthews
# correlation coefficient as a single summary number.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))



              precision    recall  f1-score   support

         0.0       0.71      0.63      0.66      1111
         1.0       0.65      0.73      0.69      1069

    accuracy                           0.68      2180
   macro avg       0.68      0.68      0.68      2180
weighted avg       0.68      0.68      0.68      2180

0.3578139793109269


# Logistic regression classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised well above scikit-learn's default of 100 -- presumably so
# the solver converges on these features.
clf = LogisticRegression(max_iter=2500)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.72      0.71      1111
         1.0       0.70      0.67      0.69      1069

    accuracy                           0.70      2180
   macro avg       0.70      0.70      0.70      2180
weighted avg       0.70      0.70      0.70      2180

0.3968624509435295


# Naive bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): MultinomialNB refuses negative feature values. Unlike the
# TFIDF-LSA and BOW-LSA sections, the text vectors here are not shifted by
# their minimum, so they are presumably already non-negative -- confirm.
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.64      0.63      1111
         1.0       0.61      0.58      0.59      1069

    accuracy                           0.61      2180
   macro avg       0.61      0.61      0.61      2180
weighted avg       0.61      0.61      0.61      2180

0.22032188916463405


# SVM classifier

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with the library's default kernel;
# gamma='auto' sets the kernel coefficient to 1 / n_features.
clf = SVC(gamma='auto')
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.68      0.79      0.73      1111
         1.0       0.74      0.61      0.67      1069

    accuracy                           0.70      2180
   macro avg       0.71      0.70      0.70      2180
weighted avg       0.71      0.70      0.70      2180

0.4054136015525303


# 2. Load image (VGG16, autoencoder output) and text vectors (TFIDF-LSA)

In [None]:
from sklearn.model_selection import train_test_split

# Same image array as before (label in the last column), now paired with the
# TF-IDF + LSA text vectors.
img_label = np.load('/content/drive/MyDrive/HMD_project/new/embedding_train_img_norm.npy')
txt_input = np.load('/content/drive/MyDrive/HMD_project/new/lsa_tfidf_train_text.npy')

img, label = img_label[:, :-1], img_label[:, -1]

# Shift the text block so its smallest entry becomes 0 -- presumably so that
# MultinomialNB, which rejects negative inputs, can be fit on it.
# NOTE(review): the minimum is taken over all rows before the split below, so
# held-out rows influence the offset.
txt = txt_input - txt_input.min()

# Fuse image and text features column-wise, one row per sample.
img_txt = np.concatenate([img, txt], axis=1)
x, y = img_txt, label

# Same 80/20 partition and seed as the first section.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)



# Decision tree

In [None]:

# Decision tree on the image + TFIDF-LSA features; seed fixed for repeatability.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.70      0.66      0.68      1111
         1.0       0.67      0.71      0.69      1069

    accuracy                           0.68      2180
   macro avg       0.68      0.68      0.68      2180
weighted avg       0.68      0.68      0.68      2180

0.3654909266170182


In [None]:

# Logistic regression on the image + TFIDF-LSA features.
# max_iter is raised well above scikit-learn's default of 100 -- presumably so
# the solver converges.
clf = LogisticRegression(max_iter=2500)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.77      0.75      1111
         1.0       0.74      0.69      0.72      1069

    accuracy                           0.73      2180
   macro avg       0.73      0.73      0.73      2180
weighted avg       0.73      0.73      0.73      2180

0.46447595419142135


In [None]:

# Multinomial naive Bayes on the image + TFIDF-LSA features (the text block
# was shifted by its minimum when this section's data was loaded).
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.64      0.62      1111
         1.0       0.60      0.58      0.59      1069

    accuracy                           0.61      2180
   macro avg       0.61      0.61      0.61      2180
weighted avg       0.61      0.61      0.61      2180

0.21391787410816115


# 3. Load image (VGG16, autoencoder output) and text vectors (BOW-LSA)

In [None]:
from sklearn.model_selection import train_test_split

# Same image array as before (label in the last column), now paired with the
# bag-of-words + LSA text vectors.
img_label = np.load('/content/drive/MyDrive/HMD_project/new/embedding_train_img_norm.npy')
txt_input = np.load('/content/drive/MyDrive/HMD_project/new/lsa_bow_train_text.npy')

img, label = img_label[:, :-1], img_label[:, -1]

# Shift the text block so its smallest entry becomes 0 -- presumably so that
# MultinomialNB, which rejects negative inputs, can be fit on it.
# NOTE(review): the minimum is taken over all rows before the split below, so
# held-out rows influence the offset.
txt = txt_input - txt_input.min()

# Fuse image and text features column-wise, one row per sample.
img_txt = np.concatenate([img, txt], axis=1)
x, y = img_txt, label

# Same 80/20 partition and seed as the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)



# Decision tree

In [None]:

# Decision tree on the image + BOW-LSA features.
# NOTE(review): the seed is 20 here but 0 for the decision trees in the other
# two sections -- confirm whether the difference is intentional.
clf = DecisionTreeClassifier(random_state=20).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.63      0.66      1111
         1.0       0.65      0.72      0.68      1069

    accuracy                           0.67      2180
   macro avg       0.68      0.68      0.67      2180
weighted avg       0.68      0.67      0.67      2180

0.35152451817361346


# Logistic regression

In [None]:
# Logistic regression on the image + BOW-LSA features.
# NOTE(review): max_iter is 2000 here but 2500 in the other two sections --
# confirm whether the difference is intentional.
clf = LogisticRegression(max_iter=2000)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.73      0.77      0.75      1111
         1.0       0.75      0.70      0.72      1069

    accuracy                           0.74      2180
   macro avg       0.74      0.74      0.74      2180
weighted avg       0.74      0.74      0.74      2180

0.4717629204945636


In [None]:

# Multinomial naive Bayes on the image + BOW-LSA features (the text block was
# shifted by its minimum when this section's data was loaded).
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class report followed by the Matthews correlation coefficient.
for metric in (classification_report, matthews_corrcoef):
    print(metric(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.61      0.64      0.63      1111
         1.0       0.61      0.58      0.59      1069

    accuracy                           0.61      2180
   macro avg       0.61      0.61      0.61      2180
weighted avg       0.61      0.61      0.61      2180

0.2166632248789762
