## Ensemble of classical ML models with lower accuracy

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the train/test CSVs; the 'preprocessed_data' (text) and 'label'
# columns of each are used below.
train_df=pd.read_csv(r'/content/drive/MyDrive/IR_project/train.csv')
test_df=pd.read_csv(r'/content/drive/MyDrive/IR_project/test.csv')

# Learn the TF-IDF vocabulary and IDF weights from the training text only.
vectoriser = TfidfVectorizer(max_features = 100)
xtr_df = vectoriser.fit_transform(train_df['preprocessed_data'])
# Reuse the fitted vectoriser for the test text. Calling fit_transform here
# would relearn a different vocabulary/IDF from the test set, so the test
# columns would no longer line up with the features the models are fitted on.
xt_df = vectoriser.transform(test_df['preprocessed_data'])

# Both frames share the column names of the vocabulary learned on train.
feature_names = vectoriser.get_feature_names_out()
xtr=pd.DataFrame(xtr_df.toarray(),columns=feature_names)
xt=pd.DataFrame(xt_df.toarray(),columns=feature_names)
ytr=train_df['label']
yt=test_df['label']


# Feature scaling: the scaler statistics come from the training features only
# and are then applied unchanged to the test features.
scaler = StandardScaler()
scaler.fit(xtr)

xtr = scaler.transform(xtr)
xt = scaler.transform(xt)

In [None]:
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from joblib import load
# Load the previously trained scikit-learn estimators that were saved with joblib.
model1 = load('/content/drive/MyDrive/IR_project/models/naive_bayes_model.joblib')
model2 = load('/content/drive/MyDrive/IR_project/models/mlp_model.joblib')
model3 = load('/content/drive/MyDrive/IR_project/models/svc_model.joblib')
model4 = load('/content/drive/MyDrive/IR_project/models/knn_model.joblib')
model5 = load('/content/drive/MyDrive/IR_project/models/random_forest_model.joblib')
model6 = load('/content/drive/MyDrive/IR_project/models/decision_tree_model.joblib')

# Only the naive Bayes, MLP and k-NN estimators take part in the vote;
# model3, model5 and model6 are loaded but not used in this ensemble.
voting_members = [
    ('naive_bayes', model1),
    ('mlp_model', model2),
    ('knn', model4),
]
# Soft voting is used here; hard voting was noted to give the same accuracy.
ensemble_clf = VotingClassifier(estimators=voting_members, voting='soft')

# VotingClassifier.fit trains clones of the member estimators on xtr/ytr.
ensemble_clf.fit(xtr, ytr)

# Evaluate the ensemble on the test features
y_pred = ensemble_clf.predict(xt)
test_accuracy = accuracy_score(yt, y_pred)
print('Test Accuracy: ', test_accuracy)

print(confusion_matrix(yt, y_pred))
print(classification_report(yt, y_pred))


Test Accuracy:  0.9387583892617449
[[3289  106]
 [ 113   68]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3395
           1       0.39      0.38      0.38       181

    accuracy                           0.94      3576
   macro avg       0.68      0.67      0.68      3576
weighted avg       0.94      0.94      0.94      3576



## Ensemble using deep learning models

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Reload the train/test CSVs. In this section the text column is kept as-is
# (no TF-IDF features are built) so it can be tokenised per model later.
train_df = pd.read_csv(r'/content/drive/MyDrive/IR_project/train.csv')
test_df = pd.read_csv(r'/content/drive/MyDrive/IR_project/test.csv')

# Text inputs and labels for each split.
xtr, ytr = train_df['preprocessed_data'], train_df['label']
xt, yt = test_df['preprocessed_data'], test_df['label']

In [None]:
import tensorflow as tf
import pandas as pd
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder as le
import tensorflow as tf
from sklearn.metrics import confusion_matrix,classification_report
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import layers
from gensim.models import KeyedVectors
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values with three Dense
    layers of width ``embed_dim``, split into ``num_heads`` heads of size
    ``embed_dim // num_heads``, attended per head, re-assembled and passed
    through a final Dense layer of width ``embed_dim``.

    Args:
        embed_dim: Width of the projections and of the layer output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        # Forward base-layer arguments so the layer can be rebuilt from a
        # standard Keras config (which carries name/trainable/dtype).
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of scaled dot-product attention."""
        score = tf.matmul(query, key, transpose_b=True)
        # Cast the scale to the score dtype (instead of a fixed float32) so the
        # division also works when the layer computes in another float dtype.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, num_heads, positions, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs`` (leading axis is the batch)."""
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)
        key = self.key_dense(inputs)
        value = self.value_dense(inputs)
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)

        # The attention weights are computed but not returned by this layer.
        attention, _ = self.attention(query, key, value)
        # Move the head axis back next to the feature axis and merge the heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        output = self.combine_heads(concat_attention)
        return output


In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Width of the attention layer and of the block output.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) feed-forward layer.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward base-layer arguments so the layer can be rebuilt from a
        # standard Keras config (which carries name/trainable/dtype).
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

    def call(self, inputs, training=None):
        """Run the block; ``training`` is forwarded to the dropout layers.

        ``training`` defaults to ``None`` so the layer can also be called
        directly without an explicit flag.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embeddings.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        # Forward base-layer arguments so the layer can be rebuilt from a
        # standard Keras config (which carries name/trainable/dtype).
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

    def call(self, x):
        """Embed the token ids in ``x`` and add one position vector per index
        along its last axis."""
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def build_model(max_len, vocab_size, embed_dim, num_heads, ff_dim):
    """Assemble the transformer text classifier.

    Pipeline: token + position embedding -> one TransformerBlock -> global
    average pooling -> Dropout(0.1) -> Dense(20, relu) -> Dropout(0.1) ->
    Dense(2, softmax).

    Args:
        max_len: Length of each input sequence of token ids.
        vocab_size: Size of the token vocabulary.
        embed_dim: Embedding width.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the transformer block's feed-forward network.

    Returns:
        An uncompiled ``tf.keras.Model`` with a two-unit softmax output.
    """
    token_ids = layers.Input(shape=(max_len,))

    # Encoder: embeddings followed by a single transformer block.
    embedded = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim)(token_ids)
    encoded = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)

    # Classification head on top of the pooled sequence representation.
    hidden = layers.GlobalAveragePooling1D()(encoded)
    hidden = layers.Dropout(0.1)(hidden)
    hidden = layers.Dense(20, activation="relu")(hidden)
    hidden = layers.Dropout(0.1)(hidden)
    class_probs = layers.Dense(2, activation="softmax")(hidden)

    return tf.keras.Model(inputs=token_ids, outputs=class_probs)

In [None]:
import pickle
def _load_pickled(path):
    """Return the object unpickled from the file at ``path``."""
    # pickle can execute arbitrary code while loading: only use trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One pickled tokenizer per deep model (attention, CNN, BiLSTM/GRU).
loaded_tokenizer_attention = _load_pickled('/content/drive/MyDrive/IR_project/models/attention_tokenizer.pickle')
loaded_tokenizer_cnn = _load_pickled('/content/drive/MyDrive/IR_project/models/word_level_tokenizer.pickle')
loaded_tokenizer_bi_lstm_gru = _load_pickled('/content/drive/MyDrive/IR_project/models/word_level_tokenizer_bilstm.pickle')

In [None]:
import numpy as np
from keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Custom layer classes that the saved attention model refers to by name.
attention_custom_objects = {
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
    'TransformerBlock': TransformerBlock,
    'MultiHeadSelfAttention': MultiHeadSelfAttention,
}

# Load the three trained Keras models that make up the ensemble.
cnn_model = load_model('/content/drive/MyDrive/IR_project/models/cnn_wordlevel.h5')
bilstm_gru_model = load_model('/content/drive/MyDrive/IR_project/models/bi_gru_lstm_model_6.h5')
transformer_model = load_model(
    '/content/drive/MyDrive/IR_project/models/attention.h5',
    custom_objects=attention_custom_objects,
)

In [None]:
def _to_class_probabilities(pred):
    """Return ``pred`` as a 2-D array with one probability column per class.

    A single-column (or 1-D) output is interpreted as the probability of
    class 1 and expanded to ``[1 - p, p]``; outputs that already have several
    columns are returned unchanged.
    """
    pred = np.asarray(pred)
    if pred.ndim == 1:
        pred = pred[:, np.newaxis]
    if pred.shape[-1] == 1:
        pred = np.concatenate([1.0 - pred, pred], axis=-1)
    return pred


def ensemble_predict(texts, voting):
    """Predict class labels for ``texts`` by combining the three deep models.

    Each model gets the texts tokenised with its own tokenizer and padded to
    its own length (50 for the CNN and BiLSTM/GRU, 512 for the transformer).

    Args:
        texts: Iterable of preprocessed text strings.
        voting: ``'hard'`` for a majority vote over each model's predicted
            class, ``'soft'`` for the argmax of the averaged probabilities.

    Returns:
        ``np.ndarray`` holding one predicted class index per text.

    Raises:
        ValueError: If ``voting`` is neither ``'hard'`` nor ``'soft'``
            (previously this silently produced an empty array).
    """
    # Validate before running the (expensive) model predictions.
    if voting not in ('hard', 'soft'):
        raise ValueError(f"voting must be 'hard' or 'soft', got {voting!r}")

    # Preprocessing
    cnn_text = loaded_tokenizer_cnn.texts_to_sequences(texts)
    cnn_text = pad_sequences(cnn_text, maxlen=50)  # Adjust maxlen as needed

    bilstm_gru_text = loaded_tokenizer_bi_lstm_gru.texts_to_sequences(texts)
    bilstm_gru_text = pad_sequences(bilstm_gru_text, maxlen=50)  # Adjust maxlen as needed

    transformer_text = loaded_tokenizer_attention.texts_to_sequences(texts)
    transformer_text = pad_sequences(transformer_text, maxlen=512)  # Adjust maxlen as needed

    # Predicting. Every output is normalised to per-class probabilities first:
    # if a model emits a single sigmoid column, np.argmax over it is always 0
    # and adding it to a two-column output broadcasts instead of averaging,
    # which would leave the two-column model as the only one that matters.
    probabilities = [
        _to_class_probabilities(cnn_model.predict(cnn_text)),
        _to_class_probabilities(bilstm_gru_model.predict(bilstm_gru_text)),
        _to_class_probabilities(transformer_model.predict(transformer_text)),
    ]

    # Hard voting: most frequent predicted class per text (ties resolve to
    # the lowest class index, as np.argmax returns the first maximum).
    if voting == 'hard':
        n_classes = max(p.shape[-1] for p in probabilities)
        votes_per_model = [p.argmax(axis=-1) for p in probabilities]
        preds = [
            np.argmax(np.bincount(votes, minlength=n_classes))
            for votes in zip(*votes_per_model)
        ]
        return np.array(preds)

    # Soft voting: class with the highest mean probability across the models.
    average_pred = sum(probabilities) / len(probabilities)
    return np.array([np.argmax(row) for row in average_pred])


In [None]:

# Run the ensemble over the test texts once per voting strategy
y_pred_hard = ensemble_predict(xt, voting='hard')
y_pred_soft = ensemble_predict(xt, voting='soft')

# Accuracy of each strategy
hard_accuracy = accuracy_score(yt, y_pred_hard)
print("Hard Voting Accuracy: ", hard_accuracy)
soft_accuracy = accuracy_score(yt, y_pred_soft)
print("Soft Voting Accuracy: ", soft_accuracy)

# Confusion matrix of each strategy
hard_confusion = confusion_matrix(yt, y_pred_hard)
print("Hard Voting Confusion Matrix: \n", hard_confusion)
soft_confusion = confusion_matrix(yt, y_pred_soft)
print("Soft Voting Confusion Matrix: \n", soft_confusion)

# Per-class precision/recall/F1 of each strategy
hard_report = classification_report(yt, y_pred_hard)
print("Hard Voting Classification Report: \n", hard_report)
soft_report = classification_report(yt, y_pred_soft)
print("Soft Voting Classification Report: \n", soft_report)

Hard Voting Accuracy:  0.9493847874720358
Soft Voting Accuracy:  0.9807046979865772
Hard Voting Confusion Matrix: 
 [[3395    0]
 [ 181    0]]
Soft Voting Confusion Matrix: 
 [[3369   26]
 [  43  138]]
Hard Voting Classification Report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      3395
           1       0.00      0.00      0.00       181

    accuracy                           0.95      3576
   macro avg       0.47      0.50      0.49      3576
weighted avg       0.90      0.95      0.92      3576

Soft Voting Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3395
           1       0.84      0.76      0.80       181

    accuracy                           0.98      3576
   macro avg       0.91      0.88      0.89      3576
weighted avg       0.98      0.98      0.98      3576



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
