In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## Twitter

In [None]:
# Load the cleaned Twitter dataset
df = pd.read_csv('/content/drive/MyDrive/MSC Data science/Thesis/Final data/clean_twitter.csv')
# Rename the text column (returns a new frame instead of mutating in place)
df = df.rename(columns={"joined_text": "text"})
# X (independent) and y (dependent/target variable) features.
# pd.factorize encodes the labels as integer codes; class_names keeps the original label values.
X = df['text']
y, class_names = pd.factorize(df['label'])

# Split the data once into an 80-20 train-test split.
# (The split was previously executed twice with identical arguments; the second call was redundant.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(31234,)
(7809,)
(31234,)
(7809,)


Twitter dataset: Having found better individual performance with the CountVectorizer in our previous experiments using the SVM, Logistic Regression and Naive Bayes models, we use the CountVectorizer for our ensemble approach as well.

In [None]:
# Bag-of-words features capped at 5000 terms (max_features).
# The vocabulary is fitted on the training text only and then reused for the test text.
count_vectorizer = CountVectorizer(max_features=5000)
X_train, X_test = (
    count_vectorizer.fit_transform(X_train),
    count_vectorizer.transform(X_test),
)

In [None]:
# Handle class imbalance with SMOTE.
# Only the training split is resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Creating the classifiers.
# probability=True is required so the SVM can supply class probabilities for soft voting.
# Its probability estimates rely on random shuffling, so random_state is fixed
# (same seed as the split and SMOTE) to make the results reproducible.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Creating the ensemble using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logreg', logreg_classifier),
        ('nb', naive_bayes_classifier),
    ],
    voting='soft',  # 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Class predictions of the fitted ensemble for the vectorized test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the ensemble on the test split
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92      6812
           1       0.48      0.59      0.53       997

    accuracy                           0.87      7809
   macro avg       0.71      0.75      0.73      7809
weighted avg       0.88      0.87      0.87      7809



In [None]:
# Get the confusion matrix.
# The result is stored under a separate name so the imported confusion_matrix
# function is not shadowed; otherwise re-running this cell (or calling the
# function again later) fails with "'numpy.ndarray' object is not callable".
conf_matrix = confusion_matrix(y_test, ensemble_predictions)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[6183  629]
 [ 409  588]]


### SVM+LR

In [None]:
# Creating the classifiers.
# probability=True lets the SVM supply class probabilities for soft voting; its
# probability estimates rely on random shuffling, so random_state is fixed for reproducibility.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000)

# Creating a two-model ensemble (SVM + Logistic Regression) using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logreg', logreg_classifier),
    ],
    voting='soft',  # Using 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

# Predicting on the testing data
ensemble_predictions = ensemble_classifier.predict(X_test)

# Get the classification report
class_report = classification_report(y_test, ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.91      0.92      6812
           1       0.48      0.58      0.52       997

    accuracy                           0.87      7809
   macro avg       0.71      0.74      0.72      7809
weighted avg       0.88      0.87      0.87      7809



### SVM + NB

In [None]:
# Creating the classifiers.
# probability=True lets the SVM supply class probabilities for soft voting; its
# probability estimates rely on random shuffling, so random_state is fixed for reproducibility.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Creating a two-model ensemble (SVM + Naive Bayes) using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('nb', naive_bayes_classifier),
    ],
    voting='soft',  # Using 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

# Predicting on the testing data
ensemble_predictions = ensemble_classifier.predict(X_test)

# Get the classification report
class_report = classification_report(y_test, ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.90      0.92      6812
           1       0.46      0.60      0.52       997

    accuracy                           0.86      7809
   macro avg       0.70      0.75      0.72      7809
weighted avg       0.88      0.86      0.87      7809



### LR + NB

In [None]:
# Base learners for the two-model ensemble (Logistic Regression + Naive Bayes)
logreg_classifier = LogisticRegression(max_iter=1000)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Soft-voting ensemble: a probability-based combination of the two base learners
ensemble_classifier = VotingClassifier(
    estimators=[('logreg', logreg_classifier), ('nb', naive_bayes_classifier)],
    voting='soft',
)

# Fit on the SMOTE-resampled training data
ensemble_classifier.fit(X=X_train_resampled, y=y_train_resampled)

# Predict on the untouched test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

# Summarise per-class precision / recall / F1
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.90      0.92      6812
           1       0.48      0.62      0.54       997

    accuracy                           0.87      7809
   macro avg       0.71      0.76      0.73      7809
weighted avg       0.88      0.87      0.87      7809



## Instagram

In [None]:
# Load the cleaned Instagram dataset, rename the text column and drop rows whose
# text is NaN. Chained calls return new frames instead of mutating in place.
insta_df = (
    pd.read_csv('/content/drive/MyDrive/MSC Data science/Thesis/Final data/clean_insta.csv')
    .rename(columns={"joined_text": "text"})
    .dropna(subset=['text'])
)
# X (independent) and y (dependent/target variable) features.
# pd.factorize encodes the labels as integer codes; class_names keeps the original label values.
X = insta_df['text']
y, class_names = pd.factorize(insta_df['label'])

# Split the data once into an 80-20 train-test split.
# (The split was previously executed twice with identical arguments; the second call was redundant.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(4488,)
(1122,)
(4488,)
(1122,)


Instagram dataset: Having found better individual performance with TF-IDF in our previous experiments using the SVM, Logistic Regression and Naive Bayes models, we use TF-IDF for our ensemble approach as well.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features capped at 5000 terms (max_features).
# The vectorizer is fitted on the training text only and then reused for the test text.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train, X_test = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

In [None]:
# Handle class imbalance with SMOTE.
# Only the training split is resampled; X_test / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Creating the classifiers.
# probability=True is required so the SVM can supply class probabilities for soft voting.
# Its probability estimates rely on random shuffling, so random_state is fixed
# (same seed as the split and SMOTE) to make the results reproducible.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Creating the ensemble using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logreg', logreg_classifier),
        ('nb', naive_bayes_classifier),
    ],
    voting='soft',  # Using 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Class predictions of the fitted ensemble for the vectorized test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the ensemble on the test split
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       843
           1       0.57      0.61      0.59       279

    accuracy                           0.79      1122
   macro avg       0.72      0.73      0.72      1122
weighted avg       0.79      0.79      0.79      1122



### SVM + LR

In [None]:
# Creating the classifiers.
# probability=True lets the SVM supply class probabilities for soft voting; its
# probability estimates rely on random shuffling, so random_state is fixed for reproducibility.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
logreg_classifier = LogisticRegression(max_iter=1000)

# Creating a two-model ensemble (SVM + Logistic Regression) using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('logreg', logreg_classifier),
    ],
    voting='soft',  # Using 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Class predictions of the fitted ensemble for the vectorized test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the ensemble on the test split
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.86       843
           1       0.59      0.58      0.58       279

    accuracy                           0.80      1122
   macro avg       0.73      0.72      0.72      1122
weighted avg       0.79      0.80      0.80      1122



### SVM + NB

In [None]:
# Creating the classifiers.
# probability=True lets the SVM supply class probabilities for soft voting; its
# probability estimates rely on random shuffling, so random_state is fixed for reproducibility.
svm_classifier = SVC(kernel='linear', C=1, probability=True, random_state=42)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Creating a two-model ensemble (SVM + Naive Bayes) using a voting classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        ('svm', svm_classifier),
        ('nb', naive_bayes_classifier),
    ],
    voting='soft',  # Using 'soft' voting since it is a probability-based ensemble
)

# Training the ensemble on the SMOTE-resampled training data
ensemble_classifier.fit(X_train_resampled, y_train_resampled)

In [None]:
# Class predictions of the fitted ensemble for the vectorized test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the ensemble on the test split
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       843
           1       0.57      0.61      0.58       279

    accuracy                           0.79      1122
   macro avg       0.72      0.73      0.72      1122
weighted avg       0.79      0.79      0.79      1122



### LR+NB

In [None]:
# Base learners for the two-model ensemble (Logistic Regression + Naive Bayes)
logreg_classifier = LogisticRegression(max_iter=1000)
naive_bayes_classifier = MultinomialNB(alpha=1.0)

# Soft-voting ensemble: a probability-based combination of the two base learners
ensemble_classifier = VotingClassifier(
    estimators=[('logreg', logreg_classifier), ('nb', naive_bayes_classifier)],
    voting='soft',
)

# Fit on the SMOTE-resampled training data
ensemble_classifier.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Class predictions of the fitted ensemble for the vectorized test split
ensemble_predictions = ensemble_classifier.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 of the ensemble on the test split
class_report = classification_report(y_true=y_test, y_pred=ensemble_predictions)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.83      0.86       843
           1       0.57      0.68      0.62       279

    accuracy                           0.80      1122
   macro avg       0.73      0.76      0.74      1122
weighted avg       0.81      0.80      0.80      1122

