# Machine Learning

# Bag of Words Extraction

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Read the cleaned Twitter and Instagram corpora from Drive, exposing the
# 'joined_text' column under the name 'text' that the modelling cells read.
twitter_df = pd.read_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/clean_twitter.csv'
).rename(columns={"joined_text": "text"})
insta_df = pd.read_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/clean_insta.csv'
).rename(columns={"joined_text": "text"})

In [None]:
# Drop rows with a missing 'text' value from BOTH platform frames.
# Previously only the Instagram frame was cleaned, so any NaN text in the
# Twitter frame would be carried through the train/test split, written to the
# saved CSVs, and eventually handed to the vectorizer, which expects string
# documents. Reassigning (rather than inplace=True) keeps the cell safe to re-run.
insta_df = insta_df.dropna(subset=['text'])
twitter_df = twitter_df.dropna(subset=['text'])

## Creating training datasets and test sets for each platform dataset

In [None]:
# Hold out 20% of the Twitter rows as a test set; the fixed seed keeps the
# partition repeatable across runs.
train_twitter_df, test_twitter_df = train_test_split(
    twitter_df,
    test_size=0.2,
    random_state=42,
)

# Persist both partitions to Drive (row index omitted from the files)
train_twitter_df.to_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/twitter_train.csv',
    index=False,
)
test_twitter_df.to_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/twitter_test.csv',
    index=False,
)

In [None]:
# Hold out 20% of the Instagram rows as a test set; the fixed seed keeps the
# partition repeatable across runs.
train_insta_df, test_insta_df = train_test_split(
    insta_df,
    test_size=0.2,
    random_state=42,
)

# Persist both partitions to Drive (row index omitted from the files)
train_insta_df.to_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/insta_train.csv',
    index=False,
)
test_insta_df.to_csv(
    '/content/drive/MyDrive/MSC Data science/Thesis/Final data/insta_test.csv',
    index=False,
)

In [None]:
# Rebind twitter_df / insta_df to the saved TRAINING partitions only.
# From here on these names no longer refer to the full cleaned datasets.
twitter_df, insta_df = (
    pd.read_csv(train_csv)
    for train_csv in (
        '/content/drive/MyDrive/MSC Data science/Thesis/Final data/twitter_train.csv',
        '/content/drive/MyDrive/MSC Data science/Thesis/Final data/insta_train.csv',
    )
)

Subsample the Twitter Dataset: Randomly sample a subset of the Twitter dataset to match the size of the Instagram dataset. This ensures that both datasets contribute equally to the combined dataset.

In [None]:
# Draw as many Twitter rows as there are Instagram rows (seeded sample), then
# stack the two platforms into one frame with a fresh 0..n-1 index.
n_insta_rows = len(insta_df)
twitter_subset = twitter_df.sample(n=n_insta_rows, random_state=42)
combined_df = pd.concat(
    [twitter_subset, insta_df],
    ignore_index=True,
)

## SVM Count Vectorizer

In [None]:
# Features are the raw text column. Targets are the integer codes that
# pd.factorize assigns to the label strings; class_names keeps the label
# string belonging to each code.
X = combined_df['text']
y, class_names = pd.factorize(combined_df['label'])

# 80/20 train/test partition of the combined data (seeded)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One shape per line: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")



(7180,)
(1796,)
(7180,)
(1796,)


In [None]:
# Report which integer code stands for which label string
code_descriptions = [
    f"Code {code} represents class '{class_name}'"
    for code, class_name in enumerate(class_names)
]
for description in code_descriptions:
    print(description)


Code 0 represents class 'cyberbullying'
Code 1 represents class 'non-cyberbullying'


In [None]:
# Bag-of-words features: the vocabulary (capped at 5000 terms) is learned from
# the training text only, then reused unchanged to encode the test text.
count_vectorizer = CountVectorizer(
    max_features=5000,
)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

In [None]:
# Oversample the count-vectorized training set with SMOTE (seeded); the test
# matrix is left untouched.
smote = SMOTE(random_state=42)
resampled_counts = smote.fit_resample(X_train_counts, y_train)
X_train_resampled, y_train_resampled = resampled_counts

In [None]:
# Linear-kernel SVM trained on the resampled training features and scored on
# the held-out split of the combined data.
svm_model = SVC(kernel='linear', C=1).fit(X_train_resampled, y_train_resampled)
svm_predictions = svm_model.predict(X_test_counts)

print("Support Vector Machine (SVM) Classification Report:")
svm_report = classification_report(y_true=y_test, y_pred=svm_predictions)
print(svm_report)

Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.80      0.81       974
           1       0.77      0.81      0.79       822

    accuracy                           0.80      1796
   macro avg       0.80      0.80      0.80      1796
weighted avg       0.80      0.80      0.80      1796



### Testing SVM on Individual platform datasets

In [None]:
# Load the held-out per-platform test sets written by the earlier split cells
insta_test_df = pd.read_csv('/content/drive/MyDrive/MSC Data science/Thesis/Final data/insta_test.csv')
twitter_test_df = pd.read_csv('/content/drive/MyDrive/MSC Data science/Thesis/Final data/twitter_test.csv')

# Build the label -> integer mapping from class_names, i.e. from the very
# encoding pd.factorize produced for the training labels. factorize numbers
# labels in order of first appearance, so a hand-written dict would silently
# invert the classes if that order ever changed; deriving it keeps the test
# encoding locked to the training encoding.
class_mapping = {class_name: code for code, class_name in enumerate(class_names)}

# Convert the string labels to the integer codes the models were trained on
insta_test_df['label'] = insta_test_df['label'].map(class_mapping)
twitter_test_df['label'] = twitter_test_df['label'].map(class_mapping)

# .map() turns any label missing from class_mapping into NaN; fail here with a
# clear message instead of deep inside a later metrics call.
unmapped_label_counts = {
    'Instagram': int(insta_test_df['label'].isna().sum()),
    'Twitter': int(twitter_test_df['label'].isna().sum()),
}
if any(unmapped_label_counts.values()):
    raise ValueError(
        f"Test rows with missing or unseen labels: {unmapped_label_counts}"
    )

In [None]:
# Instagram: encode with the fitted bag-of-words vocabulary, predict, score
insta_test_vectorized = count_vectorizer.transform(insta_test_df['text'])
insta_test_predictions = svm_model.predict(insta_test_vectorized)
insta_report = classification_report(
    y_true=insta_test_df['label'],
    y_pred=insta_test_predictions,
)

# Twitter: same steps on the Twitter hold-out set
twitter_test_vectorized = count_vectorizer.transform(twitter_test_df['text'])
twitter_test_predictions = svm_model.predict(twitter_test_vectorized)
twitter_report = classification_report(
    y_true=twitter_test_df['label'],
    y_pred=twitter_test_predictions,
)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)


Instagram Test Report:
               precision    recall  f1-score   support

           0       0.50      0.53      0.51       279
           1       0.84      0.83      0.83       843

    accuracy                           0.75      1122
   macro avg       0.67      0.68      0.67      1122
weighted avg       0.76      0.75      0.75      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.94      0.85      0.89      6812
           1       0.39      0.66      0.49       997

    accuracy                           0.82      7809
   macro avg       0.67      0.75      0.69      7809
weighted avg       0.87      0.82      0.84      7809



## Logistic Regression Count Vectorizer

In [None]:
# Logistic regression (up to 1000 solver iterations) trained on the resampled
# training features and scored on the held-out split of the combined data.
logreg_model = LogisticRegression(max_iter=1000, C=1).fit(
    X_train_resampled,
    y_train_resampled,
)
logreg_predictions = logreg_model.predict(X_test_counts)

print("Logistic Regression Classification Report:")
logreg_report = classification_report(y_true=y_test, y_pred=logreg_predictions)
print(logreg_report)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.80      0.83       974
           1       0.78      0.84      0.81       822

    accuracy                           0.82      1796
   macro avg       0.82      0.82      0.82      1796
weighted avg       0.82      0.82      0.82      1796



### Testing Log Regression on Individual platform datasets

In [None]:
# Instagram hold-out set: predict with the logistic-regression model and score
insta_test_predictions = logreg_model.predict(insta_test_vectorized)
insta_report = classification_report(
    y_true=insta_test_df['label'],
    y_pred=insta_test_predictions,
)

# Twitter hold-out set: same steps
twitter_test_predictions = logreg_model.predict(twitter_test_vectorized)
twitter_report = classification_report(
    y_true=twitter_test_df['label'],
    y_pred=twitter_test_predictions,
)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)

Instagram Test Report:
               precision    recall  f1-score   support

           0       0.55      0.50      0.52       279
           1       0.84      0.86      0.85       843

    accuracy                           0.77      1122
   macro avg       0.69      0.68      0.69      1122
weighted avg       0.77      0.77      0.77      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.95      0.85      0.90      6812
           1       0.41      0.70      0.52       997

    accuracy                           0.83      7809
   macro avg       0.68      0.78      0.71      7809
weighted avg       0.88      0.83      0.85      7809



## Naive Bayes Count Vectorizer

In [None]:
# Multinomial Naive Bayes (smoothing alpha=1.0) trained on the resampled
# training features and scored on the held-out split of the combined data.
naive_bayes_model = MultinomialNB(alpha=1.0).fit(
    X_train_resampled,
    y_train_resampled,
)
naive_bayes_predictions = naive_bayes_model.predict(X_test_counts)

print("Naive Bayes Classification Report:")
naive_bayes_report = classification_report(y_true=y_test, y_pred=naive_bayes_predictions)
print(naive_bayes_report)

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       974
           1       0.76      0.79      0.77       822

    accuracy                           0.79      1796
   macro avg       0.79      0.79      0.79      1796
weighted avg       0.79      0.79      0.79      1796



### Testing Naive Bayes on Individual Platform datasets

In [None]:
# Instagram hold-out set: predict with the Naive Bayes model and score
insta_test_predictions = naive_bayes_model.predict(insta_test_vectorized)
insta_report = classification_report(
    y_true=insta_test_df['label'],
    y_pred=insta_test_predictions,
)

# Twitter hold-out set: same steps
twitter_test_predictions = naive_bayes_model.predict(twitter_test_vectorized)
twitter_report = classification_report(
    y_true=twitter_test_df['label'],
    y_pred=twitter_test_predictions,
)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)

Instagram Test Report:
               precision    recall  f1-score   support

           0       0.47      0.51      0.49       279
           1       0.83      0.81      0.82       843

    accuracy                           0.74      1122
   macro avg       0.65      0.66      0.66      1122
weighted avg       0.74      0.74      0.74      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.94      0.87      0.90      6812
           1       0.41      0.60      0.48       997

    accuracy                           0.84      7809
   macro avg       0.67      0.74      0.69      7809
weighted avg       0.87      0.84      0.85      7809



## TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: vocabulary (capped at 5000 terms) and weights are learned
# from the training text only, then reused unchanged to encode the test text.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Oversample the TF-IDF training set with SMOTE (seeded). This rebinds
# X_train_resampled / y_train_resampled, replacing the count-based versions.
smote = SMOTE(random_state=42)
resampled_tfidf = smote.fit_resample(X_train_tfidf, y_train)
X_train_resampled, y_train_resampled = resampled_tfidf

## SVM TFIDF MODEL

In [None]:
# Linear-kernel SVM trained on the resampled TF-IDF features and scored on the
# TF-IDF encoding of the held-out split of the combined data.
svm_model = SVC(kernel='linear', C=1).fit(X_train_resampled, y_train_resampled)
svm_predictions = svm_model.predict(X_test_tfidf)

print("Support Vector Machine (SVM) Classification Report:")
svm_report = classification_report(y_true=y_test, y_pred=svm_predictions)
print(svm_report)

Support Vector Machine (SVM) Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       974
           1       0.77      0.85      0.80       822

    accuracy                           0.81      1796
   macro avg       0.81      0.81      0.81      1796
weighted avg       0.82      0.81      0.81      1796



### Testing SVM on platform datasets

In [None]:
# Encode the platform test sets with the fitted TF-IDF vectorizer.
# svm_model at this point was trained on TF-IDF features, but this cell used to
# predict on insta_test_vectorized / twitter_test_vectorized, which were built
# with count_vectorizer in the bag-of-words section. That ran without error yet
# fed raw term counts to a TF-IDF model, so the reported scores were not valid.
# NOTE: any saved report output from before this fix is stale; re-run the cell.
insta_test_tfidf = tfidf_vectorizer.transform(insta_test_df['text'])
twitter_test_tfidf = tfidf_vectorizer.transform(twitter_test_df['text'])

# Predict on test data
insta_test_predictions = svm_model.predict(insta_test_tfidf)
twitter_test_predictions = svm_model.predict(twitter_test_tfidf)

# Evaluation
insta_report = classification_report(insta_test_df['label'], insta_test_predictions)
twitter_report = classification_report(twitter_test_df['label'], twitter_test_predictions)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)

Instagram Test Report:
               precision    recall  f1-score   support

           0       0.42      0.75      0.53       279
           1       0.89      0.65      0.75       843

    accuracy                           0.68      1122
   macro avg       0.65      0.70      0.64      1122
weighted avg       0.77      0.68      0.70      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      6812
           1       0.48      0.41      0.44       997

    accuracy                           0.87      7809
   macro avg       0.70      0.67      0.68      7809
weighted avg       0.86      0.87      0.86      7809



## Log Reg TFIDF

In [None]:
# Logistic regression (up to 1000 solver iterations) trained on the resampled
# TF-IDF features and scored on the TF-IDF encoding of the held-out split.
logreg_model = LogisticRegression(max_iter=1000, C=1).fit(
    X_train_resampled,
    y_train_resampled,
)
logreg_predictions = logreg_model.predict(X_test_tfidf)

print("Logistic Regression Classification Report:")
logreg_report = classification_report(y_true=y_test, y_pred=logreg_predictions)
print(logreg_report)

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.83       974
           1       0.77      0.85      0.81       822

    accuracy                           0.82      1796
   macro avg       0.82      0.82      0.82      1796
weighted avg       0.82      0.82      0.82      1796



### Testing Log Reg on Individual platform datasets

In [None]:
# Encode the platform test sets with the fitted TF-IDF vectorizer.
# logreg_model at this point was trained on TF-IDF features, but this cell used
# to predict on insta_test_vectorized / twitter_test_vectorized, which were
# built with count_vectorizer in the bag-of-words section. That ran without
# error yet fed raw term counts to a TF-IDF model, so the scores were not valid.
# NOTE: any saved report output from before this fix is stale; re-run the cell.
insta_test_tfidf = tfidf_vectorizer.transform(insta_test_df['text'])
twitter_test_tfidf = tfidf_vectorizer.transform(twitter_test_df['text'])

# Predict on test data
insta_test_predictions = logreg_model.predict(insta_test_tfidf)
twitter_test_predictions = logreg_model.predict(twitter_test_tfidf)

# Evaluation
insta_report = classification_report(insta_test_df['label'], insta_test_predictions)
twitter_report = classification_report(twitter_test_df['label'], twitter_test_predictions)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)

Instagram Test Report:
               precision    recall  f1-score   support

           0       0.41      0.77      0.54       279
           1       0.90      0.64      0.74       843

    accuracy                           0.67      1122
   macro avg       0.65      0.71      0.64      1122
weighted avg       0.78      0.67      0.69      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.93      6812
           1       0.49      0.39      0.44       997

    accuracy                           0.87      7809
   macro avg       0.70      0.67      0.68      7809
weighted avg       0.86      0.87      0.86      7809



## Naive Bayes TFIDF

In [None]:
# Multinomial Naive Bayes (smoothing alpha=1.0) trained on the resampled
# TF-IDF features and scored on the TF-IDF encoding of the held-out split.
naive_bayes_model = MultinomialNB(alpha=1.0).fit(
    X_train_resampled,
    y_train_resampled,
)
naive_bayes_predictions = naive_bayes_model.predict(X_test_tfidf)

print("Naive Bayes Classification Report:")
naive_bayes_report = classification_report(y_true=y_test, y_pred=naive_bayes_predictions)
print(naive_bayes_report)

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.82       974
           1       0.80      0.74      0.77       822

    accuracy                           0.80      1796
   macro avg       0.80      0.79      0.79      1796
weighted avg       0.80      0.80      0.79      1796



### Testing Naive Bayes on Individual platform datasets

In [None]:
# Encode the platform test sets with the fitted TF-IDF vectorizer.
# naive_bayes_model at this point was trained on TF-IDF features, but this cell
# used to predict on insta_test_vectorized / twitter_test_vectorized, which
# were built with count_vectorizer in the bag-of-words section. That ran
# without error yet fed raw term counts to a TF-IDF model, so the scores were
# not valid.
# NOTE: any saved report output from before this fix is stale; re-run the cell.
insta_test_tfidf = tfidf_vectorizer.transform(insta_test_df['text'])
twitter_test_tfidf = tfidf_vectorizer.transform(twitter_test_df['text'])

# Predict on test data
insta_test_predictions = naive_bayes_model.predict(insta_test_tfidf)
twitter_test_predictions = naive_bayes_model.predict(twitter_test_tfidf)

# Evaluation
insta_report = classification_report(insta_test_df['label'], insta_test_predictions)
twitter_report = classification_report(twitter_test_df['label'], twitter_test_predictions)

print("Instagram Test Report:\n", insta_report)
print("Twitter Test Report:\n", twitter_report)

Instagram Test Report:
               precision    recall  f1-score   support

           0       0.46      0.63      0.53       279
           1       0.86      0.75      0.80       843

    accuracy                           0.72      1122
   macro avg       0.66      0.69      0.67      1122
weighted avg       0.76      0.72      0.74      1122

Twitter Test Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.91      6812
           1       0.43      0.51      0.46       997

    accuracy                           0.85      7809
   macro avg       0.68      0.70      0.69      7809
weighted avg       0.86      0.85      0.86      7809

