In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report


In [None]:
# Load the raw tweet dataset and print its first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="cyber.csv")
print(df.head(n=5))

In [None]:
import nltk

# Fetch the NLTK 'stopwords' corpus; stopwords.words('english') is read from it
# when the preprocessing resources are built.
nltk.download(info_or_id='stopwords')

In [None]:
import nltk

# Fetch the NLTK 'punkt' package (tokenizer data; presumably what word_tokenize
# relies on -- confirm against the installed NLTK version).
nltk.download(info_or_id='punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Steps 1-2: Load the dataset and drop every row that is missing either the
# tweet text or its label; both columns are required by the steps that follow.
df = pd.read_csv("cyber.csv").dropna(subset=['tweet_text', 'cyberbullying_type'])

# Step 3 (setup): Build the shared resources used by the text preprocessing.
# The stop words are stored as a set so membership checks are cheap.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    The text is tokenised with ``word_tokenize``; each token is lower-cased,
    discarded unless it is purely alphabetic (this drops punctuation as well
    as tokens containing digits or symbols), discarded if it is in
    ``stop_words``, and finally stemmed with the module-level ``ps`` stemmer.

    Args:
        text: Raw tweet text.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives the filters).
    """
    stems = []
    for raw_token in word_tokenize(text):
        token = raw_token.lower()
        # Keep alphabetic tokens only: removes punctuation and mixed tokens.
        if not token.isalpha():
            continue
        # Stop words are matched after lower-casing.
        if token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

# Clean every tweet once; the cleaned text is what gets vectorised.
df['processed_tweets'] = df['tweet_text'].apply(preprocess_text)
y = df['cyberbullying_type']

# Step 4: Split the cleaned text BEFORE fitting TF-IDF. Fitting the vectoriser
# on the full dataset lets test-set vocabulary and IDF weights leak into the
# training features, which makes the held-out scores optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['processed_tweets'], y, test_size=0.2, random_state=42
)

# Step 5: Learn the TF-IDF vocabulary/weights from the training tweets only,
# then apply that fitted transform to the held-out tweets.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# TF-IDF matrix for every tweet, kept under its original name `X`.
X = tfidf_vectorizer.transform(df['processed_tweets'])

# Step 6: Train a classification model (Naive Bayes used as an example)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Step 7: Predict on the held-out split; evaluated in the cells that follow
y_pred = classifier.predict(X_test)


In [None]:
# Per-class precision / recall / F1 for the predictions currently held in y_pred.
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))


Classification Report:
                      precision    recall  f1-score   support

                age       0.78      0.96      0.86      1603
          ethnicity       0.86      0.90      0.88      1603
             gender       0.81      0.82      0.81      1531
  not_cyberbullying       0.67      0.41      0.51      1624
other_cyberbullying       0.62      0.59      0.60      1612
           religion       0.83      0.96      0.89      1566

           accuracy                           0.77      9539
          macro avg       0.76      0.77      0.76      9539
       weighted avg       0.76      0.77      0.76      9539



In [None]:
# Overall fraction of held-out tweets whose label was predicted correctly.
print("Accuracy Score:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy Score: 0.7725128420169829


In [None]:
# Step 5: Select features (cleaned tweet text) and labels
X = df['processed_tweets']
y = df['cyberbullying_type']

# Step 6: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform words into vectors using the already-fitted TF-IDF vectoriser.
# This does not depend on the model, so it is done once instead of once per
# loop iteration.
X_train_vectorized = tfidf_vectorizer.transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Step 7: Apply models and generate predictions
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    # The default iteration limit stopped the solver early with a
    # ConvergenceWarning; allow it more iterations.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seed the tree-based models so their results are reproducible.
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Maps model name -> predicted labels for the test split.
predictions = {}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train_vectorized, y_train)

    # Make predictions
    y_pred = model.predict(X_test_vectorized)

    # Save predictions
    predictions[model_name] = y_pred

# NOTE: after the loop, `model_name` and `y_pred` stay bound to the LAST model
# in `models`; the cells immediately below read them.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# `model_name` still holds the last key iterated by the model loop, so this
# names only the final model that was trained there.
print(f"\nModel: {model_name}")



Model: Random Forest


In [None]:
# Confusion matrix for whatever predictions `y_pred` currently holds
# (the last model fitted by the comparison loop).
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Confusion Matrix:
 [[1558    0    5   22   16    2]
 [   2 1575    2    8   15    1]
 [   1    2 1287  106  133    2]
 [  22    5   59  840  639   59]
 [   9    6   98  434 1058    7]
 [   0    4    6   24   31 1501]]


In [None]:
# Per-class metrics for whatever predictions `y_pred` currently holds
# (the last model fitted by the comparison loop).
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))



Classification Report:
                      precision    recall  f1-score   support

                age       0.98      0.97      0.98      1603
          ethnicity       0.99      0.98      0.99      1603
             gender       0.88      0.84      0.86      1531
  not_cyberbullying       0.59      0.52      0.55      1624
other_cyberbullying       0.56      0.66      0.60      1612
           religion       0.95      0.96      0.96      1566

           accuracy                           0.82      9539
          macro avg       0.83      0.82      0.82      9539
       weighted avg       0.82      0.82      0.82      9539



In [None]:
# Step 10: Pick the model whose stored test-set predictions give the highest
# accuracy; on a tie, the first such entry in `models` wins.
best_model_name = max(
    models,
    key=lambda candidate: accuracy_score(y_test, predictions[candidate]),
)
best_model = models[best_model_name]

In [None]:
# Apply Logistic Regression model.
# The default iteration limit stopped the solver early with a
# ConvergenceWarning ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the limit is
# raised to let the optimiser converge on the TF-IDF features.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train_vectorized, y_train)
lr_predictions = lr_classifier.predict(X_test_vectorized)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate the Logistic Regression predictions on the held-out split.
print("\nLogistic Regression:")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=lr_predictions))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=lr_predictions))


Logistic Regression:
Confusion Matrix:
 [[1550    0    6   27   20    0]
 [   3 1555    3   12   25    5]
 [   2    4 1278  123  120    4]
 [  51    7   66  919  530   51]
 [   8    7   70  384 1124   19]
 [   2    2    4   67   21 1470]]

Classification Report:
                      precision    recall  f1-score   support

                age       0.96      0.97      0.96      1603
          ethnicity       0.99      0.97      0.98      1603
             gender       0.90      0.83      0.86      1531
  not_cyberbullying       0.60      0.57      0.58      1624
other_cyberbullying       0.61      0.70      0.65      1612
           religion       0.95      0.94      0.94      1566

           accuracy                           0.83      9539
          macro avg       0.83      0.83      0.83      9539
       weighted avg       0.83      0.83      0.83      9539



In [None]:
# Report the winner chosen from the comparison loop's stored predictions.
print(f"\nBest Model: {best_model_name}")
print("Accuracy Score:", accuracy_score(y_true=y_test, y_pred=predictions[best_model_name]))



Best Model: Logistic Regression
Accuracy Score: 0.8277597232414299


In [None]:
# Vectorise both splits with the already-fitted TF-IDF vectoriser.
X_train_vectorized = tfidf_vectorizer.transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Fit Multinomial Naive Bayes on the training vectors and predict the test
# split (fit() returns the estimator itself, so the calls can be chained).
nb_classifier = MultinomialNB()
nb_predictions = nb_classifier.fit(X_train_vectorized, y_train).predict(X_test_vectorized)

In [None]:
# Evaluate the Multinomial Naive Bayes predictions on the held-out split.
print("\nMultinomial Naive Bayes:")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=nb_predictions))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=nb_predictions))


Multinomial Naive Bayes:
Confusion Matrix:
 [[1545    6    5   17   20   10]
 [  45 1449   17    9   31   52]
 [  24   38 1250  104   97   18]
 [ 183   76  116  673  425  151]
 [ 163  106  148  180  944   71]
 [  12    6   14   21    5 1508]]

Classification Report:
                      precision    recall  f1-score   support

                age       0.78      0.96      0.86      1603
          ethnicity       0.86      0.90      0.88      1603
             gender       0.81      0.82      0.81      1531
  not_cyberbullying       0.67      0.41      0.51      1624
other_cyberbullying       0.62      0.59      0.60      1612
           religion       0.83      0.96      0.89      1566

           accuracy                           0.77      9539
          macro avg       0.76      0.77      0.76      9539
       weighted avg       0.76      0.77      0.76      9539



In [None]:
# Apply Decision Tree model.
# Tree construction is randomised; a fixed seed keeps the reported metrics
# reproducible across kernel restarts.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train_vectorized, y_train)
dt_predictions = dt_classifier.predict(X_test_vectorized)

In [None]:
# Evaluate the Decision Tree predictions on the held-out split.
print("\nDecision Tree:")
print("Confusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=dt_predictions))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=dt_predictions))


Decision Tree:
Confusion Matrix:
 [[1536    3    7   38   17    2]
 [   5 1547   10   20   17    4]
 [   1    1 1285  123  116    5]
 [  21    6   88  833  612   64]
 [  12    8  118  604  856   14]
 [   2   11    9   54   35 1455]]

Classification Report:
                      precision    recall  f1-score   support

                age       0.97      0.96      0.97      1603
          ethnicity       0.98      0.97      0.97      1603
             gender       0.85      0.84      0.84      1531
  not_cyberbullying       0.50      0.51      0.51      1624
other_cyberbullying       0.52      0.53      0.52      1612
           religion       0.94      0.93      0.94      1566

           accuracy                           0.79      9539
          macro avg       0.79      0.79      0.79      9539
       weighted avg       0.79      0.79      0.79      9539



In [None]:
# Apply Random Forest model (seeded so the reported metrics are reproducible).
# RandomForestClassifier and accuracy_score are already imported in the
# notebook's first cell.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_vectorized, y_train)
rf_predictions = rf_classifier.predict(X_test_vectorized)

# Compute confusion matrix and classification report for Random Forest
print("\nRandom Forest:")
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))
print("\nClassification Report:\n", classification_report(y_test, rf_predictions))

# Calculate accuracy scores for each model.
# The Naive Bayes / Logistic Regression / Decision Tree accuracies were never
# assigned by any earlier cell, so printing them raised NameError on a fresh
# kernel; derive them here from the predictions those cells stored.
nb_accuracy = accuracy_score(y_test, nb_predictions)
lr_accuracy = accuracy_score(y_test, lr_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Print accuracy scores
print("\nAccuracy Scores:")
print(f"Multinomial Naive Bayes: {nb_accuracy}")
print(f"Logistic Regression: {lr_accuracy}")
print(f"Decision Tree: {dt_accuracy}")
print(f"Random Forest: {rf_accuracy}")

# Report the model with the best accuracy.
# `best_model` is a (name, accuracy) tuple; on a tie the earliest entry wins.
best_model = max([('Multinomial Naive Bayes', nb_accuracy),
                  ('Logistic Regression', lr_accuracy),
                  ('Decision Tree', dt_accuracy),
                  ('Random Forest', rf_accuracy)],
                 key=lambda entry: entry[1])

print(f"\nModel with the Best Accuracy: {best_model[0]}")
print(f"Accuracy Score: {best_model[1]}")
