# Mounting Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Importing Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from time import time

#Loading Dataset

In [None]:
df = pd.read_csv('/content/drive/MyDrive/dataset/public_data_labeled.csv')

#Pre-processing Dataset

In [None]:
# Import label encoder
from sklearn import preprocessing

# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()

# Encode labels in column 'species'.
df['label']= label_encoder.fit_transform(df['label'])

df['label'].unique()

In [None]:
import string
from nltk import pos_tag#pos_tag is a tool that tags the part of speech to the word(POS = Part of Speech)
#example: tagging the word 'drinking' as verb

#function for removing punctuations
def tokenize_remove_punctuation(text):
  clean_text = []         #creaating an empty list to store the cleaned text
  text = text.split(" ")  #spliting all words in a sentence separated
                          #by " " and storing them in a list named 'text'
  for word in text:
    word = list(word)  #spliting all words into alphabets
    new_word = []      #creaating an empty list to store the new word after
                       #removing puntuations

    # spliting the words into alphabets is used because it
    #will convert words like 'reading?' into 'reading'
    for c in word:
      if c not in string.punctuation:     #string.puntuation is a list og all
                                          # puntuation marks , example :@!$%&?, etc.
        new_word.append(c)
      word = "".join(new_word)  #joing the alphabets to create the word after removing all puntuations
    clean_text.append(word)     #storing the word in the list named 'clean_text' to create the
                                # list of words in the sentence
  return clean_text

In [None]:
import nltk
#downloads the list of stopwords
nltk.download('stopwords')

In [None]:
#storing all the stopwords in the list named 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')  #storing only english stopwords , there are stopwords for other language also such as chinese and french

# Function to remove all the stopwords from the sentence
def remove_stopwords(text):
  clean_text = []
  for word in text:
    if word not in stopwords:
      clean_text.append(word)
  return clean_text

In [None]:
#tagging all the words according o their part of speech
def pos_tagging(text):
    try:
        tagged = nltk.pos_tag(text)
        return tagged
    except Excepton as e:
        print(e)

In [None]:
from nltk.corpus import wordnet

#wordnet is a tool that reads that reads the tagging and returns the part of speech
def get_wordnet(pos_tag):
  if pos_tag.startswith('J'):
    return wordnet.ADJ
  elif pos_tag.startswith('V'):
    return wordnet.VERB
  elif pos_tag.startswith('N'):
    return wordnet.NOUN
  elif pos_tag.startswith('R'):
    return wordnet.ADV
  else:
    return wordnet.NOUN

In [None]:
from nltk.stem import WordNetLemmatizer
#WordLemmatizer is a tool that converts word into root word
#Example: historical(word) is converted into history(root-word)

#Now we will create a function that uses all the functions that we have created above

def clean_text(text):
  text = str(text)
  #Converting text to lower-case
  text = text.lower()
  #tokenize and remove punctuations from the text
  text = tokenize_remove_punctuation(text)
  #remove words containing numericals
  text = [word for word in text if not any(c.isdigit() for c in word)]
  #remove stopwords
  text = remove_stopwords(text)
  #remove empty tokens
  text = [ t for t in text if len(t) > 0]
  #pos tagging
  pos_tags = pos_tagging(text)
  #Lemmatize text
  text = [WordNetLemmatizer().lemmatize(t[0],get_wordnet(t[1])) for t in pos_tags]
  #remove words with only one letter
  text = [ t for t in text if len(t)>1]
  #join all words
  text = " ".join(text)
  return text


In [None]:
#don't know what is 'averaged_perceptron_tagger'
#don't know why devansh downloaded it
nltk.download('averaged_perceptron_tagger')
#Downloading the wordnet tool
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
#reseting the index of rows
# Index gets unordered if we drop some rows in our dataset(Example while using dropna() function)
df.reset_index(inplace = True, drop = True)

In [None]:
#this line of code will now be used to pply the functions on each sentences in the 'comments' column
#This will take time as it will use the 'clean_text' function on all the sentences in our dataset
#the .map() function applies the function at each sentences in the 'comments' column
import nltk
nltk.download('omw-1.4')
df['Processed_Comment'] = df['full_text'].map(clean_text)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['Processed_Comment'],
                                                    df['label'],
                                                    random_state=42)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 24783
Number of rows in the training set: 18587
Number of rows in the test set: 6196


In [None]:
 #Instantiate the CountVectorizer method
count_vector = CountVectorizer(stop_words = 'english', lowercase = True)

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

#Random Forest Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


# Preprocess the data
X = df['Processed_Comment']
y = df['label']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform PCA for feature selection
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train.toarray())
X_test_pca = pca.transform(X_test.toarray())

# Train the Random Forest model
randomforest = RandomForestClassifier(n_estimators=1000,min_samples_split=2)
randomforest.fit(X_train_pca, y_train)

# Evaluate the model on the test set
y_pred = randomforest.predict(X_test_pca)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9205164413960056
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.77      0.77       835
           1       0.95      0.95      0.95      4122

    accuracy                           0.92      4957
   macro avg       0.86      0.86      0.86      4957
weighted avg       0.92      0.92      0.92      4957



In [None]:
import pickle
# Store the trained classifier to a file
with open('/content/drive/MyDrive/dataset/model/randomforest_final.pickle', 'wb') as f:
    pickle.dump(randomforest, f)

In [None]:
# Load the stored classifier from a file
import pickle
with open('/content/drive/MyDrive/dataset/model/randomforest_final.pickle', 'rb') as f:
    randomforest = pickle.load(f)

#Perceptron Model

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

# Preprocess the data
X = df['Processed_Comment']
y = df['label']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Perform feature selection using SelectKBest
selector = SelectKBest(chi2, k=500)
X = selector.fit_transform(X, y)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Perceptron model
per = Perceptron(max_iter=100,penalty='l2',alpha=0.0001)
per.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = per.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9017550938067379
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.73       835
           1       0.95      0.93      0.94      4122

    accuracy                           0.90      4957
   macro avg       0.82      0.85      0.83      4957
weighted avg       0.91      0.90      0.90      4957



In [None]:
import pickle
# Store the trained classifier to a file
with open('/content/drive/MyDrive/dataset/model/perceptron_final.pickle', 'wb') as f:
    pickle.dump(per, f)

In [None]:
import pickle
# Load the stored classifier from a file
with open('/content/drive/MyDrive/dataset/model/perceptron_final.pickle', 'rb') as f:
    per = pickle.load(f)

In [None]:
y_pred = per.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9017550938067379
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.78      0.73       835
           1       0.95      0.93      0.94      4122

    accuracy                           0.90      4957
   macro avg       0.82      0.85      0.83      4957
weighted avg       0.91      0.90      0.90      4957



# Ensemble Model (Random Forest With Perceptron)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the pre-trained Perceptron and Random Forest models
perceptron_model = joblib.load('/content/drive/MyDrive/dataset/model/perceptron_final.pickle')
random_forest_model = joblib.load('/content/drive/MyDrive/dataset/model/randomforest_final.pickle')

# Create a list of base estimators
base_estimators = [perceptron_model, random_forest_model]

# Create the ensemble classifier using BaggingClassifier with two base models
ensemble_classifier = BaggingClassifier(
    base_estimator=None,  # Setting base_estimator as None since we're passing a list of base estimators
    n_estimators=2,  # Number of base estimators (two in this case)
    random_state=42
)

# Set the base estimators for the ensemble classifier
ensemble_classifier.estimators_ = base_estimators

# Convert X_train and X_test from Series to DataFrame
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Include unigrams and bigrams

# Fit and transform the training data
X_train_transformed = vectorizer.fit_transform(X_train.values.ravel())

# Transform the test data
X_test_transformed = vectorizer.transform(X_test.values.ravel())

# Train the ensemble classifier on the transformed data
ensemble_classifier.fit(X_train_transformed, y_train)

# Use the ensemble classifier for prediction
prediction = ensemble_classifier.predict(X_test_transformed)

# Evaluate the ensemble classifier
accuracy = ensemble_classifier.score(X_test_transformed, y_test)

# Print the accuracy
print('Accuracy:', accuracy)
print(classification_report(y_test, prediction))

Accuracy: 0.9307617817947063
              precision    recall  f1-score   support

           0       0.78      0.81      0.80      1041
           1       0.96      0.95      0.96      5155

    accuracy                           0.93      6196
   macro avg       0.87      0.88      0.88      6196
weighted avg       0.93      0.93      0.93      6196



# LSTM Model


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Convert the text data to sequences
sequences = tokenizer.texts_to_sequences(df['full_text'])
# Pad the sequences to a fixed length
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
# Build the model
model = Sequential([
    Embedding(5000, 64, input_length=max_length),
    LSTM(64),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

# Glove Embeddings

In [None]:
import gensim.downloader as api
# Download the GloVe model
glove = api.load("glove-twitter-100")

# LSTM With Glove Model

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Download the GloVe model
glove = api.load("glove-twitter-100")
# Load the data
df = pd.read_csv('/content/drive/MyDrive/text-dataset/public_data_labeled.csv')
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['full_text'])
word_index = tokenizer.word_index
# Convert the text data to sequences
sequences = tokenizer.texts_to_sequences(df['full_text'])
# Pad the sequences to a fixed length
max_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length, truncating='post')
# Encode the labels
labels = df['label'].replace({'Offensive': 1, 'Non-offensive': 0}).values
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
# Load the GloVe embeddings
embeddings_index = {}
for word, index in glove.key_to_index.items():
    embeddings_index[word] = glove.get_vector(word)
# Create an embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# Build the model
model = Sequential([
    Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)

#Result Prediction Using LSTM With Glove Embeddings

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Load the data and tokenizer
df = pd.read_csv('/content/drive/MyDrive/txtdatasest/public_data_labeled.csv')
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['full_text'])
# Define the maximum sequence length
max_length = 100  # Update the sequence length to match the model
# Load the trained model
model = load_model('/content/drive/MyDrive/lstmglove_text_model.h5')
attention_model = load_model('/content/drive/MyDrive/lstmglove_text_model.h5')
# Function to predict cyberbullying and provide reasoning
def predict_cyberbullying(text):
    # Preprocess the input text
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, truncating='post')
    # Predict using the trained model
    prediction = model.predict(np.array(padded_sequence))[0][0]
    # Get attention weights for the input text
    attention_weights = attention_model.predict(np.array(padded_sequence))[0]
    # Determine if it's a cyberbullying tweet or not
    if prediction >= 0.5:
        label = "Bullying"
    else:
        label = "Not Bullying"
    # Get the most relevant words based on attention weights
    relevant_words = [tokenizer.index_word[index] for index, weight in enumerate(attention_weights) if weight > 0.5 and index in tokenizer.index_word]
    return label, relevant_words
# User input
user_input = input("Enter a tweet: ")
# Predict and provide reasoning
prediction_label, reasoning_words = predict_cyberbullying(user_input)
# Output the prediction and reasoning
print("Prediction:", prediction_label)
print("Reasoning Words:", reasoning_words)