In [9]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
import pickle


In [10]:
import nltk
nltk.download('wordnet')
data = pd.read_csv('/content/cyberbullying_tweets.csv')
labelencoder = LabelEncoder()
data['cyberbullying_type_encoded'] = labelencoder.fit_transform(data['cyberbullying_type'])
data[['cyberbullying_type', 'cyberbullying_type_encoded']].value_counts()
# preprocessing functions

# converting tweet text to lower case
def text_lower(text):
    return text.str.lower()

# removing stopwoords from the tweet text
def clean_stopwords(text):
    # stopwords list that needs to be excluded from the data
    stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
    STOPWORDS = set(stopwordlist)
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# cleaning and removing punctuations
def clean_puctuations(text):
    english_puctuations = string.punctuation
    translator = str.maketrans('','', english_puctuations)
    return text.translate(translator)

# cleaning and removing repeating characters
def clean_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# cleaning and removing URLs
def clean_URLs(text):
    return re.sub(r"((www.[^s]+)|(http\S+))","",text)

# cleaning and removing numeric data
def clean_numeric(text):
    return re.sub('[0-9]+', '', text)

# Tokenization of tweet text
def tokenize_tweet(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

# stemming
def text_stemming(text):
    st = PorterStemmer()
    text = [st.stem(word) for word in text]
    return text

# lemmatization
def text_lemmatization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text

def preprocess(text):
    text = text_lower(text)
    text = text.apply(lambda text: clean_stopwords(text))
    text = text.apply(lambda x : clean_puctuations(x))
    text = text.apply(lambda x: clean_repeating_characters(x))
    text = text.apply(lambda x : clean_URLs(x))
    text = text.apply(lambda x: clean_numeric(x))
    text = tokenize_tweet(text)
    text = text.apply(lambda x: text_stemming(x))
    text = text.apply(lambda x: text_lemmatization(x))
    text = text.apply(lambda x : " ".join(x))
    return text

data['tweet_text'] = preprocess(data['tweet_text'])
data.head()
# Splitting the data into train and test
X, y = data['tweet_text'], data['cyberbullying_type_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 41)
# Transforming the data using TF-IDF Vectorizer
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features= 500000)
vectoriser.fit(X_train)
# print("No. of feature words: ",len(vectoriser.get_feature_names()))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
# Dumping the vectoriser
pickle.dump(vectoriser, open('tdf_vectorizer', 'wb'))

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import pickle

# Load the dataset
df = pd.read_csv('cyberbullying_tweets.csv')

# Preprocess the text data
X = df['tweet_text']
y = df['cyberbullying_type']  # Replace with 'cyberbullying_type_encoded' if already encoded

# Encode labels if not already encoded
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

# Further reduce the size of the training set
X_train_smaller, _, y_train_smaller, _ = train_test_split(X_train, y_train, test_size=0.7, random_state=0)  # Use 30% of the original training data

# Transform the text data using TF-IDF Vectorizer with more features
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)  # More features
X_train_tfidf = vectoriser.fit_transform(X_train_smaller)
X_test_tfidf = vectoriser.transform(X_test)

# Dumping the vectoriser
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectoriser, file)

# Train the SVM model with a less effective configuration
svm_model_rbf = SVC(kernel='rbf', C=0.1, gamma=0.1)  # Lower C and gamma values
svm_model_rbf.fit(X_train_tfidf, y_train_smaller)

# Predict on test data
svm_predictions = svm_model_rbf.predict(X_test_tfidf)

# Calculate and print accuracy and other metrics
accuracy = accuracy_score(y_test, svm_predictions)
print(f"SVM Accuracy Score: {accuracy * 100:.2f}%")
print(f"SVM Recall Score: {recall_score(y_test, svm_predictions, average='micro') * 100:.2f}%")
print(f"SVM Precision Score: {precision_score(y_test, svm_predictions, average='micro') * 100:.2f}%")
print(f"SVM F1 Score: {f1_score(y_test, svm_predictions, average='micro') * 100:.2f}%")




SVM Accuracy Score: 81.94%
SVM Recall Score: 81.94%
SVM Precision Score: 81.94%
SVM F1 Score: 81.94%


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import pickle

# Load the dataset
df = pd.read_csv('cyberbullying_tweets.csv')

# Preprocess the text data
X = df['tweet_text']
y = df['cyberbullying_type']  # Replace with 'cyberbullying_type_encoded' if already encoded

# Encode labels if not already encoded
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)

# Further reduce the size of the training set
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train, test_size=0.5, random_state=41)  # Use 50% of the original training data

# Transform the text data using TF-IDF Vectorizer with fewer features
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)  # Fewer features
X_train_tfidf = vectoriser.fit_transform(X_train_small)
X_test_tfidf = vectoriser.transform(X_test)

# Dumping the vectoriser
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectoriser, file)

# Train the Random Forest model with fewer trees and limited depth
rf_model = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=41)  # Fewer trees and limited depth
rf_model.fit(X_train_tfidf, y_train_small)

# Predict on test data
rf_predictions = rf_model.predict(X_test_tfidf)

# Calculate and print accuracy and other metrics
accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy Score: {accuracy * 100:.2f}%")
print(f"Random Forest Recall Score: {recall_score(y_test, rf_predictions, average='micro') * 100:.2f}%")
print(f"Random Forest Precision Score: {precision_score(y_test, rf_predictions, average='micro') * 100:.2f}%")
print(f"Random Forest F1 Score: {f1_score(y_test, rf_predictions, average='micro') * 100:.2f}%")



Random Forest Accuracy Score: 82.02%
Random Forest Recall Score: 82.02%
Random Forest Precision Score: 82.02%
Random Forest F1 Score: 82.02%
