In [None]:
import copy
import itertools
import re

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.stem import PorterStemmer, WordNetLemmatizer
%matplotlib inline


In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv('./dataset/train.csv')

# Preview the first 10 rows; as the cell's last expression this is what gets rendered.
df.head(10)

In [None]:
# Select only the relevant columns: the identifier, the raw text and the
# three label columns used in the rest of the notebook.
selected_columns = ['id', 'comment_text', 'toxic', 'obscene', 'insult']
# .copy() makes new_df an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
new_df = df[selected_columns].copy()

# Display the first 10 rows of the new DataFrame
new_df.head(10)

In [None]:
# Only the last expression of a cell is rendered, so shape and dtypes are
# printed explicitly instead of being evaluated and thrown away.
print(new_df.shape)
print(new_df.dtypes)
new_df.columns


In [None]:
# info() prints its report itself, so it can run first ...
new_df.info()

# ... whereas describe() returns a frame and has to be the cell's last
# expression to be rendered.
new_df.describe()

In [None]:
# Sum the label columns by name rather than by position: a positional slice
# would also pick up the 'clean' column when this cell is run a second time.
label_columns = ['toxic', 'obscene', 'insult']
rowsums = new_df[label_columns].sum(axis=1)
# assign() returns a new frame, so no column is written into a slice of df.
new_df = new_df.assign(clean=(rowsums == 0))
# Number of comments that carry none of the three labels.
new_df['clean'].sum()

In [None]:
# Count the rows whose 'toxic' label equals 1.
toxic_mask = new_df['toxic'] == 1
len(new_df[toxic_mask])

In [None]:
comment = new_df['comment_text']
# Show the comments labelled 0..4 together with their character counts.
for i in range(5):
    text = comment[i]
    print(f"{i} - {text}\n Length - {len(text)}")

In [None]:
# NumPy array holding len() of every comment in the dataset.
x = np.array(comment.map(len))

# Same report text as a single template, with the line breaks spelled out.
length_report = (
    "The maximum length of comment is:{} \n"
    "        \nThe minimum length of the comment is:{} \n"
    "        \nAnd the average length of a comment is: {}"
)
print(length_report.format(x.max(), x.min(), x.mean()))

In [None]:
# Histogram of comment lengths, drawn through the explicit figure/axes interface.
bins = [1, 200, 400, 600, 800, 1000, 1200, 1400]
fig, ax = plt.subplots()
ax.hist(x, bins=bins, color='Blue')
ax.set_xlabel('Length of comments')
ax.set_ylabel('Number of comments')
# Fixed view limits: x from 0 to 1400, y from 0 to 90000.
ax.axis([0, 1400, 0, 90000])
ax.grid(True)
plt.show()

In [None]:
# Pull the three label columns out of new_df and preview them.
label = new_df[['toxic',  'obscene' ,  'insult']]
print(label.head(10))
# From here on `label` is the underlying NumPy array, no longer a DataFrame.
label = label.values
# Shape of the label array, rendered as the cell output.
label.shape

In [None]:

# Keep only the comments whose length is at most 400 characters
# (len() of a comment string counts characters, not words).
trim_comments = [comment[i] for i in range(comment.shape[0]) if len(comment[i])<=400 ]

In [None]:

# Label rows for exactly the comments kept in trim_comments: the same
# "at most 400 characters" filter, applied to the three label columns so that
# my_labels[k] belongs to trim_comments[k].
keep_mask = (comment.map(len) <= 400).to_numpy()
my_labels = new_df[['toxic', 'obscene', 'insult']].to_numpy()[keep_mask]

In [None]:
# Peek at the first ten label rows (printed, because a bare expression in the
# middle of a cell is not rendered).
print(my_labels[:10, :])

# Comments and labels must stay aligned row for row.
assert len(trim_comments) == len(my_labels)
print(len(trim_comments))
print(my_labels.shape)

# Derive the number of dropped comments from the data instead of hard-coded
# totals, so the message stays correct if the dataset or the cut-off changes.
print("Thus number of removed comments = {}".format(len(comment) - len(trim_comments)))

In [None]:
# Punctuation removal

import string
print(string.punctuation)
# Characters to blank out: every punctuation mark except the apostrophe, plus
# the ten digits.
punctuation_edit = string.punctuation.replace('\'','') +"0123456789"
print (punctuation_edit)
# str.maketrans needs both arguments to have the same length, so the
# replacement string is derived from punctuation_edit rather than typed out
# as a hand-counted run of spaces.
outtab = " " * len(punctuation_edit)
trantab = str.maketrans(punctuation_edit, outtab)

In [None]:
# Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Treat every single lowercase letter a-z as a stop word as well.
stop_words.update(chr(code) for code in range(ord('a'), ord('z') + 1))
print(stop_words)


In [None]:
# Stemming and Lemmatizing
from nltk.stem import WordNetLemmatizer, PorterStemmer
# One lemmatizer and one stemmer instance, created once and used by the
# preprocessing cells that follow.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
def _clean_comment(text):
    """Lower-case `text`, apply `trantab`, drop stop words, then lemmatize (as verb) and stem each word."""
    tokens = text.lower().translate(trantab).split()
    return " ".join(
        stemmer.stem(lemmatizer.lemmatize(token, pos="v"))
        for token in tokens
        if token not in stop_words
    )


# Slice assignment rewrites the existing list in place, element by element.
# NOTE: running this cell again re-processes the already processed text.
trim_comments[:] = [_clean_comment(text) for text in trim_comments]

In [None]:

# Comments after stop words removal, stemming and lemmatizing.
# Prints the first five processed comments; the extra "\n" argument leaves a
# blank line after each one.
for i in range(5):
    print(trim_comments[i],"\n")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts for the first 40000 processed comments, using the custom
# stop-word set (NLTK English stop words plus single letters).
# The vectorizer is fitted exactly once: fitting a throw-away vectorizer with
# stop_words='english' first only built a large dense matrix that was
# overwritten straight away.
custom_stop_words_list = list(stop_words)
count_vector = CountVectorizer(stop_words=custom_stop_words_list)
tf = count_vector.fit_transform(trim_comments[:40000]).toarray()

In [None]:
# Shape of the dense count matrix produced by count_vector.fit_transform(...).toarray().
tf.shape

In [None]:
def shuffle(matrix, target, test_proportion):
    """Split rows into a training part and a test part, keeping their order.

    Despite its name, this function does not reorder anything: the first
    ``int(n_rows / test_proportion)`` rows become the test set and all
    remaining rows become the training set.

    Parameters:
        matrix: 2-D array of features, indexed as ``matrix[rows, :]``.
        target: 2-D array of targets with the same row order as ``matrix``.
        test_proportion: divisor applied to the row count; e.g. 3 puts the
            first third of the rows into the test set.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    n_test = int(matrix.shape[0] / test_proportion)
    train_rows = slice(n_test, None)
    test_rows = slice(None, n_test)
    return (
        matrix[train_rows, :],
        matrix[test_rows, :],
        target[train_rows, :],
        target[test_rows, :],
    )

# Split features and the matching first 40000 label rows; with a
# test_proportion of 3 the first third of the rows becomes the test set.
X_train, X_test, Y_train, Y_test = shuffle(tf, my_labels[:40000],3)

# Report the resulting test and training matrix shapes.
print(X_test.shape)
print(X_train.shape)

In [None]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

def evaluate_score(Y_test,predict):
    """Print Hamming loss, accuracy and log loss for a set of predictions.

    Parameters:
        Y_test: true label indicators.
        predict: predicted labels; either a dense array or an object that
            offers ``toarray()`` (e.g. a SciPy sparse matrix).

    Hamming loss and accuracy are printed as percentages, log loss as is.
    """
    # Densify sparse predictions explicitly. This replaces a bare `except:`
    # around log_loss, which retried with predict.toarray() after *any*
    # failure and so hid the real error whenever predict was already dense.
    if hasattr(predict, "toarray"):
        predict = predict.toarray()

    loss = hamming_loss(Y_test,predict)
    print("Hamming_loss : {}".format(loss*100))
    accuracy = accuracy_score(Y_test,predict)
    print("Accuracy : {}".format(accuracy*100))
    loss = log_loss(Y_test,predict)
    print("Log_loss : {}".format(loss))

In [None]:
class MySVC:
    """Linear scorer trained with batch gradient descent on squared error.

    The model is ``X @ weights + bias``; ``fit`` minimises the mean squared
    difference between that score and ``y``. Note that ``C`` is stored for
    interface compatibility but is not read by ``fit`` or ``predict``.

    Parameters:
        C: stored on the instance, otherwise unused.
        learning_rate: step size of each gradient update (default 0.01).
        num_iterations: number of full-batch updates run by ``fit``
            (default 1000).
    """

    def __init__(self, C=1.0, learning_rate=0.01, num_iterations=1000):
        self.C = C
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (n_samples, n_features) and ``y`` (n_samples,).

        Training always restarts from zero weights and zero bias.
        """
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0.0

        scale = 2 / num_samples
        for _ in range(self.num_iterations):
            errors = y - self.predict(X)

            # Parenthesised on purpose: `*` and `@` associate left to right,
            # so `-scale * X.T @ errors` would first build a scaled copy of
            # the whole X.T matrix on every iteration.
            weight_gradient = -scale * (X.T @ errors)
            bias_gradient = -scale * np.sum(errors)

            self.weights -= self.learning_rate * weight_gradient
            self.bias -= self.learning_rate * bias_gradient

    def predict(self, X):
        """Return the raw (continuous) linear scores ``X @ weights + bias``."""
        return X @ self.weights + self.bias

    def threshold(self, predictions, threshold=0.5):
        """Turn scores into 0/1 integers: 1 where ``predictions >= threshold``."""
        return (predictions >= threshold).astype(int)

In [None]:
# Label names, in the same order as the label columns selected from new_df;
# used to name the per-label predictions when they are printed.
label_plot = ['toxic', 'obscene', 'insult']

In [None]:
class MyBinaryRelevance:
    """Multi-label wrapper that trains one independent model per label column.

    Parameters:
        classifier: either a callable that returns a new, unfitted model
            (for example a class such as ``MySVC``) or an already constructed
            model instance, which is deep-copied once per label.
        require_dense: accepted and stored so calls that pass this keyword
            keep working; it has no effect on fitting or prediction.
    """

    def __init__(self, classifier, require_dense=None):
        self.classifier = classifier
        self.require_dense = require_dense
        self.models = []

    def _new_model(self):
        """Return a fresh, independent model for one label."""
        if callable(self.classifier):
            return self.classifier()
        # A plain instance cannot be called; copy it so that every label gets
        # its own weights instead of sharing one object.
        return copy.deepcopy(self.classifier)

    def fit(self, X_train, Y_train):
        """Fit one model per column of ``Y_train`` (n_samples, n_labels)."""
        num_labels = Y_train.shape[1]

        # Start from an empty list so that calling fit() again replaces the
        # previous models rather than appending to them.
        fitted = []
        for i in range(num_labels):
            model = self._new_model()
            model.fit(X_train, Y_train[:, i])
            fitted.append(model)
        self.models = fitted

    def predict(self, X_test):
        """Return an (n_samples, n_labels) array; column i holds model i's output."""
        predictions = np.zeros((X_test.shape[0], len(self.models)))

        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X_test)

        return predictions


In [None]:
import joblib
# MyBinaryRelevance creates one model per label by calling `classifier`, so it
# is given the MySVC class itself rather than an instance. The require_dense
# keyword is dropped because MyBinaryRelevance makes no use of it.
classifier = MyBinaryRelevance(classifier=MySVC)
classifier.fit(X_train, Y_train)
# Save the trained model
joblib.dump(classifier, "svm_model.joblib")



In [None]:
# Load the saved model
# NOTE(review): joblib.load unpickles the file, so it should only be pointed
# at files this notebook wrote itself (or that are otherwise trusted).
loaded_classifier = joblib.load("svm_model.joblib")

# Use the loaded model for predictions
predictions_loaded = loaded_classifier.predict(X_test)

In [None]:
# Calculate scores for the loaded model
# predict() returns raw continuous scores, while the classification metrics in
# evaluate_score expect 0/1 label indicators, so binarise first. The 0.5
# cut-off is the same default that MySVC.threshold uses.
predictions_binary = (predictions_loaded >= 0.5).astype(int)
evaluate_score(Y_test, predictions_binary)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfTransformer



# Preprocess the new comment the same way as the training data: lower-case,
# then apply the training translation table `trantab` (punctuation and digits
# become spaces, apostrophes are kept), then drop stop words, lemmatize, stem.
new_comment = "see my ass"
new_comment = new_comment.lower().translate(trantab)
new_comment = " ".join(
    stemmer.stem(lemmatizer.lemmatize(word, pos="v"))
    for word in new_comment.split()
    if word not in stop_words
)

# Use the saved SVM model for prediction; the fitted count_vector maps the
# text onto the training vocabulary.
new_comment_vectorized = count_vector.transform([new_comment]).toarray()
predictions_svm = loaded_classifier.predict(new_comment_vectorized)

# Display the predictions
# The loop variable is called label_name so that it does not overwrite the
# notebook-level `label` array.
for i, label_name in enumerate(label_plot):
    print(f"{label_name}: {predictions_svm[0, i]}")
