In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# List the full path of every file found under the competition input directory.
for folder, _, names in os.walk('/kaggle/input/jigsaw-toxic-comment-classification-challenge/'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
# Load the competition's train and test sets from their zipped CSV files.
train_data = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip")
test_data = pd.read_csv("/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip")

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

In [None]:
# Show the (rows, columns) shape of the train and test frames.
print(train_data.shape)
print(test_data.shape)

In [None]:
# Summary statistics for the training frame.
train_data.describe()

In [None]:
# Column-level overview of the training frame.
train_data.info()

In [None]:
# Count missing values in each training column.
train_data.isna().sum()

# Text cleaning 

1. Removing punctuation 
2. Removing Stop words 
3. Stemming the words 

In [None]:
import re
import string

import nltk
from nltk.stem import PorterStemmer

# English stop-word list taken from NLTK's stopwords corpus.
stopwords = nltk.corpus.stopwords.words("english")

# Porter stemmer instance.
ps = PorterStemmer()

# Tokenize 

In [None]:
def remove_punc(text):
    """Strip punctuation from ``text`` and lower-case what remains.

    Every character found in ``string.punctuation`` is dropped; all other
    characters (letters, digits, whitespace, ...) are kept in order and
    lower-cased one by one.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch.lower())
    return "".join(kept_chars)

# Store a punctuation-free, lower-cased copy of every comment in a new column.
train_data["removed_punch"] = train_data['comment_text'].apply(remove_punc)
train_data.head()

In [None]:
def token(text):
    """Split ``text`` into word tokens.

    The text is split on runs of non-word characters (regex ``\\W+``). Empty
    strings, which ``re.split`` yields when the text starts or ends with a
    separator (or is empty), are discarded so they never show up as tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    # Raw string: "\W" inside a plain literal is an invalid escape sequence.
    pieces = re.split(r"\W+", text)
    return [piece for piece in pieces if piece]
# Tokenize the punctuation-free text and keep each token list in a new column.
train_data["token_word"] = train_data['removed_punch'].apply(token)
train_data.head()

# Text Vectorization 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Word-level TF-IDF vectorizer that tokenizes with `token` and is capped at
# 1000 features.
word_vector = TfidfVectorizer(
    tokenizer=token,
    analyzer='word',
    max_features=1000,
)

In [None]:
# Learn the vocabulary and IDF weights from the training comments only.
train_vectorization = word_vector.fit_transform(train_data['comment_text'])
# Reuse the already-fitted vectorizer for the test comments. Calling
# fit_transform here would refit on the test set and produce columns that do
# not correspond to the features the models are trained on.
test_vectorization = word_vector.transform(test_data['comment_text'])

In [None]:
# Shape of the vectorized training matrix.
train_vectorization.shape

In [None]:
# Shape of the vectorized test matrix.
test_vectorization.shape

In [None]:
# Creating DataFrame
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back on older versions.
if hasattr(word_vector, "get_feature_names_out"):
    feature_names = word_vector.get_feature_names_out()
else:
    feature_names = word_vector.get_feature_names()

# Densify the sparse TF-IDF matrices and label the columns with the terms.
train_vectorization_df = pd.DataFrame(train_vectorization.toarray(), columns=feature_names)
test_vectorization_df = pd.DataFrame(test_vectorization.toarray(), columns=feature_names)

In [None]:
# Targets: one column per label, selected from the training frame.
y_train = train_data[[
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]]
# Features: the TF-IDF frames built from the train and test comments.
X_train, X_test = train_vectorization_df, test_vectorization_df

# Machine Learning model 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Names of the label columns to model, one classifier per entry.
target_label = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

In [None]:
# Zero-filled buffer with one row per test sample and one column per target.
predicted = np.zeros(shape=(len(X_test), len(y_train.columns)))
predicted

In [None]:
# Fit one logistic-regression classifier per label and record its test-set
# probabilities, so that `predicted` no longer stays all zeros.
for i, label in enumerate(target_label):
    lr = LogisticRegression(C=2, random_state=i, class_weight='balanced')
    print('Building {} model for column:{}'.format(i, label))
    lr.fit(X_train, y_train[label])
    # Write into the column whose position matches this label in `y_train`,
    # because `y_train.columns` is what names the submission columns later.
    # Column 1 of predict_proba is the probability of the class at index 1.
    predicted[:, y_train.columns.get_loc(label)] = lr.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import classification_report
# `lr` is the classifier left over from the training loop, i.e. the one fit on
# the last entry of `target_label`. Score it against that same label; comparing
# it with any other label's ground truth gives a meaningless report.
label = target_label[-1]
y_pred = lr.predict(X_train)
print(classification_report(y_train[label], y_pred))

In [None]:
# Print a training-set classification report for every label. Each label gets
# predictions from a classifier fit on that label; reusing one shared `y_pred`
# would compare a single model's output against all six ground truths.
for i in target_label:
    # Same hyper-parameters (and per-label random_state) as the training loop.
    label_model = LogisticRegression(C=2, random_state=target_label.index(i), class_weight='balanced')
    label_model.fit(X_train, y_train[i])
    print(" Lable ",i,classification_report(y_train[i],label_model.predict(X_train)))

In [None]:
# Training-set scores from the model currently bound to `lr`: the second
# column of predict_proba (probability of the class at index 1).
y_predicted_labels = lr.predict_proba(X_train)[:,1]
y_predicted_labels

# ROC 

In [None]:
from sklearn import metrics
# `y_predicted_labels` holds scores from `lr`, the classifier fit on the last
# entry of `target_label`, so the ROC curve must use that label's ground truth.
fpr, tpr, thresholds = metrics.roc_curve(y_train[target_label[-1]], y_predicted_labels)
# Area under the ROC curve.
metrics.auc(fpr, tpr)

# Submission

In [None]:
# Label the prediction matrix with the target column names.
test_predicted = pd.DataFrame(data=predicted, columns=y_train.columns)
# Place the test ids next to the predictions and write the result to disk
# without the index column.
submission = pd.concat(objs=[test_data['id'], test_predicted], axis='columns')
submission.to_csv('submit.csv', index=False)