In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import libraries**

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.metrics import roc_auc_score , accuracy_score , confusion_matrix , f1_score
from sklearn.multiclass import OneVsRestClassifier
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# **Read dataset**

In [41]:
# Read the Jigsaw train and test CSVs (in that order) from the Kaggle input directory.
traindf, testdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jigsaw-toxic-comment-classification-challenge/train.csv",
        "../input/jigsaw-toxic-comment-classification-challenge/test.csv",
    )
)

In [42]:
# Preview the first two rows of the training frame.
traindf.head(2)

In [43]:
# Show two example rows whose `toxic` column equals 1.
traindf.loc[traindf['toxic'] == 1].head(2)

# **Data Preprocessing**

In [44]:
# Every column after the first two is treated as a target label.
cols_target = list(traindf.columns[2:])

In [45]:
# Display the target column names collected in the previous cell.
cols_target

In [46]:
# Summary statistics for the numeric columns. The `count` row is the number of
# non-null entries per column, so a count below the row total signals missing values.
traindf.describe()

In [47]:
# Number of distinct values in the `toxic` column (a NaN, if present, counts as one value).
traindf['toxic'].nunique(dropna=False)

In [48]:
# Training rows whose comment text is missing (an empty frame means none are).
traindf.loc[traindf['comment_text'].isna()]

In [49]:
# Test rows whose comment text is missing (an empty frame means none are).
testdf.loc[testdf['comment_text'].isna()]

In [50]:
# Report the size of each dataset, then the column-wise sums of the target columns.
n_test_rows, n_train_rows = testdf.shape[0], traindf.shape[0]
print('Total rows in test is {}'.format(n_test_rows))
print('Total rows in train is {}'.format(n_train_rows))
label_totals = traindf[cols_target].sum()
print(label_totals)

In [51]:
# Keep only the target columns; this frame is the input to the correlation heatmap in the next cell.
data = traindf[cols_target]

In [52]:
# Annotated heatmap of the pairwise correlations between the columns of `data`.
fig, ax = plt.subplots(figsize=(7, 7))
ax.set_title('Correlation of features & targets', y=1.05, size=14)
corr_matrix = data.astype(float).corr()
sns.heatmap(corr_matrix, linewidths=0.1, vmax=1.0, square=True, cmap=plt.cm.plasma,
            linecolor='white', annot=True, ax=ax)

# **Function to Clean the data**

In [1]:
def cleanData(text):
    """Normalise a raw comment into lowercase, single-space-separated word tokens.

    The text is lowercased, a fixed list of English contractions is expanded,
    every non-word character is replaced by a space, runs of whitespace are
    collapsed, and surrounding spaces are stripped.

    Parameters
    ----------
    text : str
        The raw comment.

    Returns
    -------
    str
        The cleaned comment.
    """
    # (pattern, replacement) pairs applied in order. Order matters: the specific
    # rules ("what's", "'scuse", "can't") must run before the generic ones
    # ("'s", "n't") that would otherwise consume part of their match.
    contractions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )

    text = text.lower()
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Raw strings keep \W and \s as regex escapes rather than (invalid) string escapes.
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(" ")

In [54]:
# Run the comment text of both datasets through the same cleaning function.
for frame in (traindf, testdf):
    frame['comment_text'] = frame['comment_text'].apply(cleanData)

# **Removing stop words**

In [55]:
# First five cleaned comments, shown before stop words are removed.
traindf['comment_text'].head(5)

In [56]:
# Build the English stop-word collection once, as a set, for the membership tests in the next cell.
# NOTE(review): stopwords.words() reads the NLTK "stopwords" corpus — confirm it is available outside Kaggle.
eng_stopwords = set(stopwords.words("english"))

In [57]:
def _strip_stopwords(comment):
    """Return `comment` with every whitespace-delimited token found in `eng_stopwords` removed."""
    return ' '.join(token for token in comment.split() if token not in eng_stopwords)


traindf['comment_text'] = traindf['comment_text'].apply(_strip_stopwords)
testdf['comment_text'] = testdf['comment_text'].apply(_strip_stopwords)

In [58]:
# The same five comments after stop-word removal, for comparison with the preview above.
traindf['comment_text'].head(5)

# **Applying Lemmatization or Stemming**

In [59]:
# Shared NLTK instances used by stem_word() and lemmatize_word() below.
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [60]:
# stemming and lemmatizing
# adapted from the kernel

def stem_word(text):
    """Return `text` with each whitespace-separated token passed through `stemmer.stem`.

    Parameters
    ----------
    text : str
        Space-separated tokens.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Return the stemmed string itself, not the untouched input.
    return " ".join(stemmer.stem(w) for w in text.split())

def lemmatize_word(text):
    """Return `text` with each whitespace-separated token passed through `wordnet_lemmatizer.lemmatize`.

    Parameters
    ----------
    text : str
        Space-separated tokens.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # The result must be returned explicitly; otherwise every mapped comment becomes None.
    return " ".join(wordnet_lemmatizer.lemmatize(w) for w in text.split())

In [61]:
# Normalisation switch read by the next cell: 1 selects stemming (stem_word);
# any other value, such as this -1, selects lemmatization (lemmatize_word).
SL = -1

In [62]:
# Choose the normaliser once from the SL switch, then run it over both datasets.
normalise = stem_word if SL == 1 else lemmatize_word
traindf['comment_text'] = traindf['comment_text'].map(normalise)
testdf['comment_text'] = testdf['comment_text'].map(normalise)

In [63]:
# Inspect the comment column after the stemming / lemmatization step above.
traindf['comment_text']

In [64]:
# Features are the comment text; targets are every column except the id and the text.
X = traindf['comment_text']
Y = traindf.drop(columns=['id', 'comment_text'])

In [65]:
# Sanity check: X and Y should report the same number of rows.
print(X.shape, Y.shape)

In [66]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [67]:
# Peek at two comments from the training split.
X_train.head(2)

In [68]:
# Peek at the first two rows of training targets (compare the index with X_train.head(2) above).
y_train.head(2)

In [71]:
# Print the shape of each split: train features, train targets, test features, test targets.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

# **Term Frequency Inverse Document Frequency Vectorizer**

In [73]:
# Word-level TF-IDF over unigrams, bigrams and trigrams with sublinear term-frequency scaling.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    sublinear_tf=True)

# Fit on the training split only, so nothing from the held-out rows leaks into the
# vocabulary or IDF weights. fit_transform learns the vocabulary and vectorises the
# training text in one pass instead of tokenising it twice (fit, then transform).
train_word_features = word_vectorizer.fit_transform(X_train)
# Despite the name, these are the features of the held-out split of the training data (X_test).
test_features = word_vectorizer.transform(X_test)

In [74]:
# Print the TF-IDF matrix built from X_test.
# NOTE(review): presumably a SciPy sparse matrix, which prints as (row, column) -> value entries — confirm.
print(test_features)

In [80]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# **Model Validation on train data set**

In [81]:
# cross_validate evaluates several metrics from a single set of cross-validation fits.
from sklearn.model_selection import cross_validate

losses = []
auc = []

for class_name in cols_target:
    # Treat each label column as its own binary classification problem.
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    # The 'sag' solver is stochastic; pinning the seed makes reruns comparable.
    classifier = LogisticRegression(solver='sag', C=10, random_state=1)

    # One 5-fold pass yields both metrics, instead of refitting the folds once per metric.
    cv_results = cross_validate(classifier, train_word_features, train_target, cv=5,
                                scoring=('neg_log_loss', 'accuracy'))

    # 'neg_log_loss' is the negated log loss, so the reported value is <= 0 (closer to 0 is better).
    cv_loss = np.mean(cv_results['test_neg_log_loss'])
    losses.append(cv_loss)
    print('CV Log_loss score for class {} is {}'.format(class_name, cv_loss))

    cv_score = np.mean(cv_results['test_accuracy'])
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training split and score the held-out split.
    classifier.fit(train_word_features, train_target)
    y_pred = classifier.predict(test_features)
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    # This AUC comes from the hold-out split, not from cross-validation, and is labelled as such.
    print("Hold-out ROC_AUC score for class {} is {}\n".format(class_name, auc_score))

    print(confusion_matrix(test_target, y_pred))
    print(classification_report(test_target, y_pred))

print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
print('Total average hold-out ROC_AUC score is {}'.format(np.mean(auc)))

# **Test Prediction**

In [86]:
def make_test_predictions(df, classifier, threshold=1.0):
    """Label the first comment in `df` as toxic or non-toxic.

    The comment text is cleaned with `cleanData`, vectorised with the fitted
    `word_vectorizer`, and scored with `classifier.predict_proba`. The
    probabilities in the first output row are summed and compared with
    `threshold`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `comment_text` column. It is not modified. Only the
        first row contributes to the result.
    classifier
        Fitted estimator exposing `predict_proba`.
    threshold : float, default 1.0
        Minimum summed probability for the comment to be reported as toxic.

    Returns
    -------
    str
        "Toxic Comment" or "NonToxic Comment".
    """
    # Clean into a new Series so the caller's DataFrame is left untouched.
    cleaned_text = df['comment_text'].apply(cleanData)
    # NOTE(review): the training text also had stop words removed and was stemmed or
    # lemmatized before vectorisation; only cleanData is applied here — confirm this
    # mismatch is intended.
    features = word_vectorizer.transform(cleaned_text)
    # Only the first output row is scored.
    first_row_probs = classifier.predict_proba(features)[0]
    if sum(first_row_probs) >= threshold:
        return "Toxic Comment"
    return "NonToxic Comment"

In [95]:
# Wrap an L2-regularised logistic regression in a one-vs-rest classifier and fit it
# on the TF-IDF training features against all target columns at once.
log_reg = LogisticRegression(
    C=10,
    penalty='l2',
    solver='liblinear',
    random_state=45,
)
classifier = OneVsRestClassifier(estimator=log_reg)
classifier.fit(train_word_features, y_train)

In [96]:
# Score one hand-written comment with the fitted classifier and print the verdict.
comment_text = "I don't think so"
comment = pd.DataFrame({'id': [565], 'comment_text': [comment_text]})
result = make_test_predictions(comment, classifier)
print(result)