In [None]:
# !unzip -u '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/train.csv (2).zip'
# !unzip -u '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/test.csv.zip'

Importing Modules

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import re
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import SnowballStemmer, PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.linear_model import SGDRegressor 
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')


Read train and test

In [None]:
# Load the train and test CSVs of the Jigsaw unintended-bias dataset.
# Both paths are relative to the notebook's working directory.
df_train = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
df_test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

Shape of train and test

In [None]:
# Report the (rows, columns) shape of both frames.
for label, frame in (("train data shape : ", df_train), ("Test data shape : ", df_test)):
  print(label, frame.shape)

**Null value**

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
df_train.isnull().mean().to_frame()

Many columns have a large share of null values, but that is not a concern here: the only columns we really care about are `comment_text` and `target`.

In [None]:
# Kernel density estimate of the raw target score.
# sns.distplot is deprecated; distplot(kde=True, hist=False) corresponds to kdeplot.
fig, ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(x=df_train['target'], ax=ax)
ax.set_title("Distribution of target")
ax.grid()
plt.show()

In [None]:
# Binarise the target: 1 = toxic (target >= 0.5), 0 = non-toxic.
data = (df_train['target'] >= 0.5).astype(int)
total = float(len(data))


fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Pass the series as `x=`: recent seaborn versions read a positional argument as
# `data=`, which no longer yields one bar per class. Draw on `ax` explicitly so
# the annotations below target the same axes.
sns.countplot(x=data, ax=ax)
ax.set_title('Target Countplot')

# Label every bar with its share of all comments, centred like the other count plots.
for p in ax.patches:
  height = p.get_height()
  ax.text(p.get_x() + p.get_width()/2.0, height+4, '{:1.2f}%'.format(100*height/total), ha='center')


Our data is unbalanced. 

Only about 8% of the comments are toxic (`target >= 0.5`); the remaining 92% are non-toxic.

In [None]:
# List the available column names.
df_train.columns

In [None]:
def distplot_feature(features,title, data):
  """Overlay the kernel density estimate of several columns on one figure.

  Parameters
  ----------
  features : iterable of str
      Column names of `data`; each one is drawn as a separate labelled curve.
  title : str
      Title of the figure.
  data : pandas.DataFrame
      Frame that holds the columns listed in `features`.
  """
  fig, ax = plt.subplots(figsize=(10, 8))
  ax.set_title(title)
  for feature in features:
    # sns.distplot is deprecated; distplot(kde=True, hist=False) corresponds to kdeplot.
    sns.kdeplot(x=data[feature], label=feature, ax=ax)
  ax.legend()
  # The curves come from different columns, so no single x label applies.
  ax.set_xlabel(" ")
  plt.show()

In [None]:
# Additional toxicity scores, restricted to non-toxic comments (target < 0.5).
features = ['severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']

distplot_feature(
  features,
  "Distribution of additional toxicity features on non-toxic comment",
  df_train.loc[df_train['target'] < 0.5],
)

In [None]:
# Additional toxicity scores, restricted to toxic comments.
features = [ 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat' ]

# Toxic means target >= 0.5 everywhere else in this notebook; use the same
# cut-off here so comments scored exactly 0.5 are not silently left out.
distplot_feature(features, "Distribution of additional toxicity features on toxic comment" ,df_train[df_train['target']>=0.5])

Among the toxic comments, insult is the most prominent of the additional toxicity features.

In [None]:
def comment_type(data):
  """Return the name of the toxicity sub-type with the highest score.

  `data` is any mapping-like row (e.g. a DataFrame row) indexable by the five
  sub-type names. On ties the first name in the order below wins, because
  np.argmax returns the first maximal position.
  """
  labels = ('severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat')
  scores = [data[label] for label in labels]
  return labels[int(np.argmax(scores))]

In [None]:
# Dominant toxicity sub-type of every toxic comment (target >= 0.5).
data = df_train[df_train['target']>=0.5].apply(comment_type, axis=1)

total = float(len(data))
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Pass the series as `x=`: recent seaborn versions read a positional argument as
# `data=`, which no longer yields one bar per category.
sns.countplot(x=data, ax=ax)
ax.set_title("Percentage of type of toxicity ")
# Label every bar with its share of the toxic comments.
for p in ax.patches:
  height = p.get_height()
  ax.text(p.get_x() + p.get_width()/2, height + 3, '{:1.2f}%'.format(100*height/total), ha='center')

Of the 8% of comments that are toxic, 78% are insults, 10.62% are identity attacks, 6.75% are obscene, 3.73% are threats and 0.16% are severely toxic.

**Gender attributes**

male

female

homosexual_gay_or_lesbian

bisexual

heterosexual

other_gender 

transgender

In [None]:
features = ['male', 'female', 'homosexual_gay_or_lesbian', 'bisexual', 'heterosexual', 'other_gender', 'transgender']
# Toxic comments use the notebook-wide cut-off (target >= 0.5). Only rows that
# lack one of the plotted columns are dropped; dropping on *any* column also
# discarded rows whose only null sat in an unrelated column.
toxic_with_gender = df_train[df_train['target'] >= 0.5].dropna(subset=features)
distplot_feature(features, "Distribution of gender features on toxic comment", toxic_with_gender)

In [None]:
# Toxic comments (target >= 0.5, the notebook-wide cut-off) that reference at
# least one gender / sexual-orientation identity with a score above 0.5.
gender_cols = ['male', 'female', 'homosexual_gay_or_lesbian', 'bisexual', 'heterosexual', 'other_gender', 'transgender']
mentions_gender = (df_train[gender_cols] > 0.5).any(axis=1)
is_toxic = df_train['target'] >= 0.5
data = df_train[mentions_gender & is_toxic].apply(comment_type, axis=1)

total = len(data)
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Pass the series as `x=`: recent seaborn versions read a positional argument as
# `data=`, which no longer yields one bar per category.
sns.countplot(x=data, ax=ax)
ax.set_title("Percentage of type of toxicity in comments gender reference made ")
# Label every bar with its share of the selected comments.
for p in ax.patches:
  height = p.get_height()
  ax.text(p.get_x() + p.get_width()/2, height + 3, '{:1.2f}%'.format(100*height/total), ha='center')

We see from the plot that toxic comments which contain gender or sexual-orientation references are mostly insults and identity attacks.

In [None]:
# List the column names again, as a reference for the identity columns used below.
df_train.columns

In [None]:
# Religion identity columns. 'latino' was listed here before, but it is an
# ethnicity column; 'buddhist' and 'christian' were missing.
# NOTE(review): column names follow the competition's data description - confirm
# against the df_train.columns output above.
features = ['atheist', 'buddhist', 'christian', 'hindu', 'jewish', 'muslim', 'other_religion']
# Toxic comments use the notebook-wide cut-off (target >= 0.5).
distplot_feature(features, "Distribution of religion on toxic comment", df_train[df_train['target'] >= 0.5])

In [None]:
# Toxic comments (target >= 0.5, the notebook-wide cut-off) that reference at
# least one religion with a score above 0.5. 'latino' was listed here before,
# but it is an ethnicity column; 'buddhist' and 'christian' were missing.
# NOTE(review): column names follow the competition's data description - confirm
# against the df_train.columns output above.
religion_cols = ['atheist', 'buddhist', 'christian', 'hindu', 'jewish', 'muslim', 'other_religion']
mentions_religion = (df_train[religion_cols] > 0.5).any(axis=1)
is_toxic = df_train['target'] >= 0.5
data = df_train[mentions_religion & is_toxic].apply(comment_type, axis=1)

total = len(data)
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
# Pass the series as `x=`: recent seaborn versions read a positional argument as
# `data=`, which no longer yields one bar per category.
sns.countplot(x=data, ax=ax)
ax.set_title("Percentage of type of toxicity in comments religion reference made ")
# Label every bar with its share of the selected comments.
for p in ax.patches:
  height = p.get_height()
  ax.text(p.get_x() + p.get_width()/2, height + 3, '{:1.2f}%'.format(100*height/total), ha='center')

We see from the plot that the toxic comments where religion references are made are mostly used for identity attacks and insults.

Features generated by users feedback

*   funny
*   sad
*   wow
*   likes
*   disagree

In [None]:
def count_plot(feature, title, data):
  """Bar chart of the 15 most frequent values of one column, labelled with percentages.

  Parameters
  ----------
  feature : str
      Column of `data` to count.
  title : str
      Title drawn above the chart.
  data : pandas.DataFrame
      Frame that holds `feature`.

  The percentage on each bar is relative to ALL rows of `data`, not only to the
  15 values that are shown.
  """
  data = data[feature]
  fig, ax = plt.subplots(1, 1, figsize=(16, 5))
  total = float(len(data))
  # Pass the series as `x=`: recent seaborn versions read a positional argument
  # as `data=`, which no longer yields one bar per value.
  sns.countplot(x=data, order=data.value_counts().index[:15], ax=ax)
  # The title argument used to be accepted but never drawn.
  ax.set_title(title)
  for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2, height+3, '{:1.2f}%'.format(100*height/total), ha='center')

In [None]:
# 'funny' votes: all comments first, then toxic comments only (target >= 0.5).
count_plot(feature='funny', title="Percentage of funny votes given", data=df_train)
count_plot(
  feature='funny',
  title="Percentage of funny votes given on toxic comment",
  data=df_train.loc[df_train['target'] >= 0.5],
)

In [None]:
# 'sad' votes: all comments first, then toxic comments only (target >= 0.5).
count_plot(feature='sad', title="Percentage of sad votes given", data=df_train)
count_plot(
  feature='sad',
  title="Percentage of sad votes given on toxic comment",
  data=df_train.loc[df_train['target'] >= 0.5],
)

In [None]:
# 'wow' votes: all comments first, then toxic comments only (target >= 0.5).
count_plot(feature='wow', title="Percentage of wow votes given", data=df_train)
count_plot(
  feature='wow',
  title="Percentage of wow votes given on toxic comment",
  data=df_train.loc[df_train['target'] >= 0.5],
)

In [None]:
# 'likes' votes: all comments first, then toxic comments only (target >= 0.5).
count_plot(feature='likes', title="Percentage of likes votes given", data=df_train)
count_plot(
  feature='likes',
  title="Percentage of likes votes given on toxic comment",
  data=df_train.loc[df_train['target'] >= 0.5],
)

In [None]:
# 'disagree' votes: all comments first, then toxic comments only (target >= 0.5).
count_plot(feature='disagree', title="Percentage of disagree votes given", data=df_train)
count_plot(
  feature='disagree',
  title="Percentage of disagree votes given on toxic comment",
  data=df_train.loc[df_train['target'] >= 0.5],
)

**Wordcloud of comment text**

In [None]:
def show_wordcloud(data, title=None):
  """Render a word cloud of the 50 most prominent words in `data`.

  Parameters
  ----------
  data : str or iterable of str (e.g. a pandas Series of comments)
      Text to visualise. A collection is joined into one document; missing
      entries are skipped.
  title : str, optional
      Figure title; omitted when falsy.
  """
  # str(Series) is only the truncated repr (first/last few rows, clipped cells,
  # plus the "Name: ..., Length: ..., dtype: ..." footer), so it must not be fed
  # to WordCloud. Join the actual values instead.
  if isinstance(data, str):
    text = data
  else:
    text = ' '.join(pd.Series(data).dropna().astype(str))

  wordcloud = WordCloud(
      background_color = 'white',
      stopwords = set(STOPWORDS),
      max_words = 50,
      scale = 5,
      random_state = 1  # fixed seed keeps the layout stable between runs
  ).generate(text)

  fig = plt.figure(figsize=(10, 10))
  plt.axis('off')
  if title:
    fig.suptitle(title, fontsize=20)
    fig.subplots_adjust(top=2.3)

  plt.imshow(wordcloud)
  plt.show()

In [None]:
# Random sample of comments; the seed makes the sample (and the cloud) repeatable,
# and the size is capped so a smaller frame does not make .sample() raise.
all_text = df_train['comment_text']
show_wordcloud(all_text.sample(min(20000, len(all_text)), random_state=1), title="Prevalent words in comment data")

In [None]:
# Comments with target > 0.75. Series.sample raises when asked for more rows than
# exist (without replacement), so cap the sample at the subset size; the seed
# makes the sample repeatable.
high_target_text = df_train.loc[df_train['target'] > 0.75, 'comment_text']
show_wordcloud(high_target_text.sample(min(20000, len(high_target_text)), random_state=1), title="Prevalent words in comment data where target>0.75")

In [None]:
# Comments with target < 0.25. The sample is capped at the subset size so
# .sample() cannot raise, and seeded so it is repeatable.
low_target_text = df_train.loc[df_train['target'] < 0.25, 'comment_text']
show_wordcloud(low_target_text.sample(min(20000, len(low_target_text)), random_state=1), title="Prevalent words in comment data where target<0.25")

In [None]:
# Low- vs high-threat comments. Like the other word-cloud cells, each subset is
# limited to at most 20000 randomly drawn comments (seeded) rather than passing
# the whole subset.
for threat_mask, threat_title in (
  (df_train['threat'] < 0.25, "Prevalent words in comment data where threat score<0.25"),
  (df_train['threat'] > 0.75, "Prevalent words in comment data where threat score>0.75"),
):
  threat_text = df_train.loc[threat_mask, 'comment_text']
  show_wordcloud(threat_text.sample(min(20000, len(threat_text)), random_state=1), title=threat_title)

In [None]:
# Low- vs high-insult comments. Like the other word-cloud cells, each subset is
# limited to at most 20000 randomly drawn comments (seeded) rather than passing
# the whole subset.
for insult_mask, insult_title in (
  (df_train['insult'] < 0.25, "Prevalent words in comment data where insult score<0.25"),
  (df_train['insult'] > 0.75, "Prevalent words in comment data where insult score>0.75"),
):
  insult_text = df_train.loc[insult_mask, 'comment_text']
  show_wordcloud(insult_text.sample(min(20000, len(insult_text)), random_state=1), title=insult_title)

**Preprocess text**

In [None]:
# Fetch the NLTK stop-word corpus, then build the shared stop-word set and stemmer.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def preprocess(str):
  """Normalise one comment for vectorisation.

  Lower-cases the text, replaces every run of non-alphanumeric characters with
  a space, drops English stop words and stems the remaining tokens. Returns the
  stemmed tokens joined by single spaces.
  """
  # NOTE: the parameter name shadows the builtin `str`; it is kept unchanged so
  # the function's signature stays the same.
  tokens = re.sub('[^A-Za-z0-9]+', ' ', str.lower()).split()
  return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


In [None]:
%%time
# Clean every training comment; the result is stored in a new column.
df_train['preprocessed_text'] = df_train['comment_text'].map(preprocess)

In [None]:
# Preview the training frame, now including the preprocessed_text column.
df_train.head()

In [None]:
%%time
# Clean every test comment with the same preprocessing as the training data.
df_test['preprocessed_text'] = df_test['comment_text'].map(preprocess)

In [None]:
# Preview the test frame, now including the preprocessed_text column.
df_test.head()

In [None]:
message = df_train['preprocessed_text']
target = df_train['target']

# Hold out 10% of the rows for validation. The fixed seed keeps the split (and
# every error reported below) identical between runs.
train_message, val_message, train_target, val_target = train_test_split(
  message, target, test_size=0.1, random_state=42
)

for split_label, split_part in (
  ("train_message : ", train_message),
  ("train_target : ", train_target),
  ("val_message : ", val_message),
  ("val_target : ", val_target),
):
  print(split_label, split_part.shape)

In [None]:
# Preprocessed test comments, vectorised further below.
test_message = df_test.loc[:, 'preprocessed_text']

print("test_message : ", test_message.shape)

In [None]:
# train_message.to_pickle('train_message.pkl')
# train_target.to_pickle('train_target.pkl')
# val_message.to_pickle('val_message.pkl')
# val_target.to_pickle('val_target.pkl')
# test_message.to_pickle('test_message.pkl')

# !cp '/content/train_message.pkl' '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment'
# !cp '/content/train_target.pkl' '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment'
# !cp '/content/val_message.pkl' '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment'
# !cp '/content/val_target.pkl' '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment'
# !cp '/content/test_message.pkl' '/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment'

In [None]:
# train_message = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/train_message.pkl')
# train_target = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/train_target.pkl')
# val_message = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/val_message.pkl')
# val_target = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/val_target.pkl')
# test_message = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/datasets/Toxic Comment/test_message.pkl')

**Count Vectorizer**

In [None]:
%%time
# Bag-of-words counts over unigrams and bigrams, limited to 30000 features.
# The vocabulary is learned from the training split only and then reused.
cv = CountVectorizer(ngram_range=(1, 2), max_features=30000)
train_message_count = cv.fit_transform(train_message)
val_message_count, test_message_count = (
  cv.transform(part) for part in (val_message, test_message)
)

In [None]:
# Grid search over regularisation strength and penalty for a linear SGD
# regressor on the count features; the model with the lowest validation MSE is kept.
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
penalty = ['l1', 'l2']
xticks = []
train_errors = []
val_errors = []
best_model = None
# Start from +inf so the best model is always recorded, even when every
# configuration yields a large error (an arbitrary cap could leave best_model None).
best_error = float('inf')
for a in alpha:
    for p in penalty:
        xticks.append(str(a) + ' ' + p)
        print(str(a) + ' ' + p + " :")

        # Fixed seed: SGD shuffles the training data, so unseeded runs differ.
        model = SGDRegressor(alpha=a, penalty=p, random_state=42)
        model.fit(train_message_count, train_target)

        err = mean_squared_error(train_target, model.predict(train_message_count))
        train_errors.append(err)
        print("Mean Squared Error on train set: ", err)

        err = mean_squared_error(val_target, model.predict(val_message_count))
        val_errors.append(err)
        print("Mean Squared Error on cv set: ", err)

        # Model selection uses the validation error only.
        if err < best_error:
            best_error = err
            best_model = model

        print("*"*20)

In [None]:
# Train vs validation MSE for every (alpha, penalty) pair, in grid order.
n_settings = len(alpha) * len(penalty)
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(range(n_settings), train_errors)
ax.plot(range(n_settings), val_errors)
fig.suptitle("Mean squared error vs Hyper parameter")
ax.legend(['train', 'val'])
ax.set_xticks(range(n_settings))
ax.set_xticklabels(xticks, rotation=45)
ax.set_xlabel('Hyper parameter(alpha + penalty )')
ax.set_ylabel("Mean squared error ")
plt.show()

In [None]:
# The 20 n-grams with the largest positive weight in the best linear model.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on versions that lack it.
feat_names = cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()
weights = best_model.coef_
df = pd.DataFrame(data=weights, columns=['weights'], index=feat_names)
df.sort_values("weights", ascending=False).iloc[0:20,:]

In [None]:
# Grid search over tree depth and minimum leaf size for a decision-tree
# regressor on the count features; the tree with the lowest validation MSE is kept.
max_depth = [3, 5, 7]
min_samp = [10, 100, 500, 1000]
train_errors = []
val_errors = []

best_model = None
# +inf guarantees that the first fitted tree is accepted as the initial best.
best_error = float('inf')

for d in max_depth:
  for s in min_samp:
    # Fixed seed so ties between equally good splits are resolved identically on every run.
    dt = DecisionTreeRegressor(max_depth = d, min_samples_leaf = s, random_state = 42)
    dt.fit(train_message_count, train_target)

    pred = dt.predict(train_message_count)
    print("max_depth : ", d, "  min_samples : ", s)
    error = mean_squared_error(pred, train_target)
    print("Train mse : ", error)
    train_errors.append(error)

    pred = dt.predict(val_message_count)
    error = mean_squared_error(pred, val_target)
    print("val mse : ", error)
    val_errors.append(error)

    if error < best_error:
      best_model = dt
      best_error = error
    # Separator after every configuration (it used to print only on improvement).
    print('*'*30)

In [None]:
# Train vs validation MSE for every (max_depth, min_samples_leaf) pair, in grid order.
n_settings = len(max_depth) * len(min_samp)
plt.figure(figsize=(14, 8))
plt.plot(range(n_settings), train_errors)
plt.plot(range(n_settings), val_errors)
plt.suptitle("Mean squared error vs Hyper parameter")
# The second curve is the validation error, not a generic "error".
plt.legend(['train', 'val'])
# Label each position with its hyper-parameter pair, in the order the grid loop visits them.
plt.xticks(range(n_settings), [str(d) + ' ' + str(s) for d in max_depth for s in min_samp], rotation=45)
plt.xlabel('Hyper parameter(max_depth + min_samples )')
plt.ylabel("Mean squared error ")
plt.show()

In [None]:
# The 20 n-grams with the highest importance in the best decision tree.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on versions that lack it.
feat_names = cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out") else cv.get_feature_names()
weights = best_model.feature_importances_
df = pd.DataFrame(data=weights, columns=['weights'], index=feat_names)
df.sort_values("weights", ascending=False).iloc[0:20,:]

TfIdf vectorizer

In [None]:
%%time
# TF-IDF weights over unigrams and bigrams, limited to 30000 features.
# The vocabulary and IDF values are learned from the training split only.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=30000)
train_message_tfidf = tfidf.fit_transform(train_message)
val_message_tfidf, test_message_tfidf = (
  tfidf.transform(part) for part in (val_message, test_message)
)

In [None]:
# Grid search over regularisation strength and penalty for a linear SGD
# regressor on the TF-IDF features; the model with the lowest validation MSE is kept.
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
penalty = ['l1', 'l2']
xticks = []
train_errors = []
val_errors = []
best_model = None
# Start from +inf so the best model is always recorded, even when every
# configuration yields a large error (an arbitrary cap could leave best_model None).
best_error = float('inf')
for a in alpha:
    for p in penalty:
        xticks.append(str(a) + ' ' + p)
        print(str(a) + ' ' + p + " :")

        # Fixed seed: SGD shuffles the training data, so unseeded runs differ.
        model = SGDRegressor(alpha=a, penalty=p, random_state=42)
        model.fit(train_message_tfidf, train_target)

        err = mean_squared_error(train_target, model.predict(train_message_tfidf))
        train_errors.append(err)
        print("Mean Squared Error on train set: ", err)

        err = mean_squared_error(val_target, model.predict(val_message_tfidf))
        val_errors.append(err)
        print("Mean Squared Error on cv set: ", err)

        # Model selection uses the validation error only.
        if err < best_error:
            best_error = err
            best_model = model

        print("*"*20)

In [None]:
# Train vs validation MSE for every (alpha, penalty) pair, in grid order.
n_settings = len(alpha) * len(penalty)
plt.figure(figsize=(14, 8))
plt.plot(range(n_settings), train_errors)
plt.plot(range(n_settings), val_errors)
plt.suptitle("Mean squared error vs Hyper parameter")
# The second curve is the validation error, not a generic "error".
plt.legend(['train', 'val'])
# `xticks` holds the "alpha penalty" labels collected by the grid-search loop.
plt.xticks(range(n_settings), xticks, rotation=45)
plt.xlabel('Hyper parameter(alpha + penalty )')
plt.ylabel("Mean squared error ")
plt.show()

In [None]:
# The 20 n-grams with the largest positive weight in the best linear model.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on versions that lack it.
feat_names = tfidf.get_feature_names_out() if hasattr(tfidf, "get_feature_names_out") else tfidf.get_feature_names()
weights = best_model.coef_
df = pd.DataFrame(data=weights, columns=['weights'], index=feat_names)
df.sort_values("weights", ascending=False).iloc[0:20,:]

Decision tree on tfidf

In [None]:
# Grid search over tree depth and minimum leaf size for a decision-tree
# regressor on the TF-IDF features; the tree with the lowest validation MSE is kept.
max_depth = [3, 5, 7]
min_samp = [10, 100, 1000]
train_errors = []
val_errors = []

best_model = None
# +inf guarantees that the first fitted tree is accepted as the initial best.
best_error = float('inf')

for d in max_depth:
  for s in min_samp:
    # Fixed seed so ties between equally good splits are resolved identically on every run.
    dt = DecisionTreeRegressor(max_depth = d, min_samples_leaf = s, random_state = 42)
    dt.fit(train_message_tfidf, train_target)

    pred = dt.predict(train_message_tfidf)
    print("max_depth : ", d, "  min_samples : ", s)
    error = mean_squared_error(pred, train_target)
    print("Train mse : ", error)
    train_errors.append(error)

    pred = dt.predict(val_message_tfidf)
    error = mean_squared_error(pred, val_target)
    print("val mse : ", error)
    val_errors.append(error)

    if error < best_error:
      best_model = dt
      best_error = error
    print('*'*30)

In [None]:
# Train vs validation MSE for every (max_depth, min_samples_leaf) pair, in grid order.
n_settings = len(max_depth) * len(min_samp)
plt.figure(figsize=(14, 8))
plt.plot(range(n_settings), train_errors)
plt.plot(range(n_settings), val_errors)
plt.suptitle("Mean squared error vs Hyper parameter")
# The second curve is the validation error, not a generic "error".
plt.legend(['train', 'val'])
# Label each position with its hyper-parameter pair, in the order the grid loop visits them.
plt.xticks(range(n_settings), [str(d) + ' ' + str(s) for d in max_depth for s in min_samp], rotation=45)
plt.xlabel('Hyper parameter(max_depth + min_samples )')
plt.ylabel("Mean squared error ")
plt.show()

In [None]:
# The 20 n-grams with the highest importance in the best TF-IDF decision tree.
# The tree was fitted on the TF-IDF matrix, so the names must come from the
# `tfidf` vectorizer; this cell used to read them from the count vectorizer `cv`.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on versions that lack it.
feat_names = tfidf.get_feature_names_out() if hasattr(tfidf, "get_feature_names_out") else tfidf.get_feature_names()
weights = best_model.feature_importances_
df = pd.DataFrame(data=weights, columns=['weights'], index=feat_names)
df.sort_values("weights", ascending=False).iloc[0:20,:]