In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

import os
# Print the full path of every file mounted under the Kaggle input directory.
mounted_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for mounted_file in mounted_files:
    print(mounted_file)

In [None]:
import zipfile



# Unpack both competition archives into the working directory so the CSVs
# can be read directly.
archive_paths = (
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip",
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip",
)
for archive_path in archive_paths:
    with zipfile.ZipFile(archive_path, "r") as archive:
        archive.extractall(".")
    


## Load training and test data

In [None]:
# Read the extracted training and test CSVs into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)


## Examine the data (EDA)

In [None]:
# Peek at five random training rows. The seed is fixed so the same rows are
# shown on every Restart & Run All.
train_df.sample(5, random_state=42)

In the training data, each comment is labelled with zero or more of six categories: toxic, severe toxic, obscene, threat, insult and identity hate. This is essentially a multi-label classification problem.

In [None]:
# Names of the six binary label columns to predict.
cols_target = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]

In [None]:
# Summary statistics for the numeric columns. The "count" row gives the number
# of non-null values per column, so a count equal to the number of rows means
# that column has no missing values.
train_df.describe()

There are no missing numeric values. 
As the mean values are very small (some well below 0.05), most comments are not labelled positive in any given category. From this I would guess that many comments are not labelled in any of the six categories. Let's take a look.

In [None]:
# A comment counts as "unlabelled" when none of the six target flags equals 1.
has_no_label = (train_df[cols_target] != 1).all(axis=1)
unlabelled_in_all = train_df[has_no_label]
unlabelled_share = len(unlabelled_in_all) / len(train_df) * 100
print('Percentage of unlabelled comments is ', unlabelled_share)

In [None]:
# Count the training rows whose comment text is missing.
train_missing_mask = train_df['comment_text'].isna()
no_comment = train_df[train_missing_mask]
len(no_comment)

In [None]:
# Preview the first rows of the test set.
test_df.head()

In [None]:
# Show the test rows whose comment text is missing (none are expected).
test_missing_mask = test_df['comment_text'].isna()
no_comment = test_df[test_missing_mask]
no_comment

All rows in the training and test data contain comments, so there's no need to clean up null fields.

In [None]:
# Report the size of each split, then the number of positive rows per label.
n_test_rows = len(test_df)
n_train_rows = len(train_df)
print('Total rows in test is {}'.format(n_test_rows))
print('Total rows in train is {}'.format(n_train_rows))

label_totals = train_df[cols_target].sum()
print(label_totals)

As suspected earlier, the majority of the comments in the training data are not labelled in any of these categories.

In [None]:
# Record the character length of every training comment (values are passed
# through str() first, so non-string entries are measured by their text form).
train_df['char_length'] = train_df['comment_text'].map(str).map(len)

In [None]:
# Histogram of comment length in the training data. Title and axis labels are
# set so the figure can be read on its own.
sns.set()
ax = train_df['char_length'].hist()
ax.set(
    title='Comment length in the training data',
    xlabel='Characters per comment',
    ylabel='Number of comments',
)
plt.show()

Most comments are within 500 characters in length, with some up to 5,000 characters long.

Next, let's examine the correlations among the target variables.

In [None]:
# Keep only the six label columns for the correlation analysis.
data = train_df.loc[:, cols_target]

In [None]:
# Pairwise Pearson correlation between the six target labels.
colormap = plt.cm.plasma
plt.figure(figsize=(7,7))
# `data` holds only the label columns, so the title names the targets alone.
plt.title('Correlation between target labels',y=1.05,size=14)
sns.heatmap(data.astype(float).corr(),linewidths=0.1,vmax=1.0,square=True,cmap=colormap,
           linecolor='white',annot=True)
# Render the figure explicitly so the cell does not end on the Axes repr.
plt.show()

Indeed, it looks like some of the labels are highly correlated, e.g. insult-obscene has the highest correlation at 0.74, followed by toxic-obscene and toxic-insult.

What about the character length & distribution of the comment text in the test data?

In [None]:
# Record the character length of every test comment, measured on its str() form.
test_df['char_length'] = test_df['comment_text'].map(str).map(len)

In [None]:
# Histogram of comment length in the test data. Title and axis labels are set
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(test_df['char_length'])
ax.set(
    title='Comment length in the test data',
    xlabel='Characters per comment',
    ylabel='Number of comments',
)
plt.show()

Now, the shape of the character length distribution looks similar between the training data and the test data. For the training data, I guess the comments were clipped to 5,000 characters to facilitate the folks who did the labelling of the comment categories.

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Build a word cloud from a single training comment (index label 3).
sample_comment = train_df.comment_text[3]
cloud_builder = WordCloud(
    width=800,
    height=800,
    background_color='white',
    stopwords=STOPWORDS,
    min_font_size=10,
)
wordcloud = cloud_builder.generate(sample_comment)

# Draw the cloud on a square, axis-free figure with no padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

## Clean up the comment text

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
# Shared WordNet lemmatizer instance, used by the lemmatization cells further down.
lemmatizer = WordNetLemmatizer()


# Patterns are compiled once here rather than on every call to preprocess().
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+')
_DIGITS_RE = re.compile(r'[0-9]+')
_WORD_RE = re.compile(r'\w+')

# Lazily filled cache of NLTK's English stop words (see _english_stopwords).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess(sentence, stop_words=None):
    """Normalise one raw comment into a space-separated string of kept tokens.

    Steps, in order: convert to ``str`` and lowercase, drop the literal
    ``{html}`` marker, strip ``<...>`` tags, strip ``http...`` URLs, strip
    digit runs, split into ``\\w+`` tokens, and keep tokens longer than two
    characters that are not stop words.

    Parameters
    ----------
    sentence : object
        The raw comment. Any value is accepted because it is passed through
        ``str()`` first (so ``None`` becomes the text ``"none"``).
    stop_words : collection of str, optional
        Words to discard. Defaults to NLTK's English stop-word list, which is
        loaded once and cached as a set instead of being re-read for every
        token.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``""`` if nothing remains.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = str(sentence).lower().replace('{html}', "")
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Same tokens as RegexpTokenizer(r'\w+').tokenize(text), without building
    # a tokenizer object on every call.
    tokens = _WORD_RE.findall(text)
    return " ".join(w for w in tokens if len(w) > 2 and w not in stop_words)


In [None]:
# Preview the first raw training comments before any cleaning is applied.
train_df.comment_text.head()

In [None]:
# Replace each training comment with its cleaned form (overwrites the raw text).
train_df['comment_text'] = train_df['comment_text'].map(preprocess)

In [None]:
# Inspect the comment at index label 3 after cleaning with preprocess().
train_df.comment_text[3]

In [None]:
# Save the cleaned training frame (all columns, including char_length) to the
# working directory, without writing the index.
train_df.to_csv("cleaned_trained.csv",index=False)

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, defaulting to noun.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag (J, N, V or R) selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.
    """
    _token, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both map to noun.
    return wordnet.NOUN

In [None]:
from nltk.corpus import wordnet

In [None]:
def _lemmatize_comment(sentence):
    """Lemmatize every token of ``sentence`` with its POS tag and re-join with spaces."""
    tokens = nltk.word_tokenize(sentence)
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
    )


train_df['comment_text'] = train_df['comment_text'].apply(_lemmatize_comment)

In [None]:
# Save the cleaned and lemmatized training frame to the working directory,
# without writing the index.
train_df.to_csv("clean_lemmatized_trained2.csv",index=False)

In [None]:
# Inspect the comment at index label 3 again, now after lemmatization.
train_df.comment_text[3]

In [None]:
# Load lemmatized training data from the separate "jigsaw-toxic-comment-support"
# input dataset.
# NOTE(review): this is not the ./clean_lemmatized_trained2.csv written by this
# notebook. Presumably it is a previously exported copy, uploaded as a dataset
# to skip the slow lemmatization step. Confirm the two files match.
train_dfx = pd.read_csv("../input/jigsaw-toxic-comment-support/clean_lemmatized_trained2.csv")

In [None]:
# Preview the in-memory, freshly processed training frame.
train_df.head()

In [None]:
# Preview the training frame loaded from the support dataset, for comparison
# with train_df.
train_dfx.head()

In [None]:
# Replace each test comment with its cleaned form (overwrites the raw text).
test_df['comment_text'] = test_df['comment_text'].map(preprocess)

In [None]:
# Lemmatize every token of each cleaned test comment using its POS tag.
test_df['comment_text'] = test_df['comment_text'].apply(
    lambda text: ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in nltk.word_tokenize(text)
    )
)

In [None]:
# Save the cleaned and lemmatized test frame to the working directory,
# without writing the index.
test_df.to_csv("clean_lemmatized_test.csv",index=False)

In [None]:
# Load lemmatized test data from the separate "jigsaw-toxic-comment-support"
# input dataset.
# NOTE(review): this is not the ./clean_lemmatized_test.csv written by this
# notebook. Presumably it is a previously exported copy. Confirm the two match.
test_dfx = pd.read_csv("../input/jigsaw-toxic-comment-support/clean_lemmatized_test.csv")

In [None]:
# Model inputs: the comment text column of each loaded split.
X = train_dfx['comment_text']
test_X = test_dfx['comment_text']

In [None]:
# Sanity check: number of comments in the training and test inputs.
print(X.shape, test_X.shape)

## Vectorize the data

In [None]:
# import and instantiate TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer limited to a 1,000-term vocabulary, with scikit-learn's
# built-in English stop-word list removed.
vect = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
vect

In [None]:
# Learn the vocabulary from the training comments and build their
# document-term matrix.
# Missing values (for example, comments that were emptied by cleaning and are
# read back from CSV as NaN) are replaced with '' first. Casting NaN straight
# to text would feed the vectorizer the literal token "nan".
X_dtm = vect.fit_transform(X.fillna('').astype(str))
# examine the document-term matrix created from the training comments
X_dtm

In [None]:
# Number of stored (non-zero) TF-IDF terms in each comment, one entry per row
# of X_dtm. The row count is taken from the matrix itself instead of a
# hard-coded 159571, and is computed in one vectorised call rather than by
# slicing the sparse matrix row by row.
lendata = X_dtm.getnnz(axis=1).tolist()

In [None]:
# Largest number of vocabulary terms found in any single comment.
max(lendata)

In [None]:
# Distribution of the number of vocabulary terms per comment, as a filled
# step histogram with ten equal-width bins.
hist_style = dict(histtype='stepfilled', color='steelblue', edgecolor='none')
plt.hist(lendata, bins=10, **hist_style);

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Roughly doubling bin edges. np.histogram ignores values outside [0, 128],
# so comments with more than 128 terms are not counted here.
bins = [0,2,4,8,16,32,64,128]

hist, bin_edges = np.histogram(lendata,bins) # counts per bin

fig,ax = plt.subplots()

# Plot the bin counts against integer positions so every bin gets a bar of
# equal width, regardless of how wide the bin is.
ax.bar(range(len(hist)),hist,width=1)

# Bars are centred on 0..len(hist)-1. Pin one tick per bar BEFORE labelling:
# without fixed tick positions the labels land on matplotlib's automatic
# ticks and no longer line up with the bars.
ax.set_xticks(range(len(hist)))

# Label each bar with the edges of the bin it represents.
ax.set_xticklabels(['{} - {}'.format(bins[i],bins[i+1]) for i in range(len(hist))])

ax.set_xlabel('Vocabulary terms per comment')
ax.set_ylabel('Number of comments')

plt.show()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on older releases without it.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Dense view of the document-term matrix, one column per vocabulary term.
# NOTE(review): toarray() materialises the whole matrix in memory. If only the
# preview below is needed, slicing X_dtm before densifying would be far cheaper.
df = pd.DataFrame(X_dtm.toarray(),columns=feature_names)
df.head()

In [None]:
# Show ten vocabulary terms together with the index stored for each one.
for term, term_index in list(vect.vocabulary_.items())[:10]:
    print(term, term_index)

In [None]:
# Transform the test comments into a document-term matrix using the
# vocabulary fitted on the training data.
# Missing values are replaced with '' first. Casting NaN straight to text
# would feed the vectorizer the literal token "nan".
test_X_dtm = vect.transform(test_X.fillna('').astype(str))
# examine the document-term matrix created from the test comments
test_X_dtm