In [None]:
# imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
from wordcloud import WordCloud
import numpy as np

In [2]:
# create generic path using so that the code can run in both windows and linux systems
raw_data_path = os.path.abspath(os.path.join(os.getcwd(),os.path.pardir,'data','raw'))
processed_data_path = os.path.abspath(os.path.join(os.getcwd(),os.path.pardir,'data','processed'))
interim_data_path = os.path.abspath(os.path.join(os.getcwd(),os.path.pardir,'data','interim'))
print(raw_data_path)
print(processed_data_path)
print(interim_data_path)

In [3]:
# Loading data into dataframe
df = pd.read_csv(os.path.abspath(os.path.join(raw_data_path,'train.csv')))

In [4]:
# having a look at data
df.head()

**Insights**
1. the variables toxic, severe_toxic, obscene, threat, insult, identity_hate are the target variables.
2. This is a multi variable classification problem
3. Comment_text contains invalid characters like \n, so there is some cleaning is to be done.
4. id is not providing any useful info. so it is to be dropped.

In [5]:
df.drop('id',inplace=True,axis=1)
df.head()

Unnamed: 0,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [6]:
# lets divide df into multiple dataframes with only one target variable
toxic_df = df[['comment_text','toxic']]
severe_toxic_df = df[['comment_text','severe_toxic']]
obscene_df = df[['comment_text','obscene']]
threat_df = df[['comment_text','threat']]
insult_df = df[['comment_text','insult']]
identity_hate_df = df[['comment_text','identity_hate']]

KeyError: "['toxic'] not in index"

## EDA for toxic_df

In [None]:
toxic_df.head()

In [None]:
# classes in toxic_df
def find_imbalance_class(df,target):
    fig = plt.figure()
    plt.title('Class imbalance plot for {}'.format(target))
    value_counts_series = ((toxic_df['toxic'].value_counts())/(len(toxic_df['toxic'])))*100    
    if (value_counts_series[0]-value_counts_series[1]<-20 or value_counts_series[0]-value_counts_series[1]>20):
        balance_msg = 'Imabalance class'
        balance_clr = 'red'
    else:
        balance_msg = 'Balanced class'
        balance_clr = 'green'        
    plt.text(1.6,140000, balance_msg,bbox=dict(facecolor=balance_clr, alpha=0.5))
    plt.xlabel(target)
    plt.ylabel('value_counts({})'.format(target))
    df[target].value_counts().plot(kind='bar') 

In [None]:
find_imbalance_class(toxic_df,'toxic')

**Insights**
1. toxic_df is very imbalanced class

In [None]:
# divide data into x and y
y = toxic_df['toxic']
X = toxic_df['comment_text']

In [None]:
# get toxic and non toxic comments
X_nontoxic = X[y == 0]
X_toxic = X[y == 1]

In [None]:
# Wordcloud for toxic comments
def draw_wordcloud(text):
    wordcloud = WordCloud().generate(text)
    plt.figure(figsize = (10, 8))
    plt.imshow(wordcloud, interpolation = "bilinear")
    plt.axis("off")

In [None]:
toxic_text = ' '.join(X_toxic)
nontoxic_text = ' '.join(X_nontoxic)

In [None]:
draw_wordcloud(toxic_text)

In [None]:
draw_wordcloud(nontoxic_text)

In [None]:
toxic_symbols = (list(re.findall('\W',toxic_text)))
print("toxic comments have {} symbols and {} unique symbols\n % of * is {} \n% of $ is {}".format(len(toxic_symbols),len(set(toxic_symbols)),(toxic_symbols.count('*')/len(toxic_symbols)),(toxic_symbols.count('$')/len(toxic_symbols))))
nontoxic_symbols = (list(re.findall('\W',nontoxic_text)))
print(" non toxic comments have {} symbols and {} unique symbols\n % of * is {} \n% of $ is {}".format(len(nontoxic_symbols),len(set(nontoxic_symbols)),(nontoxic_symbols.count('*')/len(nontoxic_symbols)),(nontoxic_symbols.count('$')/len(nontoxic_symbols))))