In [None]:
pip install pyicu

In [None]:
pip install pycld2

# Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

import nltk
from nltk.corpus import wordnet, stopwords
from nltk import *
from wordcloud import WordCloud, STOPWORDS
import re

import sys
from termcolor import colored
from polyglot.detect import Detector
from polyglot.utils import pretty_list

# Reading the training data file

In [None]:
# All four CSVs are read from the same Kaggle input directory; keeping the
# directory in one constant makes it easy to repoint when running elsewhere.
DATA_DIR = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"

train1 = pd.read_csv(f"{DATA_DIR}/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv(f"{DATA_DIR}/jigsaw-unintended-bias-train.csv")
valid = pd.read_csv(f"{DATA_DIR}/validation.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Preview the first rows of the first training file.
train1.head()

In [None]:
# Preview the first rows of the second training file.
train2.head()

In [None]:
# Round the train2 labels to the nearest integer so they can be filtered as
# 0/1 classes alongside train1.
train2["toxic"] = train2["toxic"].round().astype(int)

# Combine: every row of train1, every toxic row of train2, and a fixed-size
# random sample of non-toxic train2 rows. random_state pins the sample so a
# fresh "Restart & Run All" reproduces the same training frame.
train2_labeled = train2[['comment_text', 'toxic']]
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2_labeled.query('toxic==1'),
    train2_labeled.query('toxic==0').sample(n=100000, random_state=42),
])
#rate=10
#train = train[::rate]
train.head()

# Descriptive Analysis

In [None]:
# Column dtypes, non-null counts and memory usage of the combined frame.
train.info()

In [None]:
# Summary statistics of the combined frame.
train.describe()

In [None]:
# Print the (rows, columns) shape of the combined frame, highlighted in bold red.
print(f"Train data shape: {colored(train.shape, 'red', attrs=['bold'])}")

# Checking for Null Values

In [None]:
# Number of missing values per column.
train.isnull().sum()

# Exploratory Data Analysis
### Word Cloud Creation

In [None]:
def slashn(x):
    """Return ``x`` with every newline replaced by a single space.

    Newlines become spaces (rather than being deleted) so that the last word
    of one line and the first word of the next remain separate tokens.

    Parameters
    ----------
    x : object
        The value to clean; only strings are processed.

    Returns
    -------
    str
        The cleaned text, or an empty string when ``x`` is not a string.
    """
    if isinstance(x, str):
        return x.replace("\n", " ")
    return ""

In [None]:
# Fetch the NLTK resources used below: the tokenizer models, the stop-word
# lists and the WordNet data used for lemmatization.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

stop_words = stopwords.words('english')

In [None]:
def word_count_review(docs):
    """Build a frequency distribution of lemmatized words over ``docs``.

    The documents are cleaned with ``slashn``, joined with spaces, lower-cased,
    stripped of everything except ``a-z`` and ``0-9``, tokenized, filtered
    against the module-level ``stop_words`` and lemmatized.

    Parameters
    ----------
    docs : iterable
        The documents to count; each item is passed through ``slashn``.

    Returns
    -------
    nltk.FreqDist
        Mapping of lemma -> number of occurrences.
    """
    text = ' '.join(slashn(abstract) for abstract in docs).lower()
    txt = re.sub(r'[^a-z0-9]+', ' ', text).strip()
    tokens = word_tokenize(txt)
    # A set gives constant-time membership tests; the stop-word list would be
    # scanned once per token otherwise.
    stop_set = set(stop_words)
    lemma = WordNetLemmatizer()
    return FreqDist(lemma.lemmatize(token) for token in tokens if token not in stop_set)

In [None]:
# Word frequencies over every training comment; displayed as the cell output.
fd = word_count_review(train.comment_text)
fd

In [None]:
# Render the word frequencies as a word cloud on a 10x10 inch, 150 dpi figure.
plt.figure(figsize=(10,10),dpi=150)
wc = WordCloud(scale=10).generate_from_frequencies(fd)

# Show the cloud as an image and hide the axes.
plt.imshow(wc)
plt.axis('off')

## Getting acquainted with POLYGLOT library
#### Languages Supported

In [None]:
# Print the languages the polyglot Detector reports as supported.
print(pretty_list(Detector.supported_languages()))

In [None]:
def get_language(text):
    """Return the name of the first language polyglot reports for ``text``.

    Non-printable characters are dropped before detection, and the detector
    is created with ``quiet=True``.
    """
    printable_text = "".join(ch for ch in text if ch.isprintable())
    detector = Detector(printable_text, quiet=True)
    return detector.languages[0].name
# Run the detector on one short greeting per language. The variable initials
# appear to name the intended language (Hungarian, Irish, English, Punjabi,
# Tatar, Korean, Malayalam) — compare with the detected names printed below.
h = get_language("Helló, hogy vagy")
i = get_language("Dia duit, conas atá tú")
e = get_language("hello, how are you")
p = get_language("ਹੈਲੋ ਤੁਸੀ ਕਿਵੇਂ ਹੋ")
t = get_language("сәлам, хәлләрең ничек")
k = get_language("안녕하세요. 어떻게 지내세요")
m = get_language("ഹലോ, നിങ്ങൾക്ക് സുഖമാണോ")

In [None]:
# Each entry: (sample text, detected language name, highlight colour).
# The sample texts match the strings that were passed to get_language, so the
# printed label always shows exactly what was detected.
sample_detections = [
    ("Helló, hogy vagy", h, 'blue'),
    ("Dia duit, conas atá tú", i, 'red'),
    ("hello, how are you", e, 'yellow'),
    ("ਹੈਲੋ ਤੁਸੀ ਕਿਵੇਂ ਹੋ", p, 'cyan'),
    ("сәлам, хәлләрең ничек", t, 'white'),
    ("안녕하세요. 어떻게 지내세요", k, 'magenta'),
    ("ഹലോ, നിങ്ങൾക്ക് സുഖമാണോ", m, 'green'),
]
for sample_text, detected_name, color in sample_detections:
    print(f"{sample_text}: {colored(detected_name, color, attrs=['bold','underline'])}")

#### Assigning a column of languages corresponding to the comment_text column

In [None]:
# Detect the language of every comment. progress_apply (registered by
# tqdm.pandas() in the imports cell) shows a progress bar for this slow
# per-row step, matching the other per-row passes in this notebook.
train['language'] = train["comment_text"].progress_apply(get_language)

In [None]:
# Preview the frame with the new language column.
train.head()

#### Languages in the training data set and their counts

In [None]:
# Distinct detected language names.
train.language.unique()

In [None]:
# Number of distinct detected languages.
print("Number of Unique Languages:",train.language.nunique())

### Comparison between English and Non English Languages

In [None]:
import plotly.express as px

# Count comments per detected language in a single pass, ordered by language
# name. lang_list and counts stay available as plain lists for later cells.
language_counts = train["language"].value_counts().sort_index()
lang_list = language_counts.index.tolist()
counts = language_counts.tolist()
df = pd.DataFrame({"Language": lang_list, "Count": counts})

# Use the actual English count instead of assuming that the most frequent
# language is English.
english_count = int(language_counts.get("English", 0))
df_en = pd.DataFrame({
    "Language": ["English", "Non-English"],
    "Count": [english_count, int(language_counts.sum()) - english_count],
})
df_en.head()

In [None]:
# Make sure the Count column has an integer dtype before plotting.
df_en.Count = df_en.Count.astype(int)

In [None]:
# Bar chart of English versus non-English comment counts, labels unrotated.
df_en.plot.bar(x="Language", y="Count", rot=0)

### Comparison of Non English languages with comments appearing between 20 and 30 times

In [None]:
# Languages other than English and 'un' whose comment count lies in the
# inclusive range 20..30.
dfq = df.query(
    "Language != 'English' and Language != 'un' and Count >= 20 and Count <= 30"
)

# Horizontal bar chart, one bar per language, labelled with its count.
fig1 = px.bar(
    dfq,
    x="Count",
    y="Language",
    orientation="h",
    text="Count",
    title="Language of non-English comments",
    pattern_shape="Language",
    pattern_shape_sequence=["|", "/", "+"],
    height=500,
)
fig1.update_traces(
    texttemplate='%{text:.2s}',
    textposition="outside",
    marker_color='teal',
)
fig1.update_layout(showlegend=False)
fig1

### Comparison of Non English languages with comments appearing more than 50 times

In [None]:
# Languages other than English and 'un' with at least 50 comments.
dfq1 = df.query("Language != 'English' and Language != 'un' and Count >= 50")

# Bubble chart on a logarithmic count axis; marker size follows the count.
fig1 = px.scatter(
    dfq1,
    x="Count",
    y="Language",
    size="Count",
    color="Language",
    title="Count of non-English Language",
    log_x=True,
    size_max=60,
)
fig1.update_traces(mode="markers")
fig1.update_layout(showlegend=True)
fig1

### Comment Word Distribution

In [None]:
import plotly.figure_factory as ff

def new_len(x):
    """Count the whitespace-separated words in ``x``; non-strings count as 0."""
    return len(x.split()) if type(x) is str else 0

# Words per comment, then keep comments with between 1 and 199 words.
train["comment_words"] = train["comment_text"].apply(new_len)
word_count_mask = (train["comment_words"] != 0) & (train["comment_words"] < 200)
nums = train.loc[word_count_mask, "comment_words"]

# Distribution plot of the retained word counts.
fig = ff.create_distplot(
    hist_data=[nums],
    group_labels=["All comments"],
    colors=["indigo"],
)
fig.update_layout(
    title_text="Word distribution per Comment",
    xaxis_title="Comment words",
    showlegend=False,
)
fig.show()

### Average Comment Words vs Language
Languages where the average number of words in comments is less than 200

In [None]:
import plotly.graph_objects as go

# Mean words per comment for each detected language. The column is selected
# before aggregating so the text columns are never passed to mean(), and the
# language names come straight from the group keys.
dfaa = (
    train.groupby("language")["comment_words"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="avg_comment_words")
)
dfaa = dfaa.query("avg_comment_words < 200")
fig = go.Figure()
fig.add_trace(go.Bar(y=dfaa["avg_comment_words"], x=dfaa["Language"]))
# Languages are plotted on the x axis and the averages on the y axis.
fig.update_layout(xaxis_title="Language", yaxis_title="Average Count of words", title_text="Language Versus Average Number of Words in comments")
fig.show()

Languages where the average number of words in comments is more than 200

In [None]:
import plotly.graph_objects as go

# Mean words per comment for each detected language. The column is selected
# before aggregating so the text columns are never passed to mean(), and the
# language names come straight from the group keys.
dfab = (
    train.groupby("language")["comment_words"]
    .mean()
    .rename_axis("Language")
    .reset_index(name="avg_comment_words")
)
dfab = dfab.query("avg_comment_words > 200")
fig = go.Figure()
fig.add_trace(go.Bar(y=dfab["avg_comment_words"], x=dfab["Language"]))
# Languages are plotted on the x axis and the averages on the y axis.
fig.update_layout(xaxis_title="Language", yaxis_title="Average Count of words", title_text="Language Versus Average Number of Words in comments")
fig.show()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def polarity_score(x):
    """Return the sentiment scores for ``x`` as a dict.

    Strings are scored with the module-level ``SIA`` analyzer. Any other
    value yields an all-zero score dict, so every result can be indexed with
    ``"neg"`` / ``"neu"`` / ``"pos"`` / ``"compound"`` by the cells that plot
    these scores.

    Parameters
    ----------
    x : object
        The comment text; only strings are scored.

    Returns
    -------
    dict
        Score dict with the keys ``neg``, ``neu``, ``pos`` and ``compound``.
    """
    if isinstance(x, str):
        return SIA.polarity_scores(x)
    # Zero scores are dropped by the "!= 0" filters of the histograms.
    return {"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}
    
# VADER loads its lexicon from NLTK data; fetch it explicitly like the other
# NLTK resources so the analyzer can be built on a fresh environment.
nltk.download('vader_lexicon')

SIA = SentimentIntensityAnalyzer()
train["polarity"] = train["comment_text"].progress_apply(polarity_score)

In [None]:
# Histogram of the non-zero "neg" scores.
negative_scores = [scores["neg"] for scores in train["polarity"] if scores["neg"] != 0]
fig = go.Figure(
    go.Histogram(x=negative_scores, marker=dict(color='teal'))
)
fig.update_layout(title_text="Negative sentiment", template="simple_white")
fig.show()

In [None]:
# Histogram of the non-zero "pos" scores.
positive_scores = [scores["pos"] for scores in train["polarity"] if scores["pos"] != 0]
fig = go.Figure(
    go.Histogram(x=positive_scores, marker=dict(color='darkblue'))
)
fig.update_layout(title_text="Positive sentiment", template="simple_white")
fig.show()

### Toxicity in Comparison with Negativity and Positivity

In [None]:
#train["negativity"] = train["polarity"].apply(lambda x: x["neg"])
#one = train.query("toxic == 1")["negativity"]
#zero = train.query("toxic == 0")["negativity"]

#fig = ff.create_distplot(hist_data=[one, zero],group_labels=["Toxic", "Non-toxic"],colors=["slategrey", "dodgerblue"], show_hist=False)

#fig.update_layout(title_text="Negativity vs. Toxicity", xaxis_title="Negativity", template="simple_white")
#fig.show()

In [None]:
#train["positivity"] = train["polarity"].apply(lambda x: x["pos"])
#nums_1 = train.sample(frac=0.1).query("toxic == 1")["positivity"]
#nums_2 = train.sample(frac=0.1).query("toxic == 0")["positivity"]

#fig = ff.create_distplot(hist_data=[nums_1, nums_2],group_labels=["Toxic", "Non-toxic"], colors=["dodgerblue", "purple"], show_hist=False)

#fig.update_layout(title_text="Positivity vs. Toxicity", xaxis_title="Positivity", template="simple_white")
#fig.show()

### Readability

In [None]:
pip install textstat

In [None]:
import textstat
# Flesch reading-ease score per comment, with a progress bar.
train["flesch_reading_ease"] = train["comment_text"].progress_apply(
    textstat.flesch_reading_ease
)

# Histogram restricted to comments with a strictly positive score.
positive_ease = train.loc[train["flesch_reading_ease"] > 0, "flesch_reading_ease"]
fig = go.Figure(
    go.Histogram(x=positive_ease, marker=dict(color='dodgerblue'))
)
fig.update_layout(title_text="Flesch reading ease", template="simple_white")
fig.show()