In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under /kaggle/input.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Read the three zipped CSV files, in order, into train_DF, test_DF and test_label.
train_DF, test_DF, test_label = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
        '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip',
    )
]

In [None]:
# Show the loaded training DataFrame as this cell's output.
train_DF

In [None]:
# Keep the six label columns, selected by name, in their own frame.
train_DF1 = train_DF[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
# Per-label totals. Summing train_DF1 instead of the positional slice
# train_DF.iloc[:, 2:] keeps this cell re-runnable after a later cell appends
# the text column 'cleaned_comment_text' to train_DF (a positional slice would
# then include that string column).
# NOTE(review): assumes columns 2 onward of the freshly loaded train_DF are
# exactly these six labels, so the displayed totals are unchanged - confirm.
train_DF1.sum()


# **Visualization**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the column totals of the label frame train_DF1, one bar per label.
categories = list(train_DF1.columns.values)
# Sum the name-selected label frame rather than train_DF.iloc[:, 2:], so the
# plot still works when this cell is re-run after train_DF gains text columns.
label_counts = train_DF1.sum().values
sns.set(font_scale = 2)
fig, ax = plt.subplots(figsize=(15,8))

# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally and raises a TypeError for barplot(categories, values).
sns.barplot(x=categories, y=label_counts, ax=ax)
ax.set_title("Comments in each category", fontsize=24)
ax.set_ylabel('Number of comments', fontsize=18)
ax.set_xlabel('Comment Type ', fontsize=18)
plt.show()

# **Data Exploration: Text Cleaning and Preprocessing**

In [None]:
import re
import sys
import warnings

# Install a blanket "ignore" warnings filter, but only when sys.warnoptions
# is empty.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
def remove_html(text):
    """Return ``str(text)`` with every ``<...>`` span deleted.

    The match is non-greedy and ``.`` does not cross newlines, so a ``<`` and
    ``>`` that sit on different lines are left untouched. Non-string input is
    converted with ``str()`` first.
    """
    return re.sub('<.*?>', '', str(text))

def remove_puctuations(text):
    """Strip or blank out a fixed set of punctuation characters in ``text``.

    Steps, in order:
      1. delete ``?``, ``!``, ``'``, ``"``, ``#`` and ``|``;
      2. replace ``.``, ``,``, ``(``, ``)``, ``|`` and ``/`` with a space;
      3. trim leading/trailing whitespace;
      4. turn any remaining newline into a space.

    The ``|`` characters inside the character classes are literal members of
    the class, not alternation, and ``\\|`` is an escaped pipe, so a backslash
    in the input is NOT touched. ``text`` must already be a ``str``.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', text)
    spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced_out.strip().replace("\n", " ")

# Build the cleaned text as ONE chained pipeline. Previously each step read
# from the raw 'comment_text' column again, so every assignment overwrote the
# one before it and only the punctuation step actually took effect.
#
# Order matters:
#   1. remove_html first, while the '<' and '>' delimiters are still present;
#   2. lower-case;
#   3. remove_puctuations, which deletes apostrophes/quotes outright
#      ("don't" -> "dont") before the catch-all below would split them;
#   4. replace every remaining character outside [a-zA-Z#] with a space.
#      regex=True is required: pandas >= 2.0 treats the pattern as a literal
#      string by default.
train_DF['cleaned_comment_text'] = (
    train_DF['comment_text']
    .apply(remove_html)
    .str.lower()
    .apply(remove_puctuations)
    .str.replace("[^a-zA-Z#]", " ", regex=True)
)

In [None]:
# removing Stop words
import nltk
from nltk.corpus import stopwords
# NLTK's English stop words plus a few number words and connectives.
stop_words = set(stopwords.words('english'))
stop_words.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
# Case-insensitive pattern: a stop word at a word boundary, followed by either
# one non-word character or the end of the string. The original required a
# trailing \W, so a stop word that was the last token of a comment was never
# removed. Words are escaped and sorted so the compiled pattern is the same on
# every run (set iteration order is not stable across interpreter sessions).
re_stop_words = re.compile(
    r"\b(" + "|".join(re.escape(word) for word in sorted(stop_words)) + r")(?:\W|$)",
    re.I,
)
def removeStopWords(sentence):
    """Return ``sentence`` with each match of ``re_stop_words`` replaced by a space.

    ``re_stop_words`` is the module-level compiled pattern; it is only read
    here, so no ``global`` declaration is needed.
    """
    return re_stop_words.sub(" ", sentence)
# Remove stop words from the cleaned text that the later tokenising, stemming
# and TF-IDF cells read. The original overwrote the raw 'comment_text' column,
# which nothing downstream uses, so the step had no effect on the features and
# destroyed the raw input (making the cleaning cell non-repeatable).
train_DF['cleaned_comment_text'] = train_DF['cleaned_comment_text'].apply(removeStopWords)

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Split every cleaned comment on whitespace into a list of tokens.
tokenized_text = train_DF['cleaned_comment_text'].map(lambda comment: comment.split())
tokenized_text.head()

In [None]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# Stemming: run every token of every comment through the Snowball stemmer.
tokenized_text = tokenized_text.map(lambda tokens: list(map(stemmer.stem, tokens)))
tokenized_text.head()

In [None]:
# Join each token list back into one space-separated string in a single
# vectorised pass. The previous per-row loop wrote into tokenized_text by
# integer label (slow, and dependent on a default 0..n-1 index) and replaced
# the lists in place, so re-running the cell applied ' '.join to strings and
# spaced out every character. tokenized_text is now left untouched (still
# lists), which makes this cell safe to re-run.
train_DF['cleaned_comment_text'] = tokenized_text.apply(' '.join)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectoriser settings: at most 1000 features, skip terms that appear in more
# than 90% of documents or in fewer than 2, and use the built-in English
# stop-word list.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
# TF-IDF feature matrix, fitted on every row of the cleaned text
tfidf = tfidf_vectorizer.fit_transform(train_DF['cleaned_comment_text'])

In [None]:
# Inputs: the cleaned text; targets: every column except the id and text columns.
x_comment = train_DF['cleaned_comment_text']
y_label = train_DF.drop(labels = ['id','comment_text','cleaned_comment_text'], axis=1)

from sklearn.model_selection import train_test_split
# Split the TEXT first, then fit the TF-IDF vocabulary/IDF weights on the
# training rows only and merely transform the held-out rows. Splitting a matrix
# that was fitted on all rows leaks held-out document statistics into the
# features and inflates the reported score. random_state, test_size and shuffle
# are unchanged, so the same rows land in each split as before.
train_text, test_text, train_y, test_y = train_test_split(
    x_comment, y_label, random_state=42, test_size=0.30, shuffle=True
)
# tfidf_vectorizer is refitted here on purpose, so that afterwards it matches
# the feature space the classifier is trained on.
train_x = tfidf_vectorizer.fit_transform(train_text)
test_x = tfidf_vectorizer.transform(test_text)

# **Model Building**

In [None]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Label Powerset multi-label classifier wrapping a default LogisticRegression
classifier = LabelPowerset(LogisticRegression())

# Fit on the training split
classifier.fit(train_x, train_y)

# Predict label sets for the held-out split
predictions = classifier.predict(test_x)

# Print accuracy_score of the held-out labels against the predictions
print("Accuracy = ", accuracy_score(y_true=test_y, y_pred=predictions))
print("\n")

In [None]:
# Show the object returned by classifier.predict(test_x) in the previous cell.
predictions