In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily collect the full path of every file below the Kaggle input directory,
# then print the paths one per line in walk order.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd

from datetime import datetime
from collections import Counter
import re, spacy, string
import en_core_web_sm
nlp = en_core_web_sm.load()

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint
import time


# Hide warnings. This is a blanket filter: every warning raised after this
# point, from any library, is suppressed for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
# Remove pandas' display limits so frames are rendered without column or row
# truncation. With max_rows=None, displaying a large frame directly prints
# every row, so inspect data through .head() or similar.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load data

In [None]:
# Directory holding the zipped competition CSVs.
path = '../input/jigsaw-toxic-comment-classification-challenge/'

# Read the training set, the test set and the sample submission, in that order.
df, df_test, df_submission = (
    pd.read_csv(path + archive_name)
    for archive_name in ('train.csv.zip', 'test.csv.zip', 'sample_submission.csv.zip')
)

df.head()

### Checking shape of the data

In [None]:
# (number of rows, number of columns) of the training frame
df.shape

### Checking missing values

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Keep only the label columns: everything except the row id and the comment text.
comments = df.drop(columns=['id', 'comment_text'])
comments.columns

In [None]:
# Share of comments carrying each label, as a percentage rounded to 2 decimals.

for label in comments.columns:
    label_pct = round(100 * comments[label].mean(), 2)
    print("Percent of {0}s: ".format(label), label_pct, "%")

In [None]:
# Total number of flagged comments per label.
com_dict = {label: comments[label].sum() for label in comments.columns}

# Label names ordered from most to least frequent.
com_list = sorted(com_dict, key=com_dict.get, reverse=True)

### Visualization of the distribution of toxic comment types

In [None]:
# Bar chart of how many comments carry each label, most frequent first.
# Labels and bar heights are taken from the same sorted Series so they cannot
# get out of step, and x/y are passed by keyword because seaborn >= 0.12 no
# longer accepts them as positional arguments.
label_counts = comments.sum().sort_values(ascending=False)

plt.figure(figsize=(12,8))
sns.barplot(x=list(label_counts.index), y=label_counts.values)
plt.xlabel('Label')
plt.ylabel('Number of comments')
plt.title('Number of comments per toxicity label')
plt.xticks(rotation=80)
plt.show()

### Text preprocessing

In [None]:
# Function to clean the review text and remove all the unnecessary elements.

def clean_review_text(text):
    """Normalise one raw comment into lowercase, punctuation-free text.

    Steps, in order: lowercase; drop HTML-style ``<...>`` tags; drop bracketed
    ``[...]`` and parenthesised ``(...)`` spans; delete ASCII punctuation;
    replace every remaining non-word character (including newlines) with a
    space; drop every whitespace-delimited token that contains a digit.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
        Runs of spaces left behind by removed spans are not collapsed.
    """
    text = text.lower()  # convert the text to lowercase
    text = re.sub(r'<.*?>', '', text).strip()  # remove html tags
    # Remove each [...] or (...) span individually. The two alternatives must be
    # complete, non-greedy pairs; otherwise a "(" can swallow everything up to
    # the last "]" on the line. "." does not match newlines, so a span has to
    # open and close on the same line to be removed.
    text = re.sub(r'\[.*?\]|\(.*?\)', '', text).strip()
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation marks
    text = re.sub(r'\W', ' ', text).strip()  # replace non-word characters with a space
    text = re.sub(r'\S*\d\S*\s*', '', text).strip()  # remove words containing numbers
    return text

In [None]:
# Cast every comment to str, then run the cleaning function over each one.
df['comment_text'] = df['comment_text'].astype(str).apply(clean_review_text)
df['comment_text'].head()

### Stemming and stop-word removal

In [None]:
# Snowball stemmer
import nltk
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English.
snow_stemmer = SnowballStemmer(language='english')

# Stop-word collection taken from the language defaults of the loaded spaCy pipeline.
stopwords = nlp.Defaults.stop_words
def apply_stemmer(text):
    """Drop stop words from ``text`` and stem the remaining words.

    The text is split on whitespace; words found in the module-level
    ``stopwords`` collection are discarded and every other word is passed
    through ``snow_stemmer.stem``.

    Args:
        text: A whitespace-separated string of words.

    Returns:
        The stemmed words joined by single spaces.
    """
    # Build the lookup set once per call rather than once per word.
    stop_set = set(stopwords)
    stems = [snow_stemmer.stem(word) for word in text.split() if word not in stop_set]
    return ' '.join(stems)

In [None]:
# Replace each cleaned comment with its stop-word-free, stemmed form.
df['comment_text'] = df['comment_text'].apply(apply_stemmer)
df['comment_text'].head()

In [None]:
#Using a word cloud find the top 50 words by frequency among all the review texts
!pip install wordcloud
from wordcloud import WordCloud

wordcloud = WordCloud(stopwords=stopwords,max_words=50).generate(str(df.comment_text))

print(wordcloud)
plt.figure(figsize=(10,6))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Model input: the preprocessed comment text.
X = df['comment_text']
# Targets: every column except the row id and the text itself.
y = df.drop(columns=['id', 'comment_text'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
seed = 100

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Word-level TF-IDF over unigrams, bigrams and trigrams.
word_vectorizer = TfidfVectorizer(
    strip_accents='unicode',     # remove accents during preprocessing
    analyzer='word',
    token_pattern=r'\w{1,}',     # a token is any run of one or more word characters
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True)           # use 1 + log(tf) instead of the raw term frequency

# Fit on the training split only and vectorise it in the same pass.
# fit() followed by transform() would analyse the training corpus twice.
train_word_features = word_vectorizer.fit_transform(X_train)

In [None]:
# Vectorise both splits with the vectorizer fitted on the training data
X_train_transformed, X_test_transformed = (
    word_vectorizer.transform(split) for split in (X_train, X_test)
)


# Report the shape of each feature matrix and label frame
for shape_label, shaped in (
    ('X_train_transformed', X_train_transformed),
    ('y_train', y_train),
    ('X_test_transformed', X_test_transformed),
    ('y_test', y_test),
):
    print(shape_label, shaped.shape)

In [None]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.metrics import roc_auc_score

import time

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance

In [None]:
# Logistic regression wrapped in a one-vs-rest classifier
time1 = time.time()
log_reg = LogisticRegression(C=10, penalty='l2', solver='liblinear', random_state=seed)

# Wrap and fit in one step; fit() returns the fitted classifier itself
classifier_ovr_log = OneVsRestClassifier(log_reg).fit(X_train_transformed, y_train)

time_taken = time.time() - time1
print('Time Taken: {:.2f} seconds'.format(time_taken))

# Predicted probabilities for the training and held-out splits
y_train_pred_proba, y_test_pred_proba = (
    classifier_ovr_log.predict_proba(features)
    for features in (X_train_transformed, X_test_transformed)
)

# ROC AUC per split, computed with average='weighted'
roc_auc_score_train = roc_auc_score(y_train, y_train_pred_proba, average='weighted')
roc_auc_score_test = roc_auc_score(y_test, y_test_pred_proba, average='weighted')

for score_label, score_value in (
    ("ROC AUC Score Train:", roc_auc_score_train),
    ("ROC AUC Score Test:", roc_auc_score_test),
):
    print(score_label, score_value)