In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show which files are available under the Kaggle input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

## Import necessary packages

In [None]:
import string
import re
from string import digits
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import seaborn as sns
from matplotlib import pyplot as plt

## Read the data into a DataFrame

In [None]:
# Load the raw train and test tables from the Kaggle input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Preview the first rows of each table.
for caption, frame in (("\nTrain data: \n", train), ("\nTest data: \n", test)):
    print(caption, frame.head())

## Drop the first (id) column of the training data

In [None]:
# Remove the leading column from the training table; the test table is kept
# whole because its first column is reused later when building the submission.
first_column = train.columns[0]
train_data = train.drop(columns=[first_column])
test_data = test

for frame in (train_data, test_data):
    print(frame.head())

In [None]:
# Comment text sits in column 0 of train_data (first column already dropped)
# and in column 1 of test_data; the remaining train columns are the labels.
train_comments = train_data.iloc[:, 0]
test_comments = test_data.iloc[:, 1]
labels = train_data.iloc[:, 1:]

# Remember each original index so the merged series can be split again later.
train_comments_index = train_comments.index
test_comments_index = test_comments.index

# Stack train on top of test so both get identical preprocessing.
frames = [train_comments, test_comments]
comments = pd.concat(frames, ignore_index=True)

for caption, series in (
    ("Train Comments Shape: ", train_comments),
    ("Test Comments Shape: ", test_comments),
    ("Comments Shape after Merge: ", comments),
):
    print(caption, series.shape)
print("Comments are: \n", comments.head())
print("\nLabels are: \n", labels.head())

## Remove Punctuation

In [None]:
# Translation table: space maps to itself, every ASCII punctuation character
# is deleted.
punctuation_table = str.maketrans(' ', ' ', string.punctuation)
c = comments.str.translate(punctuation_table)
c.head()

## Removing '\n' and digits

In [None]:
# Map each newline to a space and delete every digit in a single pass.
# Newlines must become spaces rather than vanish: deleting them would glue the
# last word of one line onto the first word of the next ("end\nstart" ->
# "endstart"). The extra spaces are harmless because the later whitespace
# split collapses them.
newline_and_digit_table = str.maketrans('\n', ' ', digits)
c = c.str.translate(newline_and_digit_table)
c.head()

## Split combined words 
Example - Convert 'WhoAreYou' to 'Who Are You' (a space is inserted wherever a lowercase letter is immediately followed by an uppercase letter).

In [None]:
# A lowercase letter directly followed by an uppercase letter marks a place
# where two words were run together.
case_boundary = re.compile(r'([a-z])([A-Z])')


def _split_on_case_boundary(text):
    """Insert a space at every lowercase-to-uppercase transition in `text`."""
    return case_boundary.sub(r'\1 \2', text)


c = c.apply(_split_on_case_boundary)
c.head()

## Convert to lowercase

In [None]:
# Lowercase every comment. This has to run after the ([a-z])([A-Z]) split,
# because that split relies on the original letter case.
c=c.str.lower()
c.head()

## Split each comment into a list of words (on whitespace)

In [None]:
# No separator is passed, so each comment is split on whitespace and every
# element of the series becomes a list of tokens.
c=c.str.split()
c.head()

## Remove Stop Words

In [None]:
# A set gives constant-time membership checks for the filter below.
stop = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens of one comment that are not English stop words."""
    return [token for token in tokens if token not in stop]


c = c.apply(_drop_stop_words)
c.head()

## Convert Words to Base Form (Lemmatize)

In [None]:
from tqdm import tqdm
lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Lemmatize each token with the default POS, then again as a verb."""
    return [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token), 'v')
        for token in tokens
    ]


# One list of lemmas per comment, in the same order as `c`.
com = [_lemmatize_tokens(tokens) for tokens in tqdm(c)]

## The lemmatized output is a list of token lists. Join each list back into a string and build a DataFrame using the stored index.

In [None]:
# Join each comment's lemmas back into a single string and store them in a
# one-column frame aligned with the merged `comments` index.
# The column is built from a plain Python list: wrapping the ragged token lists
# in np.array() fails on recent NumPy, and `columns` must be an ordered
# container rather than a set.
clean_data = pd.DataFrame(
    {'comment_text': [" ".join(tokens) for tokens in com]},
    index=comments.index,
)
print(clean_data.head())

# `comments` was concatenated train-first with ignore_index=True, so the rows
# labelled by the train index are the training comments and the rest are test.
# NOTE(review): this assumes train.csv was read with the default 0..n-1 index.
train_clean_data = clean_data.loc[train_comments_index]
test_clean_data = clean_data.drop(train_comments_index, axis=0).reset_index(drop=True)
print("PreProcessed Train Data : ", train_clean_data.head(5))
print("PreProcessed Test Data : ", test_clean_data.head(5))

# Re-attach the labels to the cleaned training text ...
frames = [train_clean_data, labels]
train_result = pd.concat(frames, axis=1)

# ... and the first column of the test table (its id) to the cleaned test text.
frames = [test.iloc[:, 0], test_clean_data]
test_result = pd.concat(frames, axis=1)

print(train_result.head())
print(test_result.head())

## Are the labels inter-related?

In [None]:
# train_result holds `comment_text` followed by the label columns, so select
# the labels by name to make sure every one of them appears in the matrix
# (a positional 2:-1 slice would leave out the first and the last label).
temp_df = train_result[labels.columns]
corr = temp_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            annot=True,
            ax=ax)
ax.set_title("Correlation between labels");

## Convert a collection of raw documents to a matrix of TF-IDF features

In [None]:
# Keep at most 50000 terms and ignore terms that occur in fewer than 2 comments.
tf_idf = TfidfVectorizer(min_df=2, max_features=50000)

# Learn the vocabulary and IDF weights from the training text only, then
# project the test text onto that same vocabulary.
train_text = train_result['comment_text']
test_text = test_result['comment_text']
tfidf_train = tf_idf.fit_transform(train_text)
tfidf_test = tf_idf.transform(test_text)

## Neural network implementation : Building the model

In [None]:
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

# Size the input layer from the fitted TF-IDF matrix instead of hard-coding
# 50000: max_features is only an upper bound, so the real vocabulary can be
# smaller and a fixed input shape would then reject the training matrix.
n_features = tfidf_train.shape[1]

model = Sequential()
model.add(Dense(100, activation='relu', input_shape=(n_features,)))
model.add(Dense(100, activation='relu'))
# One independent sigmoid output per label (multi-label classification).
model.add(Dense(6, activation='sigmoid'))
# Binary cross-entropy is the loss that matches independent sigmoid outputs
# trained against 0/1 targets.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


## Fit the training data using the vectorized matrix

In [None]:
# Target matrix: one column per label, in the order the output layer is read
# back when the submission is built.
label_columns = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
train_targets = train_result[label_columns].values
model.fit(tfidf_train, train_targets)


## Predict the probability of each label in the test dataset

In [None]:
# Score the TF-IDF test matrix with the trained network. The final Dense layer
# has 6 units, so y_pred has one column per label.
y_pred = model.predict(tfidf_test)

## Save the output as csv file

In [None]:
# Column order of the network output; it matches the label order used when
# fitting the model.
submission_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Named `submission_columns` rather than `dict` so the builtin is not shadowed
# for the rest of the notebook session.
submission_columns = {'id': test_result.id.values}
for position, label in enumerate(submission_labels):
    submission_columns[label] = y_pred[:, position]

ans = pd.DataFrame(submission_columns)
ans.to_csv('Submit1.csv', index=False)

# Preview the submission (last expression of the cell, so it is displayed).
ans.head()

## Try and classify your comment

In [None]:
def clean_comment(text):
    """Apply the notebook's text preprocessing to a single comment string.

    Steps, in order: delete punctuation, newlines and digits; insert a space at
    every lowercase-to-uppercase boundary; lowercase; split on whitespace; drop
    English stop words; lemmatize each word (default POS, then as a verb).

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned words joined by single spaces ("" if nothing survives).
    """
    stop_words = set(stopwords.words('english'))
    word_lemmatizer = WordNetLemmatizer()

    # One table that deletes punctuation, newlines and digits together.
    removal_table = str.maketrans('', '', string.punctuation + '\n' + digits)
    text = text.translate(removal_table)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    words = [word for word in text.lower().split() if word not in stop_words]
    # Lemmatize whole words. Iterating over the characters of each word would
    # lemmatize single letters and leave every word unchanged.
    lemmas = [
        word_lemmatizer.lemmatize(word_lemmatizer.lemmatize(word), 'v')
        for word in words
    ]
    return " ".join(lemmas)


# Use dedicated names here so the notebook-level `test`, `c`, `com` and
# `y_pred` objects created by earlier cells are not overwritten.
user_comment = input()
user_clean = clean_comment(user_comment)
user_features = tf_idf.transform([user_clean])
user_pred = model.predict(user_features)

pred = pd.DataFrame(
{
    'label':labels.columns,
    'probability':user_pred[0]
})
print(pred)
