![](https://i.imgur.com/Va5BVFq.png)

# About the competition:
This is Jigsaw's fourth Kaggle competition. The goal is to rank comments by the severity of their toxicity: you must assign each comment a score reflecting its relative toxicity, and more toxic comments should receive higher scores than less toxic ones.

<p align="center">
<img width = "300" src="https://i.imgur.com/fRWxwmw.jpg">
</p>

Ironically, no training data is provided for this competition, so most participants train on data from previous Jigsaw competitions. Those competitions, however, asked for the probability that a comment is toxic rather than the degree of its severity. Be warned: the dataset provided for this competition contains profane, vulgar, and offensive text.



In [None]:
import os
import warnings
warnings.filterwarnings("ignore")                     #Ignoring unnecessory warnings

import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense


In [None]:
# Kaggle input paths used throughout the notebook.
# NOTE: despite its name, `train_file` points at the competition's
# validation_data.csv, not at a training set.
train_file = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
# Comments the competition asks to be scored (per the file name).
test_file = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
# Pre-cleaned comments; read below for its `comment_text` and `toxicity` columns.
cleaned_text = "../input/cleaned-toxic-comments/train_preprocessed.csv"
# Sample submission shipped with the competition data.
submission = "../input/jigsaw-toxic-severity-rating/sample_submission.csv"
# train_df = pd.read_csv(train_file)
# train_df.head()

In [None]:
# Load the pre-cleaned comment dataset; the bare name on the last line makes
# the notebook render the frame.
cleaned_text_df = pd.read_csv(cleaned_text)
cleaned_text_df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Reuse the frame that was already loaded instead of parsing the CSV again.
# Missing comments become "" (rather than float NaN) so the Keras text
# helpers, which call string methods on every entry, do not crash; no rows
# are dropped, so positions stay aligned with the `toxicity` column.
combined_comments = (
    cleaned_text_df.comment_text
    .fillna("")
    .astype(str)
    .tolist()
)

In [None]:
# Preview the comments at positions 500-509.  Iterating a slice (instead of
# indexing with range) also avoids an IndexError when fewer than 510 comments
# are loaded.
for comment in combined_comments[500:510]:
    print(comment)
    print('--------------------------------------------------------------------------------')

In [None]:
# Number of rows in the `toxicity` column (displayed by the notebook).
len(cleaned_text_df.toxicity)

In [None]:
# Hash the words of every comment into integer indices bounded by `voc_size`.
voc_size = 5000
onehot_repr = [
    one_hot(comment, voc_size)
    for comment in combined_comments
]
type(onehot_repr)

In [None]:
# Bring every encoded comment to a fixed length of `sent_length` by padding
# with zeros at the front.
sent_length = 400
embedded_docs = pad_sequences(
    onehot_repr,
    maxlen=sent_length,
    padding='pre',
)
print(embedded_docs)

In [None]:
## Creating model
# Embedding -> LSTM regressor: one linear output unit trained with MSE.
embedding_vector_features = 40
model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(1),  # single unit, no activation: regression output
])
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

In [None]:
# df_y = cleaned_text_df['toxicity']

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every distinct `toxicity` value by an integer code; the resulting
# array is the regression target used for training.
toxicity_column = pd.read_csv(cleaned_text)['toxicity']
encode = LabelEncoder()
df_y2 = encode.fit_transform(toxicity_column)
type(df_y2)

In [None]:
# Shape of the encoded target array (displayed by the notebook).
df_y2.shape

In [None]:
# Shape of the padded input matrix; its first dimension can be compared with
# the target shape shown above.
embedded_docs.shape

In [None]:
import numpy as np

# Materialise features and targets as NumPy arrays for splitting and training.
X_final, y_final = np.array(embedded_docs), np.array(df_y2)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the LSTM on the training split for 5 epochs (batch size 64); the
# held-out split is passed as validation data so its loss is reported per epoch.
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=5,batch_size=64)

In [None]:
# score = model.evaluate(X_test,Y_test)

In [None]:
# y_pred_model = model.predict(X_test)

In [None]:
# Error metrics of the LSTM on the held-out split.
#
# predict() returns an array of shape (n, 1) because the model ends in
# Dense(1), whereas Y_test has shape (n,).  Subtracting the two directly
# broadcasts to an (n, n) matrix, so the averages were taken over every
# target/prediction combination instead of over matching pairs.  Flatten the
# predictions first, and run inference only once.
y_pred_model = model.predict(X_test).ravel()
diff = Y_test - y_pred_model
mae = np.mean(np.abs(diff))
mse = np.mean(diff ** 2)
rmse = np.sqrt(mse)
print(mae)
print(mse)
print(rmse)

In [None]:
# Import the convolution layers from tensorflow.keras like every other layer
# in this notebook: the `keras.layers.convolutional` module path no longer
# exists in current Keras releases, and layers from the standalone `keras`
# package should not be mixed into a tensorflow.keras Sequential model.
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

# Embedding -> Conv1D -> MaxPooling -> LSTM regressor trained with MSE.
lstm_cnn = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=sent_length),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dense(1),  # single unit, no activation: regression output
])
lstm_cnn.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
lstm_cnn.summary()

In [None]:
# Train the CNN+LSTM model for 5 epochs with batch size 64, reporting the
# loss on the held-out split after each epoch.
lstm_cnn.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
    batch_size=64,
)

In [None]:
# score = model.evaluate(X_test,Y_test)

In [None]:
# print("Accuracy: %.2f%%" % (score[1]*100))

In [None]:
# Error metrics of the CNN+LSTM model on the held-out split.
#
# predict() returns an array of shape (n, 1) because the model ends in
# Dense(1), whereas Y_test has shape (n,).  Subtracting the two directly
# broadcasts to an (n, n) matrix, so the averages were taken over every
# target/prediction combination instead of over matching pairs.  Flatten the
# predictions first, and run inference only once.
y_pred_lstm_cnn = lstm_cnn.predict(X_test).ravel()
diff = Y_test - y_pred_lstm_cnn
mae = np.mean(np.abs(diff))
mse = np.mean(diff ** 2)
rmse = np.sqrt(mse)
print(mae)
print(mse)
print(rmse)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np

train_df = pd.read_csv(train_file)
comments=[str(x) for x in train_df['less_toxic'].tolist()+train_df['more_toxic'].tolist()]
df_t = pd.DataFrame({'comments':comments})


snow = nltk.stem.SnowballStemmer('english')

corpus = []
for i in range(0, len(df_t)):
    review = re.sub('[^a-zA-Z]', ' ', df_t['comments'][i])
    review = review.lower()
    review = review.split()
    
    review = [snow.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    
# df_x = combined_comments
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
type(onehot_repr)

sent_length=400
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

sub_test =np.array(embedded_docs)

In [None]:
sub_pred = model.predict(sub_test)

In [None]:
sub = pd.read_csv(submission)
sub['score'] = sub['score'].rank(method='first')
sub.to_csv('submission.csv', index=False)