In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# for data cleaning
import string
from wordcloud import WordCloud,STOPWORDS
from matplotlib.lines import Line2D
# for stopwords Removal
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# for calculating Polarity and Subjectivity
from textblob import TextBlob
from nltk.stem.porter import PorterStemmer
# for regular expressions
# for calculating Polarity and Subjectivity
from textblob import TextBlob

# for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import re
# Pastel cyan / pink colour pair
palette = ["#9bf6ff", "#ffadad"]

# Seaborn "notebook" context with larger fonts and thicker lines
sns.set_context(
    context="notebook",
    font_scale=1.5,
    rc={"lines.linewidth": 2.5},
)

In [None]:
# Load validation_data.csv; its less_toxic / more_toxic columns are used below
df_vall = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
df_vall.head()

In [None]:
# (rows, columns) of the validation frame
df_vall.shape

In [None]:
# One frame per side of each validation pair, tagged with the side it came from
less_toxic = pd.DataFrame({
    'text': df_vall['less_toxic'].tolist(),
    'label': "Less Toxic",
})
more_toxic = pd.DataFrame({
    'text': df_vall['more_toxic'].tolist(),
    'label': "More Toxic",
})

# Stack both sides into a single long frame with a fresh 0..n-1 index
toxicity_text = pd.concat([less_toxic, more_toxic], ignore_index=True)
toxicity_text.head()

## Cleaning Data

In [None]:
# Strip punctuation from the comments
def punctuation_removal(messy_str):
    """Return `messy_str` with every character found in string.punctuation removed."""
    return ''.join(ch for ch in messy_str if ch not in string.punctuation)

# Overwrite the text column with its punctuation-free version
toxicity_text['text'] = toxicity_text['text'].apply(punctuation_removal)

In [None]:
# Remove digits from the comments
import re


def drop_numbers(list_text):
    """Drop every element of `list_text` that contains a digit and concatenate the rest.

    `list_text` is iterated element by element. For a plain string the elements
    are single characters, so the result is the string with its digits removed.
    The survivors are joined without a separator.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return ''.join(item for item in list_text if not re.search(r'\d', item))

# The column holds plain strings here, so this removes the digit characters
toxicity_text['text'] = toxicity_text['text'].apply(drop_numbers)

In [None]:
# First ten comments after the cleaning steps above
toxicity_text['text'].head(10) 

In [None]:
# Load comments_to_score.csv and preview it
df_com = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_com.head()

In [None]:
# Load sample_submission.csv and preview it
submission_csv = pd.read_csv("../input/jigsaw-toxic-severity-rating/sample_submission.csv")
submission_csv.head()


In [None]:
# Descriptive summary of the combined frame (it holds the text and label columns here)
toxicity_text.describe()

In [None]:
# Summary restricted to the object (string) columns
toxicity_text.describe(include = 'object')

In [None]:
# How often each distinct comment text occurs
toxicity_text['text'].value_counts()

## Feature Engineering

### Text Polarity & Text Subjectivity

In [None]:
# Number of characters in each cleaned comment
toxicity_text['length'] = toxicity_text['text'].apply(len)

In [None]:
# Polarity of each comment according to TextBlob
def get_polarity(text):
    """Return the TextBlob sentiment polarity of `text`.

    The text is handed to TextBlob as a plain string. Encoding it to bytes and
    wrapping the result in str() would analyse the bytes repr ("b'...'" with
    escaped non-ASCII characters) instead of the comment itself.
    """
    return TextBlob(str(text)).sentiment.polarity

# Store the polarity of every comment in a new column
toxicity_text['polarity'] = toxicity_text['text'].apply(get_polarity)

In [None]:
# Subjectivity of each comment according to TextBlob
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of `text`.

    The text is handed to TextBlob as a plain string. Encoding it to bytes and
    wrapping the result in str() would analyse the bytes repr ("b'...'" with
    escaped non-ASCII characters) instead of the comment itself.
    """
    return TextBlob(str(text)).sentiment.subjectivity

# Store the subjectivity of every comment in a new column
toxicity_text['subjectivity'] = toxicity_text['text'].apply(get_subjectivity)

In [None]:
# Summary statistics of the three engineered columns
toxicity_text[['length','polarity','subjectivity']].describe()

In [None]:
# Polarity of each comment according to TextBlob.
# NOTE(review): this cell repeats an earlier polarity cell; running both
# recomputes the same column.
def get_polarity(text):
    """Return the TextBlob sentiment polarity of `text`.

    The text is handed to TextBlob as a plain string. Encoding it to bytes and
    wrapping the result in str() would analyse the bytes repr ("b'...'")
    instead of the comment itself.
    """
    return TextBlob(str(text)).sentiment.polarity

# Store the polarity of every comment
toxicity_text['polarity'] = toxicity_text['text'].apply(get_polarity)

In [None]:
# Subjectivity of each comment according to TextBlob.
# NOTE(review): this cell repeats an earlier subjectivity cell; running both
# recomputes the same column.
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity of `text`.

    The text is handed to TextBlob as a plain string. Encoding it to bytes and
    wrapping the result in str() would analyse the bytes repr ("b'...'")
    instead of the comment itself.
    """
    return TextBlob(str(text)).sentiment.subjectivity

# Store the subjectivity of every comment
toxicity_text['subjectivity'] = toxicity_text['text'].apply(get_subjectivity)

In [None]:
## Polarity and subjectivity distributions, side by side

plt.rcParams['figure.figsize'] = (10, 4)

# One row, two panels; each distribution is drawn on its own axes
fig, (ax_polarity, ax_subjectivity) = plt.subplots(1, 2)
sns.distplot(toxicity_text['polarity'], ax=ax_polarity)
sns.distplot(toxicity_text['subjectivity'], ax=ax_subjectivity)

fig.suptitle('Distribution of Polarity and Subjectivity')
plt.show()

In [None]:
# Relationship between polarity and subjectivity

# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
sns.scatterplot(x=toxicity_text['polarity'], y=toxicity_text['subjectivity'])
plt.title('Polarity vs Subjectivity')
plt.show()

## Visualizing the Most Frequent Words

In [None]:
## Visualizing the Most Frequent Words

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over every comment, English stop words excluded
cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(toxicity_text['text'])
sum_words = words.sum(axis=0)  # column totals: one count per vocabulary term

# (term, total count) pairs ordered from most to least frequent
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])

# Bar chart of the 20 highest counts
plt.style.use('fivethirtyeight')
color = plt.cm.ocean(np.linspace(0, 1, 20))
frequency.head(20).plot(
    x='word',
    y='freq',
    kind='bar',
    figsize=(15, 6),
    color=color,
)
plt.title("Most Frequently Occuring Words - Top 20")
plt.show()

## Visualizing the Least Frequent Words

In [None]:
## Visualizing the Least Frequent Words

from sklearn.feature_extraction.text import CountVectorizer

# `frequency` (word counts sorted from most to least frequent) was built in the
# most-frequent-words cell from the same column, so it is reused here instead
# of fitting an identical CountVectorizer a second time.
plt.style.use('fivethirtyeight')
color = plt.cm.ocean(np.linspace(0, 1, 20))

# tail(20) takes the 20 rows with the lowest counts
frequency.tail(20).plot(x='word', y='freq', kind='bar', figsize=(15, 6), color = color)
plt.title("Least Frequently Occurring Words - Bottom 20")
plt.show()

In [None]:
# Visualizing the BiGrams
# modules needed for n-gram counting
import nltk
import collections

# function for making ngrams
from nltk.util import ngrams

# Count bigrams comment by comment. str() of the whole Series would only give
# pandas' truncated display text (index labels, "..." and the dtype footer),
# and joining all comments into one string would create bigrams that straddle
# two unrelated comments.
esBigramFreq = collections.Counter()
for comment in toxicity_text['text'].astype(str):
    esBigramFreq.update(ngrams(comment.split(), 2))

# the ten most common bigrams in the corpus
esBigramFreq.most_common(10)

In [None]:
# Word cloud of the vocabulary, each word sized by its corpus-wide count

cv = CountVectorizer(stop_words='english')
words = cv.fit_transform(toxicity_text['text'])
sum_words = words.sum(axis=0)

# (term, total count) pairs ordered from most to least frequent
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

cloud_builder = WordCloud(background_color='black', width=2000, height=2000)
wordcloud = cloud_builder.generate_from_frequencies(dict(words_freq))

plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.title("Vocabulary from Reviews", fontsize=20)
plt.show()

In [None]:
# Distribution of polarity and subjectivity (wide layout)

plt.rcParams['figure.figsize'] = (15, 4)

plt.subplot(1, 2, 1)
sns.distplot(toxicity_text['polarity'])

plt.subplot(1, 2, 2)
sns.distplot(toxicity_text['subjectivity'])

# The title names the columns actually plotted
plt.suptitle('Distribution of Polarity and Subjectivity \n ', fontsize = 20)
plt.show()



In [None]:
# Mean subjectivity for each polarity value

plt.rcParams['figure.figsize'] = (15, 4)
# x and y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
# NOTE(review): polarity is a continuous score here, so this draws one bar per
# distinct value - consider binning it first.
sns.barplot(x=toxicity_text['polarity'], y=toxicity_text['subjectivity'], palette = 'hot')
plt.grid()
# Labels describe the plotted columns
plt.xlabel('\n Polarity')
plt.ylabel('Mean Subjectivity\n', fontsize = 20)
plt.title('\n Polarity vs Subjectivity \n', fontsize = 20)
plt.show()

## 4. Feature Extraction

In [None]:
## Cleaning the Data

# Built once: creating the stop-word collection for every word and a new
# stemmer for every comment is pure overhead.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

# Walk over every comment instead of a hard-coded number of rows
for raw_text in toxicity_text['text']:
    review = re.sub('[^a-zA-Z]', ' ', raw_text)   ## Keep letters only
    review = review.lower().split()               ## Lower-case and tokenise on whitespace
    review = [ps.stem(word) for word in review if word not in english_stopwords]  ## Drop stop words, stem the rest
    review = ' '.join(review)
    corpus.append(review)

## Cleaning the Text

In [None]:
# The comments still carry noise such as stop words, punctuation and numbers

# Strip punctuation first
def punctuation_removal(messy_str):
    """Return `messy_str` with every character found in string.punctuation removed."""
    return ''.join(ch for ch in messy_str if ch not in string.punctuation)

# NOTE(review): punctuation was already stripped from this column in the first
# cleaning section, so when the notebook runs top to bottom this pass should
# leave the text unchanged.
toxicity_text['text'] = toxicity_text['text'].apply(punctuation_removal)

In [None]:
# Remove the stop words as well

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop = stopwords.words('english')
stop.append("i'm")

# The stop words go through the same punctuation stripping as the comments,
# so contractions such as "i'm" match their stripped form.
stop_words = [punctuation_removal(item) for item in stop]
# Set copy for constant-time membership tests inside the per-token loop
_stop_word_set = frozenset(stop_words)


def stopwords_removal(messy_str):
    """Tokenise `messy_str` and return its lower-cased tokens that are not stop words.

    Returns a list of tokens, not a string.
    """
    lowered = (token.lower() for token in word_tokenize(messy_str))
    return [token for token in lowered if token not in _stop_word_set]


toxicity_text['text'] = toxicity_text['text'].apply(stopwords_removal)

In [None]:
# Remove the tokens that contain digits

import re


def drop_numbers(list_text):
    """Drop every element of `list_text` that contains a digit and join the rest with spaces.

    At this point the text column holds lists of tokens, so whole tokens are
    dropped and the survivors are joined back into one string.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    return ' '.join(token for token in list_text if not re.search(r'\d', token))

# Turns each token list back into a single space-separated string
toxicity_text['text'] = toxicity_text['text'].apply(drop_numbers)

### Calculating the Sentiment from Comments

In [None]:
# The sentiment analyzer used below needs the VADER lexicon, downloaded here through NLTK

import nltk
nltk.download('vader_lexicon')

In [None]:
# Compound VADER score for every comment

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# One 'compound' value per comment, in row order
train_sentiments = np.asarray(
    [sid.polarity_scores(comment).get('compound') for comment in toxicity_text['text']]
)

toxicity_text['sentiment'] = pd.Series(data=train_sentiments)
toxicity_text['sentiment']

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# A single lemmatizer instance shared by every call
lemmatizer = WordNetLemmatizer()


def lemmatise(text):
    """Tokenise `text`, lemmatise each token and join the results with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


toxicity_text['text'] = toxicity_text['text'].apply(lemmatise)

In [None]:
# Raw array of the lemmatised comments
toxicity_text['text'] .values

# Let's calculate the Polarity of the Comments

In [None]:
# Map each comment's TextBlob polarity onto a coarse sentiment label
def get_polarity(text):
    """Return a sentiment label for `text` based on its TextBlob polarity.

    Bands (upper bound inclusive on the positive side, lower bound exclusive
    on the negative side, as before):
        0            -> "Neutral"
        (0, 0.3]     -> "Weakly Positive"
        (0.3, 0.6]   -> "Positive"
        above 0.6    -> "Strongly Positive"
        (-0.3, 0)    -> "Weakly Negative"
        (-0.6, -0.3] -> "Negative"
        -0.6 or less -> "Strongly Negative"

    The lowest band has no lower bound, so a polarity of exactly -1 gets a
    label instead of falling through every branch and returning None.
    """
    pol = TextBlob(str(text)).sentiment.polarity
    if pol == 0:
        return "Neutral"
    # Thresholds are checked from the top down, so each test only needs the
    # lower edge of its band.
    if pol > 0.6:
        return "Strongly Positive"
    if pol > 0.3:
        return "Positive"
    if pol > 0:
        return "Weakly Positive"
    if pol > -0.3:
        return "Weakly Negative"
    if pol > -0.6:
        return "Negative"
    return "Strongly Negative"
    
# Replaces the numeric polarity column with the label returned by get_polarity
toxicity_text['polarity'] = toxicity_text['text'] .apply(get_polarity)

In [None]:
# Number of comments per polarity label
toxicity_text['polarity'].value_counts()

## Show Less Toxic Comments

In [None]:
# colour callback handed to WordCloud
def color_wc(word=None, font_size=None, position=None, orientation=None, font_path=None, random_state=None):
    """Return an "hsl(...)" colour string with fixed hue and saturation.

    Only `random_state` is used: its randint(40, 80) draw is scaled by 100/255
    and truncated to give the lightness percentage. The other parameters are
    accepted and ignored.
    """
    hue = int(360.0 * 255.0 / 255.0)
    saturation = int(190.0 * 255.0 / 255.0)
    lightness = int(100.0 * float(random_state.randint(40, 80)) / 255.0)
    return f"hsl({hue}, {saturation}%, {lightness}%)"


# Word cloud of the "less toxic" side of the validation pairs
fig = plt.gcf()
fig.set_size_inches(16, 8)
wc = WordCloud(stopwords=STOPWORDS,background_color="black", contour_width=2, contour_color='orange',width=1500, height=750,color_func=color_wc ,max_words=150, max_font_size=256,random_state=42)
wc.generate(' '.join(less_toxic['text']))
# Draw without rebinding `fig`, which should keep referring to the figure
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the "more toxic" side of the validation pairs
fig = plt.gcf()
fig.set_size_inches(16, 8)
# contour_color is spelled 'red' ('read' is not a colour name)
wc = WordCloud(stopwords=STOPWORDS,background_color="black", contour_width=2, contour_color='red',width=1500, height=750,color_func=color_wc,max_words=150, max_font_size=256,random_state=42)
wc.generate(' '.join(more_toxic['text']))
# Draw without rebinding `fig`, which should keep referring to the figure
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

## Modeling

In [None]:
import os
import warnings
warnings.filterwarnings("ignore")                     # Silence all warnings from here on

import numpy as np                                  #for large and multi-dimensional arrays
import pandas as pd                                 #for data manipulation and analysis
import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer

from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from gensim.models import Word2Vec                                   #For Word2Vec

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense

In [None]:
# Files from the jigsaw-toxic-severity-rating input directory
submission = "../input/jigsaw-toxic-severity-rating/sample_submission.csv"
test_file = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
train_file = "../input/jigsaw-toxic-severity-rating/validation_data.csv"

# File from the separate cleaned-toxic-comments input directory
cleaned_text = "../input/cleaned-toxic-comments/train_preprocessed.csv"

In [None]:
# Load the pre-processed training file; the comment_text and toxicity columns are used below
cleaned_text_df = pd.read_csv(cleaned_text)
cleaned_text_df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Reuse the frame loaded in the previous cell instead of parsing the CSV again.
# Missing comments (NaN) become empty strings so that every entry handed to the
# tokeniser later is a str; no rows are dropped, so the list stays aligned with
# the target column.
combined_comments = cleaned_text_df['comment_text'].fillna('').astype(str).tolist()

In [None]:
# Show ten of the training comments, each followed by a rule
for i in range(500, 510):
    print(
        combined_comments[i],
        '--------------------------------------------------------------------------------',
        sep='\n',
    )

In [None]:
# Hash every comment into integer word ids, then bring all sequences to one length
voc_size = 5000      # size of the hashing space used by one_hot
sent_length = 400    # length of every sequence after pad_sequences

onehot_repr = [one_hot(comment, voc_size) for comment in combined_comments]

# Padding is added at the front ('pre')
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)
print(embedded_docs)

## Creating model

In [None]:
## Creating model
# Embedding -> Dropout -> LSTM -> Dropout -> single linear output, trained with MSE
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
model.summary()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Integer-encode the `toxicity` column to use as the training target.
# NOTE(review): LabelEncoder replaces every distinct value by its position among
# the sorted unique values; if `toxicity` is a continuous score this changes the
# scale of the regression target - confirm that is intended.
encode = LabelEncoder()
df_y2 = encode.fit_transform(pd.read_csv(cleaned_text)['toxicity'])
type(df_y2)

In [None]:
import numpy as np
# Features: the padded id sequences; target: the encoded toxicity values
X_final=np.array(embedded_docs)
y_final=np.array(df_y2)

In [None]:
# Sanity check: both arrays should have the same number of rows
print(X_final.shape)
print(y_final.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, with a fixed seed
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train the LSTM model for 5 epochs, using the held-out split as validation data
model.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=5,batch_size=64)

In [None]:
# Loss of the LSTM model on the held-out split (the model was compiled with MSE loss)
score = model.evaluate(X_test,Y_test)
score

In [None]:
# Predictions of the LSTM model for the held-out split
y_pred_model = model.predict(X_test)
y_pred_model

In [None]:
# Error metrics of the LSTM model on the held-out split.
# The network ends in Dense(1), so predict() returns one column per sample while
# Y_test is one-dimensional. Flatten the predictions first: subtracting the two
# shapes directly broadcasts to an n x n matrix and averages over every pair
# instead of over matching rows.
y_pred_flat = np.ravel(model.predict(X_test))
residuals = Y_test - y_pred_flat
mae = np.mean(np.abs(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
print(mae)
print(mse)
print(rmse)

In [None]:
# Same package as the other layers in this notebook; mixing standalone `keras`
# layers into a `tensorflow.keras` model is avoided.
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

# Embedding -> Conv1D -> MaxPooling1D -> LSTM -> single linear output, trained with MSE
lstm_cnn=Sequential()
lstm_cnn.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
lstm_cnn.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
lstm_cnn.add(MaxPooling1D(pool_size=2))
lstm_cnn.add(LSTM(100))
lstm_cnn.add(Dense(1))
lstm_cnn.compile(loss='mean_squared_error', optimizer='adam')
# summary() prints the table itself and returns None, so it is not wrapped in print()
lstm_cnn.summary()

In [None]:
# Train the CNN+LSTM model for 5 epochs, using the held-out split as validation data
lstm_cnn.fit(X_train,Y_train,validation_data=(X_test,Y_test),epochs=5,batch_size=64)

In [None]:
# Evaluate the model trained in the previous cell (lstm_cnn), not the earlier LSTM
score = lstm_cnn.evaluate(X_test,Y_test)
score

In [None]:
# Predictions of the CNN+LSTM model (lstm_cnn) for the held-out split
y_pred_model = lstm_cnn.predict(X_test)
y_pred_model

In [None]:
# Error metrics of the CNN+LSTM model on the held-out split.
# The network ends in Dense(1), so predict() returns one column per sample while
# Y_test is one-dimensional. Flatten the predictions first: subtracting the two
# shapes directly broadcasts to an n x n matrix and averages over every pair
# instead of over matching rows.
y_pred_flat = np.ravel(lstm_cnn.predict(X_test))
residuals = Y_test - y_pred_flat
mae = np.mean(np.abs(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)
print(mae)
print(mse)
print(rmse)

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np

train_df = pd.read_csv(train_file)
comments=[str(x) for x in train_df['less_toxic'].tolist()+train_df['more_toxic'].tolist()]
df_t = pd.DataFrame({'comments':comments})


snow = nltk.stem.SnowballStemmer('english')

corpus = []
for i in range(0, len(df_t)):
    review = re.sub('[^a-zA-Z]', ' ', df_t['comments'][i])
    review = review.lower()
    review = review.split()
    
    review = [snow.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    
# df_x = combined_comments
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
type(onehot_repr)

sent_length=400
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

sub_test =np.array(embedded_docs)

In [None]:
sub_pred = model.predict(sub_test)

In [None]:
sub = pd.read_csv(submission)
sub['score'] = sub['score'].rank(method='first')
sub.to_csv('submission.csv', index=False)

In [None]:
# First ten rows of the frame written to submission.csv
sub.head(10)