### Importing important Libraries

In [None]:
# !pip install tqdm

In [None]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

import re
import pandas as pd
from tqdm import tqdm
import time

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Libraries for text Preprocessing
import nltk 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize 
from nltk.stem.porter import PorterStemmer


from wordcloud import WordCloud,STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer

### Importing Datasets

In [None]:
# Read the three competition files in order: the validation pairs, the
# comments that need a score, and the sample submission.
val_df, comment_df, sample_sub = map(pd.read_csv, (
    '../input/jigsaw-toxic-severity-rating/validation_data.csv',
    '../input/jigsaw-toxic-severity-rating/comments_to_score.csv',
    '../input/jigsaw-toxic-severity-rating/sample_submission.csv',
))
# val_df.head()

In [None]:
# val_df.shape
# comment_df.shape
# sample_sub.shape
# comment_df
# sample_sub

### Converting all comments to lower case

In [None]:
def _lowercase_tokens(raw_text):
    """Lower-case each whitespace-separated token and re-join with single spaces."""
    # str() first, so non-string cells are handled through their string form.
    return " ".join(token.lower() for token in str(raw_text).split())


# Each raw text column gets a lower-cased working copy in a new column.
for _frame, _source, _target in (
    (val_df, 'less_toxic', 'pre_process_less_toxic'),
    (val_df, 'more_toxic', 'pre_process_more_toxic'),
    (comment_df, 'text', 'pre_process'),
):
    _frame[_target] = _frame[_source].apply(_lowercase_tokens)
# val_df.head()
# comment_df.head()

### Remove the HTML tags and URLs from the text if present.

In [None]:
from bs4 import BeautifulSoup


def _strip_html_and_urls(text):
    """Return ``text`` with HTML markup removed and ``http...`` URLs deleted.

    The parser is named explicitly: without it BeautifulSoup picks whichever
    parser happens to be installed (and warns), so the extracted text could
    differ between environments. ``html.parser`` ships with Python.
    """
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    # Drop everything from "http" up to the next whitespace character.
    return re.sub(r"http\S+", "", plain_text)


val_df['pre_process_less_toxic'] = val_df['pre_process_less_toxic'].apply(_strip_html_and_urls)
val_df['pre_process_more_toxic'] = val_df['pre_process_more_toxic'].apply(_strip_html_and_urls)
comment_df['pre_process'] = comment_df['pre_process'].apply(_strip_html_and_urls)

# val_df.head()
# comment_df.head()

In [None]:
def contractions(s):
    """Expand common English contractions in the string ``s``.

    The rules are applied one after another in the order listed, so the
    specific forms ("won't", "can't", ...) must stay ahead of the generic
    suffix rules ("n't", "'t", ...) that would otherwise split them wrongly.

    Parameters
    ----------
    s : str
        Text to expand; the notebook passes already lower-cased text.

    Returns
    -------
    str
        The text with every matching contraction replaced.
    """
    rules = (
        (r"won't", "will not"),
        # These two were previously spelled "would't" / "could't".
        (r"wouldn't", "would not"),
        (r"couldn't", "could not"),
        (r"\'d", " would"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s
# Expand contractions in every pre-processed text column.
for _frame, _column in (
    (val_df, 'pre_process_less_toxic'),
    (val_df, 'pre_process_more_toxic'),
    (comment_df, 'pre_process'),
):
    _frame[_column] = _frame[_column].apply(contractions)

# val_df.head()
# comment_df.head()

### Removing Special Characters

In [None]:
_NON_LETTER_RUN = re.compile('[^A-Za-z]+')


def _letters_only(text):
    """Tokenize ``text`` with NLTK and delete every run of non A-Z/a-z characters from each token."""
    # Tokens made only of non-letters become empty strings, which leaves
    # consecutive spaces in the joined result.
    return " ".join(_NON_LETTER_RUN.sub('', token) for token in nltk.word_tokenize(text))


val_df['pre_process_less_toxic'] = val_df['pre_process_less_toxic'].apply(_letters_only)
val_df['pre_process_more_toxic'] = val_df['pre_process_more_toxic'].apply(_letters_only)
comment_df['pre_process'] = comment_df['pre_process'].apply(_letters_only)

# val_df.head()
# comment_df.head()

### Remove the extra spaces between the words

In [None]:
_SPACE_RUN = re.compile(' +')


def _squeeze_spaces(text):
    """Replace every run of one or more space characters with a single space."""
    return _SPACE_RUN.sub(' ', text)


val_df['pre_process_less_toxic'] = val_df['pre_process_less_toxic'].apply(_squeeze_spaces)
val_df['pre_process_more_toxic'] = val_df['pre_process_more_toxic'].apply(_squeeze_spaces)
comment_df['pre_process'] = comment_df['pre_process'].apply(_squeeze_spaces)

### Remove the stop words by using the NLTK package

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Membership is tested once per token, so look words up in a set rather than
# scanning the list each time. `stop` itself stays a list.
_stop_lookup = frozenset(stop)


def _drop_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stop`` removed."""
    return " ".join(word for word in text.split() if word not in _stop_lookup)


val_df['pre_process_less_toxic'] = val_df['pre_process_less_toxic'].apply(_drop_stopwords)
val_df['pre_process_more_toxic'] = val_df['pre_process_more_toxic'].apply(_drop_stopwords)
comment_df['pre_process'] = comment_df['pre_process'].apply(_drop_stopwords)

### Perform lemmatization using the wordnet lemmatizer

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize ``text`` with NLTK, lemmatize each token, and re-join with spaces."""
    return " ".join(map(lemmatizer.lemmatize, nltk.word_tokenize(text)))


for _frame, _column in (
    (val_df, 'pre_process_less_toxic'),
    (val_df, 'pre_process_more_toxic'),
    (comment_df, 'pre_process'),
):
    _frame[_column] = _frame[_column].apply(_lemmatize_text)

# Only the last expression of a cell is rendered, so just comment_df.head() is shown.
val_df.head()
comment_df.head()

In [None]:
# Number of whitespace-separated tokens left in each comment after pre-processing.
comment_df['total_words_pre_process'] = comment_df['pre_process'].str.split().str.len()
comment_df

In [None]:
# Comment ids kept aside as their own Series.
df_id = comment_df['comment_id']

In [None]:
# Per-comment token counts kept aside as their own Series.
df_num = comment_df['total_words_pre_process']
df_num

In [None]:
# Display the validation frame, now including its pre-processed columns.
val_df

### Feature Extraction

In [None]:
# Pre-processed "less toxic" text of each validation row, used for the word clouds below.
less_toxic = val_df['pre_process_less_toxic']
# Pre-processed "more toxic" text of each validation row.
more_toxic = val_df['pre_process_more_toxic']

def wordcloud_draw(data, color = 'black'):
    """Draw a word cloud built from an iterable of text strings.

    ``data`` is joined into one string, tokens that contain 'http', start
    with '@' or '#', or equal 'RT' are discarded, and the rest is rendered
    with ``color`` as the background colour. The figure is shown
    immediately; nothing is returned.
    """
    def _is_wanted(token):
        # Reject links, mentions, hashtags and the retweet marker.
        return not ('http' in token
                    or token.startswith('@')
                    or token.startswith('#')
                    or token == 'RT')

    corpus = ' '.join(data)
    filtered_text = " ".join(filter(_is_wanted, corpus.split()))

    cloud = WordCloud(
        stopwords=STOPWORDS,
        background_color=color,
        width=2500,
        height=2000,
    ).generate(filtered_text)

    plt.figure(1, figsize=(13, 13))
    plt.imshow(cloud)
    plt.axis('off')
    plt.show()
    
# Word cloud of the pre-processed less-toxic texts on a white background.
print("Less Toxic words")
wordcloud_draw(less_toxic,'white')

In [None]:
# Extend the NLTK English stop-word list with a few extra words and remove
# them from the less-toxic column only.
new_words = ['nipple', 'fucksex', 'shit']
stop1 = stopwords.words('english')
stop1.extend(new_words)
# A set makes the per-token membership test constant time; stop1 stays a list.
_stop1_lookup = frozenset(stop1)
val_df['pre_process_less_toxic'] = val_df['pre_process_less_toxic'].apply(
    lambda text: " ".join(word for word in text.split() if word not in _stop1_lookup)
)

In [None]:
# Re-read the column before drawing: `less_toxic` was captured before the
# extra stop words were removed, and assigning a new column to val_df does
# not update that older Series, so the cloud would otherwise be unchanged.
less_toxic = val_df['pre_process_less_toxic']
print("Less Toxic words")
wordcloud_draw(less_toxic,'white')

In [None]:
# Word cloud of the pre-processed more-toxic texts (default black background).
print("More Toxic words")
wordcloud_draw(more_toxic)

### Merging the three text columns into one Series

In [None]:
# Size of the validation frame before its text columns are stacked.
val_df.shape

In [None]:
# Stack the less-toxic texts on top of the more-toxic texts (row-wise concatenation of two Series).
df_model = pd.concat([val_df['pre_process_less_toxic'], val_df['pre_process_more_toxic']], axis=0)
df_model.shape

In [None]:
# Append the comments to score. Use the cleaned 'pre_process' column rather
# than the raw 'text', so these rows go through the same normalisation as the
# validation texts already stacked in df_model.
df_model = pd.concat([df_model, comment_df['pre_process']], axis=0)
df_model.shape

In [None]:
print('TFIDF Vectorizer……')
from sklearn.feature_extraction.text import TfidfVectorizer
# Keep at most 5000 terms; ignore terms found in more than 80% of the
# documents or in fewer than 5 documents.
vectorizer = TfidfVectorizer(max_features=5000,
                             max_df=0.8,
                             min_df=5)
# fit_transform already returns the TF-IDF matrix for df_model, so the corpus
# does not need a second transform() pass over the same data.
tf_df_1 = vectorizer.fit_transform(df_model)
# Both names are kept; they refer to the same matrix.
tf_less_toxic = tf_df_1
tf_df_1.shape

In [None]:
# Wrap the TF-IDF sparse matrix in a pandas DataFrame with sparse columns.
df_sprs = pd.DataFrame.sparse.from_spmatrix(tf_df_1)
df_sprs

In [None]:
# The stacked corpus is: less-toxic rows, more-toxic rows, then the comments
# to score. The first 2 * len(val_df) rows are therefore the training block;
# deriving the boundary avoids hard-coding the dataset size.
n_train_rows = 2 * len(val_df)
df_train = df_sprs.iloc[:n_train_rows, :]
df_test = df_sprs.iloc[n_train_rows:, :]
print('Train Shape: ', df_train.shape)
print('Test Shape: ', df_test.shape)

In [None]:
# The training block holds the less-toxic rows followed by an equal number of
# more-toxic rows, so the midpoint separates them.
n_pairs = len(df_train) // 2
# .copy() gives independent frames: later cells add columns to them, which
# would otherwise be an assignment on a slice of df_train.
df_train_LT = df_train.iloc[:n_pairs, :].copy()
df_train_MT = df_train.iloc[n_pairs:, :].copy()
print('Less Toxic Train Shape: ',df_train_LT.shape)
print('More Toxic Train Shape: ', df_train_MT.shape)

In [None]:
# Row-wise total over every column currently in the less-toxic frame.
df_train_LT['Sum_1'] = df_train_LT.sum(axis=1)

In [None]:
# Row-wise total over every column currently in the more-toxic frame.
df_train_MT['Sum_2'] = df_train_MT.sum(axis=1)

In [None]:
# Linearly rescale the less-toxic row sums from [min, max] onto [0.1, 0.5].
_lt_sum = df_train_LT['Sum_1']
_lt_sum_range = [_lt_sum.min(), _lt_sum.max()]
df_train_LT['new_score'] = np.interp(_lt_sum, _lt_sum_range, [.1, .5])
df_train_LT

In [None]:
# Linearly rescale the more-toxic row sums from [min, max] onto [0.6, 1].
_mt_sum = df_train_MT['Sum_2']
_mt_sum_range = [_mt_sum.min(), _mt_sum.max()]
df_train_MT['new_score'] = np.interp(_mt_sum, _mt_sum_range, [.6, 1])
df_train_MT

In [None]:
# Keep the rescaled scores as regression targets before the helper columns are dropped.
y_train_1 = df_train_LT['new_score']
y_train_2 = df_train_MT['new_score']

In [None]:
# NOTE: these are aliases, not copies — in-place changes made through
# df_LT / df_MT also change df_train_LT / df_train_MT.
df_LT = df_train_LT
df_MT = df_train_MT

In [None]:
# Preview the less-toxic frame.
df_LT.head()

In [None]:
# Preview the more-toxic frame.
df_MT.head()

In [None]:
# Remove the helper columns in place so only the feature columns remain.
df_LT.drop(columns=['Sum_1', 'new_score'], inplace=True)
df_MT.drop(columns=['Sum_2', 'new_score'], inplace=True)

In [None]:
# Features and targets: less-toxic rows first, more-toxic rows after them.
X = pd.concat([df_LT, df_MT])
y = pd.concat([y_train_1, y_train_2])
for _label, _obj in (('X Shape: ', X), ('y Shape: ', y)):
    print(_label, _obj.shape)

In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=48)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.svm import SVR

In [None]:
# Training-set dimensions, reused by the adjusted R2 calculations below.
nrow = X_train.shape[0]
ncol = X_train.shape[1]
print('No of Row: ', nrow)
print('No of Columns: ', ncol)

In [None]:
def scores(i):
    """Fit one regressor class and record its hold-out metrics.

    ``i`` is an estimator class; it is called with no arguments to build the
    model. The model is fitted on the notebook-level ``X_train``/``y_train``
    and evaluated on ``X_test``/``y_test``. Nothing is returned; one value is
    appended to each notebook-level list: ``s`` (R2), ``s1`` (adjusted R2),
    ``s2`` (100 - MAPE), ``s3`` (MAPE), ``s4`` (sMAPE), ``s5`` (MAE),
    ``s6`` (MSE) and ``s7`` (RMSE).
    """
    estimator = i()
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)

    r2 = r2_score(y_test, predicted)
    s.append(r2)
    # Adjusted R2 uses the notebook-level nrow / ncol.
    s1.append(1 - (((1-r2)*(nrow-1))/(nrow-1-ncol)))

    residual = y_test - predicted
    abs_residual = abs(residual)

    # Percentage errors: symmetric (against the mean of truth and prediction)
    # and plain (against the truth).
    midpoint = (y_test + predicted)/2
    smape = np.mean((abs_residual/midpoint)*100)
    mape = 100 * np.mean(abs_residual / y_test)
    s2.append(100 - mape)
    s3.append(mape)
    s4.append(smape)

    s5.append(abs_residual.mean())

    mse = (residual**2).mean()
    s6.append(mse)
    s7.append(np.sqrt(mse))

# Estimator classes to evaluate; other candidates are currently disabled.
algos = [LinearRegression]#,RandomForestRegressor, DecisionTreeRegressor]
# One fresh result list per metric (see scores() for which list holds what).
s, s1, s2, s3, s4, s5, s6, s7 = ([] for _ in range(8))
for regressor_cls in algos:
    scores(regressor_cls)

In [None]:
# One row per evaluated estimator. The method names come from `algos` and the
# metric columns are the full result lists, so the table stays in step with
# whatever `algos` contains instead of being hard-coded to a single entry.
models = pd.DataFrame({
    'Method': [algo.__name__ for algo in algos],
    'r2 Scores': s,
    'Ajd r2 Score': s1,
    'Accuracy': s2,
    'MAPE': s3,
    'SMAPE': s4,
    'MAE': s5,
    'MSE': s6,
    'RMSE': s7,
})
models.sort_values(by='r2 Scores', ascending=False)

In [None]:
# Fit a random forest on the training split and report hold-out metrics.
# random_state is fixed so repeated runs build the same forest and print the
# same scores.
rfr_Model = RandomForestRegressor(random_state=48)
rfr_Model.fit(X_train, y_train)
y_pred = rfr_Model.predict(X_test)
r2Score = r2_score(y_test, y_pred)
print('R2 Score',r2Score)         # earlier unseeded runs: 0.9730024554109795,    0.973003845604028,    0.9781682423157221
print('--------------------------------------------------------------')
# Adjusted R2, computed in one expression so no temporaries are left behind;
# in particular the notebook-level target vector `y` is no longer overwritten.
adj_r2_score = 1 - (((1 - r2Score) * (nrow - 1)) / (nrow - 1 - ncol))
print('Adjusted r2 Score',adj_r2_score)
print('--------------------------------------------------------------')
# Absolute errors are computed once and reused by sMAPE, MAPE and MAE.
errors = abs(y_test - y_pred)
err = (y_test + y_pred)/2
smape = np.mean((errors/err)*100)
print('sMAPE',smape)
print('--------------------------------------------------------------')
mape = 100 * np.mean(errors / y_test)
accuracy = 100 - mape
print('Accuracy = {:0.2f}%.'.format(accuracy))
print('--------------------------------------------------------------')
print('MAE', errors.mean())
print('--------------------------------------------------------------')
mse = ((y_test - y_pred)**2).mean()
print('RMSE: ', np.sqrt(mse))
print('--------------------------------------------------------------')
print('MSE: ', mse)

In [None]:
# Predict a score for every row of df_test with the fitted random forest.
df_test_preds = rfr_Model.predict(df_test)

In [None]:
# Wrap the predictions in a single-column DataFrame named 'score'.
score = pd.DataFrame(df_test_preds, columns=['score'])

In [None]:
# Comment ids as a single-column DataFrame.
# NOTE(review): the name `id` shadows Python's builtin id() for the rest of
# the notebook; consider renaming it together with the concat cell that uses it.
id = pd.DataFrame(comment_df['comment_id'], columns=['comment_id'])

In [None]:
# Put ids and predicted scores side by side; both indexes are reset first so
# the rows line up by position.
df_result = pd.concat([id.reset_index(drop=True), score.reset_index(drop=True)], axis=1)

In [None]:
# Write the submission file without the index column.
df_result.to_csv('submission.csv', index=False)

In [None]:
# vectorizer = TfidfVectorizer()
# model = vectorizer.fit_transform(val_df['pre_process_less_toxic'])
    
#     # We use the inverse_transform which returns the 
#     # terms per document with nonzero entries
# inverse_model = vectorizer.inverse_transform(model)
    
#     # Each line in the inverse model corresponds to a document 
#     # and contains a list of feature names (the terms).
#     # As we want to rank the documents we tranform the list 
#     # of feature names to a number of features
#     # that each document is represented by.
# inverse_model_count = list(map(lambda doc_vec: len(doc_vec), inverse_model))
    
#     # As we are going to sort the list, we need to keep track of the 
#     # document id (its index in the corpus), so we create tuples with 
#     # the list index of each item before we sort the list.
# inverse_model_count_tuples = list(zip(range(len(inverse_model_count)),
#                                           inverse_model_count))
    
#     # Then we sort the list by the count of terms 
#     # in each document (the second component)
# max_features = 10000
# top_documents_tuples = sorted(inverse_model_count_tuples, 
#                                   key=lambda item: item[1], 
#                                   reverse=True)[:max_features]
    
#     # We are interested only in the document ids (the first tuple component)
# top_documents, _ = zip(*top_documents_tuples)
    
#     # Having the top_documents ids we can slice the initial model 
#     # to keep only the documents indicated by the top_documents list
# reduced_model = model[top_documents]

In [None]:
# print('TFIDF Vectorizer……')
# tf_more_toxic = vectorizer.fit_transform(val_df['pre_process_more_toxic'])
# tf_comment_df_2 = vectorizer.transform(comment_df['pre_process'])
# comment_df.shape
# tf_comment_df_2.shape

In [None]:
# tf_comment_df_2.todense()

In [None]:
# df_1 = pd.DataFrame.sparse.from_spmatrix(tf_comment_df_1)
# df_2 = pd.DataFrame.sparse.from_spmatrix(tf_comment_df_2)
# df_1.head()
# df_2.head()

In [None]:
# df_1['Sum_1'] = df_1.loc[ : ,].sum(axis = 1)
# df_1

In [None]:
# df_1_sum = df_1['Sum_1']

In [None]:
# Disabled: `df_2` is only created in a commented-out cell, so running this
# line raised NameError on a fresh Run All.
# df_2['Sum_2'] = df_2.loc[ : ,].sum(axis = 1)
# df_2

In [None]:
# df_2_sum = df_2['Sum_2']

In [None]:
# df_final = pd.concat([df_id.reset_index(drop=True), df_num.reset_index(drop=True), df_1_sum.reset_index(drop=True), df_2_sum.reset_index(drop=True)], axis=1)
# df_final.head(50)

In [None]:
# df_final['less_tox'] = (df_final['Sum_1'])/10
# df_final['more_tox'] = (df_final['Sum_2'])/10
# df_final.head()

In [None]:
# submit_df = df_final[['comment_id', 'more_tox']]
# submit_df.head()

In [None]:
# submit_df['more_tox'] = submit_df['more_tox'] + 0.1

In [None]:
# submit_df = submit_df.rename(columns = {'more_tox': 'score'}, inplace = False)
# submit_df.head()

In [None]:
# df_final.drop('less_tox', inplace=True, axis=1)
# df_final['less_tox'] = abs((df_final['total_words_pre_process'] * df_final['Sum_1'])/100 - 10)
# df_final

In [None]:
# df_final['more_tox'] = abs((df_final['total_words_pre_process'] * df_final['Sum_2'])/100 - 10)
# df_final

In [None]:
# df_final['score'] = df_final.apply(lambda x: x['Column1'] if x['Column1'] <=
#                      x['Column2'] and x['Column1']
#                      <= x['Column3'] else np.nan, axis=1)

In [None]:
# def que(x):
#     if df_final['less_tox'] > df_final['more_tox']:
#         return (10 - df_final['less_tox'])/10
#     else:
#         return (df_final['more_tox']/10)
    
# df_final['score'] = df_final.apply(que, axis=1)

### Removing Punctuation, Numbers, and Special Characters

In [None]:
# val_df['less_toxic_1'] = val_df['less_toxic'].str.replace('[^a-zA-Z#]+', ' ')
# val_df['more_toxic_1'] = val_df['more_toxic'].str.replace('[^a-zA-Z#]+', ' ')
# comment_df['text_1'] = comment_df['text'].str.replace('[^a-zA-Z#]+', ' ')
# val_df.head()

### Removing Stopwords

In [None]:
# nltk.download('stopwords')

In [None]:
# stop = stopwords.words('english')
# stop_words = set(stopwords.words('english'))

# processed = val_df['more_toxic_1'].apply(lambda x: ' '.join(txt for txt in x.split() if txt not in stop_words))
# val_df['more_toxic_1'] = processed

# processed_1 = val_df['less_toxic_1'].apply(lambda x: ' '.join(txt for txt in x.split() if txt not in stop_words))
# val_df['less_toxic_1'] = processed_1

# processed_2 = comment_df['text_1'].apply(lambda x: ' '.join(txt for txt in x.split() if txt not in stop_words))
# comment_df['text_1'] = processed_2
# val_df.head()

### Tokenization

In [None]:
# val_df['less_toxic_1'] = val_df['less_toxic_1'].str.replace('#', '')
# tokenize = val_df['less_toxic_1'].apply(lambda x: x.split())
# val_df['tokens_less_toxic'] = tokenize

# val_df['more_toxic_1'] = val_df['more_toxic_1'].str.replace('#', '')
# tokenize = val_df['more_toxic_1'].apply(lambda x: x.split())
# val_df['tokens_more_toxic'] = tokenize

# comment_df['text_1'] = comment_df['text_1'].str.replace('#', '')
# tokenize = comment_df['text_1'].apply(lambda x: x.split())
# comment_df['text_tokens'] = tokenize
# val_df.head()

In [None]:
# data = val_df['less_toxic_1']
# sentences = []
# vocab = []
# for sent in data:
#     for i in sent:
#         x = word_tokenize(sent)
#         sentence = [w.lower() for w in x if w.isalpha() ]
#         sentences.append(sentence)
#         for word in sentence:
#             if word not in vocab:
#                 vocab.append(word)
 
# #number of words in the vocab
# len_vector = len(vocab)

In [None]:
# def get_vocab(data):
#     data = data
#     sentences = []
#     vocab = []
#     for sent in data:
#         for i in sent:
#             x = word_tokenize(sent)
#             sentence = [w.lower() for w in x if w.isalpha() ]
#             sentences.append(sentence)
#             for word in sentence:
#                 if word not in vocab:
#                     vocab.append(word)
                    
#     return vocab

In [None]:
# for i in tqdm(val_df['less_toxic_1'], total=15000):
#     less_toxic_vocab = get_vocab(val_df['less_toxic_1'])
#     time.sleep(0.3)

In [None]:
# less_toxic_vocab = get_vocab(val_df['less_toxic_1'])
# for i in tqdm([
#     time.sleep(0.3)
# # more_toxic_vocab = get_vocab(val_df['more_toxic_1'])

In [None]:
# Disabled: `less_toxic_vocab` is only assigned in commented-out cells, so
# evaluating it raised NameError on a fresh Run All.
# less_toxic_vocab

### Stemming

In [None]:
# stemmer = PorterStemmer()

# stem_words = val_df['tokens_less_toxic'].apply(lambda x: [stemmer.stem(i) for i in x])
# val_df['stem_words_less_toxic'] = stem_words
# val_df.head()

# stem_words = val_df['tokens_more_toxic'].apply(lambda x: [stemmer.stem(i) for i in x])
# val_df['stem_words_more_toxic'] = stem_words

# stem_words = comment_df['text_tokens'].apply(lambda x: [stemmer.stem(i) for i in x])
# comment_df['text_stem_words'] = stem_words
# val_df.head()

In [None]:
# comment_df.head()

#### Removing single-character words from the text/comment columns

In [None]:
# def rejoin_words(row):
#     my_list = row['stem_words_less_toxic']
#     joined_words = ( " ".join(my_list))
#     return joined_words
# val_df['processed_LT'] = val_df.apply(rejoin_words, axis=1)

# def rejoin_words_1(row):
#     my_list = row['stem_words_more_toxic']
#     joined_words = ( " ".join(my_list))
#     return joined_words
# val_df['processed_MT'] = val_df.apply(rejoin_words_1, axis=1)


# def rejoin_words_2(row):
#     my_list = row['text_stem_words']
#     joined_words = ( " ".join(my_list))
#     return joined_words
# comment_df['processed'] = comment_df.apply(rejoin_words_2, axis=1)

In [None]:
# val_df['processed_LT'] = val_df['processed_LT'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 1]))
# val_df['processed_MT'] = val_df['processed_MT'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 1]))
# comment_df['processed'] = comment_df['processed'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 1]))

# val_df['processed_LT']

In [None]:
# def word_extraction(sentence):
#     ignore = ['a', "the", "is"]
#     words = re.sub("[^\w]", " ",  sentence).split()
#     cleaned_text = [w.lower() for w in words if w not in ignore]
#     return cleaned_text

In [None]:
# tfidf_vect = TfidfVectorizer(min_df=0.0001, max_features=5000)
# X_tfidf = tfidf_vect.fit_transform(val_df['processed_LT'])
# X_features =  pd.DataFrame(X_tfidf.toarray())
# X_features