## Stratégie : régression sur le vecteur donné directement dans le jeu d'entraînement, puis application des scores sur la prédiction.

In [None]:
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

from gensim.parsing.preprocessing import remove_stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

### Cleaning text

In [None]:

lancaster = LancasterStemmer()

# Patterns are compiled once here instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Normalizes a raw comment into a string of stemmed tokens for NLP.
    Operations, in order:-
    1. Removes embedded URL links (http://, https:// and www. forms)
    2. Removes HTML tags (only the text extracted by BeautifulSoup is kept)
    3. Replaces every character that is not an ASCII letter or a digit with
       a space (special characters, punctuation, emojis, ...)
    4. Collapses runs of spaces and trims both ends
    5. Removes stopwords (gensim remove_stopwords)
    6. Tokenizes (NLTK word_tokenize) and stems every token with the
       Lancaster stemmer

    text - Text piece to be cleaned.

    Returns the stemmed tokens joined by single spaces, without leading or
    trailing whitespace; an empty string when no token is left.
    '''
    # URLs are removed first, before the markup is parsed.
    text = _URL_PATTERN.sub(r'', text)

    # Keep only the textual content of any HTML markup.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _NON_ALNUM_PATTERN.sub(" ", text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text).strip()
    text = remove_stopwords(text)

    # Joining with a separator avoids the dangling trailing space produced by
    # appending " " after every stem.
    return " ".join(lancaster.stem(word) for word in word_tokenize(text))
    

### Features weights

In [None]:
# Per-label weights; later cells multiply the predicted label values by these
# and sum them into a single score per comment.
FEATURE_WTS = dict(
    toxic=0.32,
    severe_toxic=1.5,
    obscene=0.16,
    threat=1.5,
    insult=0.64,
    identity_hate=1.5,
)

# Label names, in the same order as the weights.
FEATURES = [*FEATURE_WTS]
FEATURES

### Get train datasets and pick only a sample of non toxic comments

In [None]:
# Load the labelled training set and add y = sum of the label columns from
# 'toxic' through 'identity_hate' (0 means no label is set on the row).
old_train = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
).assign(y=lambda frame: frame.loc[:, 'toxic':'identity_hate'].sum(axis=1))

# Keep every row with at least one label, plus an equally sized random sample
# (fixed seed) of rows without any label.
pos = old_train[old_train['y'] > 0]
neg = old_train[old_train['y'] == 0].sample(n=len(pos), random_state=201)
old_train = pd.concat([pos, neg])
old_train

### Add the labelled toxic comments from the old competition's test set

In [None]:
def read_old_test():
    """Load the old challenge's test comments together with their labels.

    Reads ``test.csv`` and ``test_labels.csv``, left-joins the labels onto the
    comments on ``id``, turns every ``-1`` value into NaN and drops each row
    that contains a NaN.

    Returns:
        The merged DataFrame restricted to the rows that survive the filter.
    """
    comments = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv')
    labels = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
    merged = comments.merge(labels, how='left', on='id')
    without_unlabelled = merged.replace(-1, np.nan)
    return without_unlabelled.dropna()

old_test = read_old_test()

# Disabled alternative: a FEATURE_WTS-weighted y, normalised by its maximum.
#for feat, wt in FEATURE_WTS.items(): 
#    old_test.y += wt * old_test[feat]
#old_test.y = old_test.y / old_test.y.max()

# y = sum of the label columns; only rows with at least one label are kept.
old_test['y'] = old_test.loc[:, 'toxic':'identity_hate'].sum(axis=1)
old_test_pos = old_test.loc[old_test['y'] > 0]

# Append those rows to the balanced sample of the old training set.
train = pd.concat([old_train, old_test_pos])

### Drop y

In [None]:
# y was only used to select rows; remove the column from the training frame.
train = train.drop(columns=['y'])
train

### Cleaning...

In [None]:
# Register `progress_apply` on pandas objects, then replace every comment with
# its cleaned form.
tqdm.pandas()
train['comment_text'] = train['comment_text'].progress_apply(text_cleaning)
train

### Choose a vectorizer (second is for testing)

In [None]:
#vec = TfidfVectorizer(
#        min_df=3, max_df=0.5, 
#        analyzer='char_wb', ngram_range = (3,5), 
#        lowercase=True, max_features=50000,
#    )
#X_train = vec.fit_transform(train['comment_text'])
#y_train = train.loc[:, 'toxic':'identity_hate']

In [None]:
# TF-IDF vectorizer: terms found in more than half of the comments
# (max_df=0.5) or in fewer than 3 comments (min_df=3) are ignored.
vec = TfidfVectorizer(lowercase=True, min_df=3, max_df=0.5)

# Features: sparse TF-IDF matrix fitted on the cleaned training comments.
X_train = vec.fit_transform(train.comment_text)

# Targets: the label columns from 'toxic' through 'identity_hate'.
y_train = train.loc[:, 'toxic':'identity_hate']

In [None]:
#X_train, X_test, y_train, y_test = \
#    sklearn.model_selection.train_test_split(train['comment_text'], train.loc[:, 'toxic':'identity_hate'],
#                                    test_size=0.20,
#                                     random_state=0
#                                    )

#X_train = vec.fit_transform(X_train)
#X_test = vec.transform(X_test)



In [None]:
# Display the TF-IDF feature matrix and the target frame.
X_train,y_train

### Building the network (a stack of dense layers, named `my_rnn`) and callbacks

In [None]:
# Learning rate handed to the Adam optimizer.
l_r = 0.005


def scheduler(epoch, lr):
    """Learning-rate schedule: return the current rate divided by 8.

    epoch - Epoch index passed by the callback (not used).
    lr    - Current learning rate.

    NOTE(review): the callback presumably applies this at the start of every
    epoch, the first one included, so training may never run at `l_r` itself
    -- confirm against the Keras documentation.
    """
    return lr/8


# Stop as soon as val_loss fails to improve by at least 1e-4 for one epoch
# and restore the best weights seen.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=1,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

optim = tf.keras.optimizers.Adam(learning_rate=l_r)

# Despite its name, my_rnn is a feed-forward stack of Dense layers:
# 1200 -> 200 -> 36 ReLU units followed by 6 outputs without activation.
my_rnn = Sequential([
    Dense(1200, activation='relu'),
    Dense(200, activation='relu'),
    Dense(36, activation='relu'),
    Dense(6),
])
my_rnn.compile(loss="mse", optimizer=optim, metrics=["mse"])

In [None]:
# Keras takes the `validation_split` fraction from the END of the arrays,
# before any shuffling. The training frame was built by concatenating frames
# in class order, so the rows are permuted first (fixed seed, reproducible)
# to make the validation slice a random sample rather than the
# last-concatenated rows only.
shuffled_rows = np.random.RandomState(0).permutation(X_train.shape[0])

with tf.device('/CPU:0'):
    model_info = my_rnn.fit(
        X_train[shuffled_rows].toarray(),        # densify the permuted sparse rows
        y_train.iloc[shuffled_rows].to_numpy(),  # targets in the same row order
        epochs=10,
        batch_size=30,
        verbose=1,
        validation_split=0.2,
        callbacks=[lr_scheduler, early_stop],
    )

In [None]:
#print("Evaluate on test data")
#with tf.device('/CPU:0'):
#    results = my_rnn.evaluate(X_test.toarray(), y_test, batch_size=50)
#print("test loss, test mse:", results)

In [None]:
# Sanity check: print the prediction for the first training sample.
# The sparse matrix is sliced BEFORE densifying, so only one row is
# materialised instead of the whole matrix.
with tf.device('/CPU:0'):
    first_row = X_train[0:1].toarray()
    print(my_rnn.predict(first_row))

### Validation ...

In [None]:
# Load the validation pairs; the `less_toxic` and `more_toxic` columns are
# the ones used in the cells below.
val = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [None]:
# Display the validation frame as loaded (before cleaning).
val

In [None]:
# Run both sides of every pair through the same cleaning as the training text.
for side in ('less_toxic', 'more_toxic'):
    val[side] = val[side].progress_apply(text_cleaning)

In [None]:
# Display the validation frame after cleaning.
val

In [None]:
# Vectorize both sides with the TF-IDF vocabulary fitted on the training text
# (transform only, no refit).
lt_vec, mt_vec = map(vec.transform, (val['less_toxic'], val['more_toxic']))

In [None]:
# Predict the label values for the less_toxic (p1) and more_toxic (p2) side.
with tf.device('/CPU:0'):
    p1, p2 = (
        my_rnn.predict(sparse_side.toarray())
        for sparse_side in (lt_vec, mt_vec)
    )


In [None]:
# Display the predictions for the less_toxic comments.
p1

In [None]:
# Weight vector, in the insertion order of FEATURE_WTS.
f = np.array(tuple(FEATURE_WTS.values()))
f

In [None]:
# Display the predictions for the more_toxic comments.
p2

In [None]:
# Score of each less_toxic comment: weighted sum of its predicted label values.
# f is broadcast over the rows of p1 and the product is reduced along axis 1,
# replacing the per-row Python sum() loop with one vectorized operation.
f1 = (f * p1).sum(axis=1)
f1

In [None]:
# Score of each more_toxic comment: weighted sum of its predicted label values.
# f is broadcast over the rows of p2 and the product is reduced along axis 1,
# replacing the per-row Python sum() loop with one vectorized operation.
f2 = (f * p2).sum(axis=1)
f2

In [None]:
# Fraction of validation pairs in which the less_toxic comment receives the
# lower score.
(f1<f2).mean()

## Prepare submission

In [None]:
# Score the comments of the submission file with the same pipeline:
# clean -> TF-IDF transform -> predict -> weighted sum of the label values.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
sub['text'] = sub['text'].progress_apply(text_cleaning)
p = my_rnn.predict(vec.transform(sub['text']).toarray())

# Vectorized weighted sum (f broadcast over the rows of p) instead of a
# per-row Python sum() loop.
sub['score'] = (f * p).sum(axis=1)
sub

In [None]:
# Write only the comment id and its score, without the index column.
sub[['comment_id', 'score']].to_csv('submission.csv', index=False)