# Importing libs

In [None]:
# Basic Pydata Libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt     
import seaborn as sns
import html
import unicodedata

# For reproducibility: get the same results every time you run the notebook
np.random.seed(2021) 


# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC, SVC


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ML
import tensorflow as tf
import keras.backend as K

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Concatenate
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers.merge import concatenate



#string
import string
import re

#nlp
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk import ngrams


stop_words = set(stopwords.words("english"))

# Tweet tokenizer does not split at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer   
from wordcloud import WordCloud, STOPWORDS

## warnings
import warnings
warnings.filterwarnings("ignore")

# Graphics in retina format are more sharp and legible
%config InlineBackend.figure_format = 'retina'

# Importing data

In [None]:
# Folder that holds the competition files.
PATH = '/kaggle/input/jigsaw-toxic-severity-rating/'

# less_toxic / more_toxic comment pairs, the comments to score, and the
# sample submission file.
train = pd.read_csv(f'{PATH}validation_data.csv')
test = pd.read_csv(f'{PATH}comments_to_score.csv')
sub = pd.read_csv(f'{PATH}sample_submission.csv')

In [None]:
train.head()

In [None]:
# Number of comment pairs in the training frame.
rows = len(train)
rows

In [None]:
train.less_toxic[0], train.more_toxic[0]

In [None]:
train.less_toxic[rows-1], train.more_toxic[rows-1]

# Data Preprocessing

In [None]:
def remove_special_chars(text):
    """Collapse long character runs and strip tag-like markup from ``text``.

    Any character repeated three or more times in a row is cut down to two
    occurrences; afterwards every ``<...>`` span is deleted.
    """
    collapsed = re.sub(r"(.)\1\1+", r"\1\1", text)  # e.g. "soooo" -> "soo"
    return re.sub('<.*?>+', '', collapsed)           # remove tags

def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character removed.

    The string is NFKD-normalized first, so a character that decomposes into
    an ASCII base plus combining marks keeps its base; whatever is still
    outside ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered



def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletions = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletions)


def replace_numbers(text):
    """Delete every run of digits from ``text``.

    Despite the name, digits are removed, not spelled out in words.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)


def remove_whitespaces(text):
    """Return ``text`` without leading and trailing whitespace."""
    stripped = text.strip()
    return stripped


def remove_stopwords(words, stop_words):
    """Return the items of ``words`` that are not in ``stop_words``, in order."""
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def stem_words(words):
    """Stem every token in ``words`` with NLTK's Porter stemmer.

    Args:
        words: iterable of token strings.

    Returns:
        A list with one stem per input token, in the same order.
    """
    # Imported here because the notebook's import cell never brings
    # PorterStemmer into scope; without it this function raises NameError.
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_words(words):
    """Lemmatize each token with the lemmatizer's default part of speech.

    Returns a list holding one lemma per item of ``words``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))

def lemmatize_verbs(words):
    """Lemmatize each token as a verb and return them as one string.

    Note the return type: a single space-separated string, not a list.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)

def text2words(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def normalize_text(text):
    """Run the full cleaning pipeline on one raw comment and return a string.

    Steps, in order: collapse repeated characters and strip tags, drop
    non-ASCII characters, delete punctuation, lower-case, delete digits,
    tokenize, drop stop words, lemmatize with the default part of speech,
    then lemmatize as verbs.
    """
    # String-level cleaning: each step takes a str and returns a str.
    for clean in (remove_special_chars, remove_non_ascii, remove_punctuation,
                  to_lowercase, replace_numbers):
        text = clean(text)

    tokens = remove_stopwords(text2words(text), stop_words)
    # Stemming (stem_words) would be the alternative to lemmatizing; it is
    # intentionally not applied.
    tokens = lemmatize_words(tokens)

    # lemmatize_verbs already returns one space-separated string, so joining
    # its characters with '' hands back that same string.
    cleaned = lemmatize_verbs(tokens)
    return ''.join(cleaned)


In [None]:
# Clean every less_toxic comment; keep the result as a list and as a column.
cleaned_lees_toxic = list(map(normalize_text, train['less_toxic']))
train['cleaned_lees_toxic'] = cleaned_lees_toxic
train.head()

In [None]:
# Clean every more_toxic comment; keep the result as a list and as a column.
cleaned_more_toxic = list(map(normalize_text, train['more_toxic']))
train['cleaned_more_toxic'] = cleaned_more_toxic
train.head()

In [None]:
train.cleaned_lees_toxic[0], train.cleaned_more_toxic[0]

In [None]:
train.cleaned_lees_toxic[rows-1], train.cleaned_more_toxic[rows-1]

## Feature Engineering

**Here we create two target "score" columns, one per toxicity level: `0 for less_toxic` and `1 for more_toxic`.**

In [None]:
# One label per row: 0 for the less_toxic side, 1 for the more_toxic side.
less_toxic_score = [0 for _ in range(rows)]
more_toxic_score = [1 for _ in range(rows)]
train['less_toxic_score'] = less_toxic_score
train['more_toxic_score'] = more_toxic_score
# The raw text columns are dropped; the cleaned columns stay.
train.drop(columns=['less_toxic', 'more_toxic'], inplace=True)
train.head()

**We combine cleaned_less_toxic and cleaned_more_toxic into a single `toxic_data` list, and build a matching `target` list that holds the toxic score of each text.**

**Shuffle `toxic_data` and `target` with the same random state, so each text keeps its label; shuffled data is better for modeling.**

In [None]:
# All less-toxic texts first, then all more-toxic ones; labels follow the
# same order.
toxic_data = [*cleaned_lees_toxic, *cleaned_more_toxic]
target = [*less_toxic_score, *more_toxic_score]

In [None]:
toxic_data[:5], target[:5]

In [None]:
toxic_data[-5:], target[-5:]

**Shuffle the data**

In [None]:
import random

# Toy example of shuffling two parallel lists together: zip them into pairs,
# shuffle the pairs, then unzip back into two sequences.
a = ['a', 'b', 'c']
b = [1, 2, 3]

c = list(zip(a, b))
print(c)

# Shuffles the list of pairs in place.
random.shuffle(c)

# zip(*c) yields tuples, so a and b are tuples from here on.
a, b = zip(*c)

print(a)
print(b)


In [None]:
# Shuffle texts and labels as pairs so that each text keeps its label.
# A dedicated seeded generator makes the order reproducible: np.random.seed()
# has no effect on Python's own random module.
shuffled_data = list(zip(toxic_data, target))
random.Random(2021).shuffle(shuffled_data)
# zip(*...) yields tuples, so both names are tuples from here on.
toxic_data, target = zip(*shuffled_data)

In [None]:
toxic_data[:5], target[:5]

In [None]:
toxic_data[-5:], target[-5:]

**Make a new DataFrame for the shuffled data**

In [None]:
# Two-column frame: the shuffled texts and their labels.
df = pd.DataFrame(dict(toxic_text=toxic_data, target=target))
df.head()

# EDA

In [None]:
df.target.value_counts()

In [None]:
sns.countplot(data = df, x= 'target');

## Most frequent words


In [None]:
from collections import Counter

In [None]:
def freq_words(text, score, num):
    """Return the ``num`` most frequent words among rows labelled ``score``.

    ``text`` is a DataFrame with ``toxic_text`` and ``target`` columns. The
    result has a ``word`` and a ``counts`` column, most frequent word first.
    """
    selected = text.loc[text['target'] == float(score), 'toxic_text']

    tally = Counter()
    for sentence in selected:
        tally.update(sentence.split())

    # most_common() without an argument sorts every item by count, descending.
    top = tally.most_common()[:num]
    return pd.DataFrame(top, columns=['word', 'counts'])

def plot_freq(data, st, num):
    """Draw a horizontal bar chart of word counts.

    ``data`` needs ``word`` and ``counts`` columns; ``st`` and ``num`` are only
    used to build the chart title.
    """
    plt.figure(figsize=(12, 6))
    chart_title = f'Top {num} words in {st}'
    sns.barplot(x='counts', y='word', data=data)
    plt.title(chart_title)
    plt.show()


### Frequent words for each `less toxic data`

In [None]:
# How many top entries the frequency tables and plots show.
num = 30
less_toxic_df = freq_words(df, score=0, num=num)
less_toxic_df.transpose()

In [None]:
plot_freq(less_toxic_df, 'less toxic', num)

### Frequent words for each `more toxic data`

In [None]:
# Most frequent single words among the rows labelled 1.
more_toxic_df = freq_words(df, score=1, num=num)
more_toxic_df.transpose()

In [None]:
plot_freq(more_toxic_df, 'more toxic', num)

## Distribution of top n-grams

In [None]:
def get_top_n_gram(corpus, score, n_gram, top_n=None):
    """Return the most frequent word n-grams among rows labelled ``score``.

    N-grams are built inside each text separately, so no n-gram joins the
    last words of one comment with the first words of the next.

    Args:
        corpus: DataFrame with ``toxic_text`` and ``target`` columns.
        score: label value selecting the rows to analyse.
        n_gram: number of consecutive words per n-gram.
        top_n: how many n-grams to return; ``None`` returns all of them.

    Returns:
        DataFrame with a ``word`` column (the n-gram, space-joined) and a
        ``counts`` column, most frequent first.
    """
    sentences = corpus.loc[corpus['target'] == float(score), 'toxic_text']

    counts = Counter()
    for sentence in sentences:
        tokens = sentence.split()
        # Zipping n_gram shifted views of the token list yields every window
        # of consecutive words; a text shorter than n_gram yields none.
        windows = zip(*(tokens[offset:] for offset in range(n_gram)))
        counts.update(' '.join(window) for window in windows)

    ranked = counts.most_common()[:top_n]
    return pd.DataFrame(ranked, columns=['word', 'counts'])


### Two-grams for less toxic data


In [None]:
# Top bigrams among the rows labelled 0.
less_toxic_2_gram_df = get_top_n_gram(df, score=0, n_gram=2, top_n=num)
less_toxic_2_gram_df.transpose()

In [None]:
plot_freq(less_toxic_2_gram_df, 'less toxic', num)

### Two-grams for more toxic data


In [None]:
# Top bigrams among the rows labelled 1.
more_toxic_2_gram_df = get_top_n_gram(df, score=1, n_gram=2, top_n=num)
more_toxic_2_gram_df.transpose()

In [None]:
# Bigram chart for the more-toxic comments; the frame must match the title.
plot_freq(more_toxic_2_gram_df, 'more toxic', num)

### Explanation:

**The data shows that with unigrams there is a large difference between the words of the less-toxic and the more-toxic data, but with bigrams there is no visible difference!**

**So in our TF-IDF step we will pass `(1, 1)` (unigrams only) as `ngram_range` to `TfidfVectorizer`.**



## Word Cloud

In [None]:
from wordcloud import WordCloud

# Two panels side by side: less-toxic cloud left, more-toxic cloud right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[30, 15])

# generate() fills the cloud in place from one long string of all texts.
wordcloud1 = WordCloud(width=600, height=400, background_color='white')
wordcloud1.generate(" ".join(cleaned_lees_toxic))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Less Toxic', fontsize=40)

wordcloud2 = WordCloud(width=600, height=400, background_color='black')
wordcloud2.generate(" ".join(cleaned_more_toxic))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('More Toxic', fontsize=40);

# Modeling

In [None]:
df.head()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['toxic_text'],
    df['target'],
    test_size=0.2,
    random_state=42,
)
tuple(map(len, (X_train, y_train, X_test, y_test)))


**TF-IDF Vectorizer**

In [None]:
# Word-level TF-IDF over unigrams, with the vocabulary capped at 10,000 terms.
tf_idf = TfidfVectorizer(analyzer='word', max_features=10000, ngram_range=(1, 1))
# Learn the vocabulary on the training split only, then reuse it unchanged
# for the held-out split.
X_train = tf_idf.fit_transform(X_train)
X_test = tf_idf.transform(X_test)
# Show the shape of both matrices.
X_train.shape, X_test.shape

**Evaluate Model Function:**

In [None]:
def model_Evaluate(model):
    """Score a fitted classifier on the notebook-level hold-out split.

    Predicts on the global ``X_test``, prints the classification report,
    draws the confusion matrix as an annotated heat map and returns the
    accuracy measured against the global ``y_test``.
    """
    predictions = model.predict(X_test)

    print(classification_report(y_test, predictions))

    matrix = confusion_matrix(y_test, predictions)
    shares = matrix.flatten() / np.sum(matrix)

    categories = ['Non Toxic', 'Toxic']
    cell_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

    # One "<cell name>\n<share of all samples>" label per matrix cell, in
    # row-major order, reshaped to match the 2x2 matrix.
    annotations = []
    for cell_name, share in zip(cell_names, shares):
        percentage = '{0:.2%}'.format(share)
        annotations.append(f'{cell_name}\n{percentage}')
    annotations = np.asarray(annotations).reshape(2, 2)

    sns.heatmap(matrix, annot=annotations, cmap='Blues', fmt='',
                xticklabels=categories, yticklabels=categories)

    plt.xlabel("Predicted values", fontdict={'size': 14}, labelpad=10)
    plt.ylabel("Actual values", fontdict={'size': 14}, labelpad=10)
    plt.title("Confusion Matrix", fontdict={'size': 18}, pad=20)

    return accuracy_score(y_test, predictions)


## 1- Logistic Regression

In [None]:
%%time
# Fit logistic regression on the TF-IDF training matrix, then print the
# report and confusion matrix for the hold-out split.
lr = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
lr.fit(X_train, y_train)
lr_acc = model_Evaluate(lr)  # hold-out accuracy returned by model_Evaluate

## 2- LinearSVC

In [None]:
%%time
# Fit a linear SVM with hinge loss on the TF-IDF training matrix, then print
# the report and confusion matrix for the hold-out split.
SVCmodel = LinearSVC(C= 1, loss = 'hinge')
SVCmodel.fit(X_train, y_train)
SVC_acc = model_Evaluate(SVCmodel)  # hold-out accuracy returned by model_Evaluate

# Submission

In [None]:
test.head()

In [None]:
# The vectorizer was fitted on text cleaned by normalize_text, so the comments
# to score go through the same cleaning before being transformed.
X_test_sub = tf_idf.transform(test['text'].map(normalize_text))
# Column 1 of predict_proba is the probability of class 1 (more toxic).
preds = lr.predict_proba(X_test_sub)[:, 1]

In [None]:
# Keep the class-1 probability as the score. Thresholding it to 0/1 would
# leave only two distinct values, so comments could no longer be ranked by
# severity against each other.
test['score'] = preds

In [None]:
# NOTE(review): this binds a second name to the same DataFrame (no copy), so
# submission still carries every column of test, including 'text' — confirm
# the expected columns against sample_submission.csv (loaded as `sub`).
submission = test
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)

## 3- SVC
It takes a long time to get good results!

In [None]:
%%time
# SVCmodel = LinearSVC(C= 1, loss = 'hinge')
# Kernel SVC with a linear kernel; probability=True enables predict_proba.
# NOTE(review): gamma is presumably unused by the linear kernel — confirm
# against the scikit-learn SVC documentation.
SVM = SVC(kernel = 'linear', gamma = 'auto', probability = True)
SVM.fit(X_train, y_train)
SVM_acc = model_Evaluate(SVM)  # hold-out accuracy returned by model_Evaluate