In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re 
from scipy import sparse
import time
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns=100

from sklearn.linear_model import Ridge

import pickle as pkl

# Loading Data 

In [None]:
# Load the training table; later cells read its `text` column (and the
# commented-out training cells read `y`).
df = pd.read_csv('../input/jigsaw-regression-based-data/train_data_version2.csv')
# Preview the first rows.
df.head()

<h3>Text Cleaning</h3>

In [None]:
def text_cleaning(text):
    '''
    Reduce a raw comment to alphanumeric words separated by single spaces.

    Steps, in the order they are applied:
    1. Drop embedded URL links (http://, https:// and www. prefixed)
    2. Strip HTML tags, keeping only the text content
    3. Delete emoji / pictograph code points
    4. Replace every character that is not a-z, A-Z or a digit with a space
    5. Collapse runs of spaces and trim both ends

    text - Text piece to be cleaned.

    Returns the cleaned string (possibly empty).
    '''
    # 1. Website links
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML tags (parsed with the lxml backend)
    plain = BeautifulSoup(without_links, 'lxml').get_text()

    # 3. Emojis; matches are deleted outright, not replaced by a space
    emoji_ranges = ("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+")
    without_emoji = re.sub(emoji_ranges, r'', plain, flags=re.UNICODE)

    # 4. Special characters become spaces
    spaced = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Squeeze repeated spaces, then trim the ends
    return re.sub(' +', ' ', spaced).strip()

# TF-IDF

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis = 0)

In [None]:
# Summarise columns, dtypes and non-null counts after dropping incomplete rows.
df.info()

In [None]:
# TF-IDF over character n-grams, fitted on the training text.
vec = TfidfVectorizer(
    analyzer='char_wb',    # character n-grams built inside word boundaries
    ngram_range=(3, 5),    # n-gram lengths 3 to 5
    min_df=3,              # drop n-grams found in fewer than 3 documents
    max_df=0.5,            # drop n-grams found in more than half of the documents
    max_features=46000,    # upper bound on the vocabulary size
)
vec.fit(df.text)

In [None]:
# NOTE(review): this import sits mid-notebook rather than in the import cell,
# and KeyedVectors is not used by any cell visible here.
from gensim.models import KeyedVectors, FastText

# Load the FastText model; `vectorizer` below looks words up through `fmodel.wv`.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

# Preparing Data For training 

We use up to 46,000 TF-IDF features (character n-grams of length 3–5) together with 256-dimensional FastText word embeddings (trained on toxic comments), and concatenate them to form the final feature set used to train the model.

In [None]:
def splitter(text):
    """Split ``text`` on single spaces and return the pieces as a list.

    This is plain ``str.split(' ')``: leading, trailing or repeated spaces
    produce empty-string tokens, and an empty ``text`` yields ``['']``.
    """
    return text.split(' ')


def _mean_embedding(tokens):
    """Return the average FastText vector of ``tokens`` as a 1-D array.

    The normal path is one batched lookup. If that lookup raises ``KeyError``
    (presumably a token the model cannot embed, such as ``''`` on some gensim
    versions -- TODO confirm against the installed version), each token is
    looked up on its own and a zero vector stands in for the failing ones.
    The number of averaged rows is therefore always ``len(tokens)``.
    """
    try:
        return np.mean(fmodel.wv[tokens], axis=0)
    except KeyError:
        zero = np.zeros(fmodel.wv.vector_size, dtype=np.float32)
        rows = []
        for token in tokens:
            try:
                rows.append(fmodel.wv[token])
            except KeyError:
                rows.append(zero)
        return np.mean(rows, axis=0)


def vectorizer(text):
    """Build the feature row for a single text.

    The row is the dense TF-IDF vector from the fitted ``vec`` followed by the
    mean FastText embedding of the space-separated tokens of ``text``.

    text - Text piece to featurise.

    Returns a ``(1, n_tfidf_features + embedding_size)`` ``np.float16`` array.
    """
    tokens = splitter(text)

    tfidf_part = vec.transform([text]).toarray()
    embedding_part = _mean_embedding(tokens).reshape(1, -1)

    # float16 halves the memory of the dense row compared with float32.
    return np.concatenate([tfidf_part, embedding_part], axis=-1).astype(np.float16)

In the code cell below we build the feature rows that will be fed to the model.

In [None]:
# One feature row per training text, with a progress bar over the texts.
X_list = [vectorizer(text) for text in tqdm(df.text)]

In [None]:
# Width of one feature row: TF-IDF vocabulary size plus the FastText embedding
# size. The embedding size is read from the loaded model instead of being
# hard-coded as 256, so the width stays correct if a different model is loaded.
EMB_DIM = len(vec.vocabulary_) + fmodel.wv.vector_size
EMB_DIM

In [None]:
# Stack the per-text rows into one 2-D array with EMB_DIM columns.
X_np = np.reshape(np.asarray(X_list), (-1, EMB_DIM))
X_np.shape

In [None]:
# Convert the dense training matrix to CSR sparse format.
X = sparse.csr_matrix(X_np)
# Drop the dense copy. NOTE(review): X_list still holds the per-text dense rows.
del X_np

In [None]:
# Display the sparse training matrix summary.
X

<h1>Fit Ridge</h1>

In [None]:
# %%time
# model = Ridge(alpha=0.5)
# model.fit(X, df['y'])

In [None]:
# %%time
# l_model = Ridge(alpha=1.)
# l_model.fit(X, df['y'])

In [None]:
# %%time
# d_model = Ridge(alpha=1.5)
# d_model.fit(X, df['y'])

In [None]:
# %%time
# s_model = Ridge(alpha=2.)
# s_model.fit(X, df['y'])

In [None]:
# %%time
# a_model = Ridge(alpha=0.1)
# a_model.fit(X, df['y'])

In [None]:
# %%time
# b_model = Ridge(alpha=3.)
# b_model.fit(X, df['y'])

### Pickle models

In [None]:
# with open('m_05.pkl', 'wb') as f:
#     pkl.dump(model, f)

# with open('m_1.pkl', 'wb') as f:
#     pkl.dump(l_model, f)

# with open('m_15.pkl', 'wb') as f:
#     pkl.dump(d_model, f)

# with open('m_2.pkl', 'wb') as f:
#     pkl.dump(s_model, f)
    
# with open('m_01.pkl', 'wb') as f:
#     pkl.dump(a_model, f)
    
# with open('m_3.pkl', 'wb') as f:
#     pkl.dump(b_model, f)

In [None]:
def _load_pickled_model(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Names and file suffixes follow the commented-out training/pickling cells
# above; the alpha values below are taken from those cells and are presumably
# what the stored models were trained with.
model = _load_pickled_model('../input/ridge-models/m_05.pkl')    # alpha=0.5

l_model = _load_pickled_model('../input/ridge-models/m_1.pkl')   # alpha=1.

d_model = _load_pickled_model('../input/ridge-models/m_15.pkl')  # alpha=1.5

s_model = _load_pickled_model('../input/ridge-models/m_2.pkl')   # alpha=2.

a_model = _load_pickled_model('../input/ridge-models/m_01.pkl')  # alpha=0.1

b_model = _load_pickled_model('../input/ridge-models/m_3.pkl')   # alpha=3.

# Prepare validation data

In [None]:
# Validation pairs; later cells read the `less_toxic` and `more_toxic` columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Preview the first validation rows.
df_val.head()

<h2>Text cleaning</h2>

In [None]:
# Register `progress_apply` on pandas objects, then clean both comment columns
# in place with a progress bar for each.
tqdm.pandas()
for column in ('less_toxic', 'more_toxic'):
    df_val[column] = df_val[column].progress_apply(text_cleaning)

In [None]:
def _featurize_texts(texts):
    """Vectorise every entry of ``texts`` and return the rows as a CSR matrix.

    Each text is passed through ``vectorizer`` (with a tqdm progress bar), the
    resulting rows are stacked into a dense array with ``EMB_DIM`` columns, and
    that array is converted to sparse. The dense intermediates are local to
    this function and are released when it returns.
    """
    rows = [vectorizer(text) for text in tqdm(texts)]
    dense = np.array(rows).reshape(-1, EMB_DIM)
    return sparse.csr_matrix(dense)


# Both sides of every validation pair go through the same pipeline.
X_less_toxic = _featurize_texts(df_val.less_toxic)
X_more_toxic = _featurize_texts(df_val.more_toxic)

In [None]:
# Score both sides of every validation pair with `model`.
p1, p2 = model.predict(X_less_toxic), model.predict(X_more_toxic)

In [None]:
# Validation Accuracy: share of pairs where the less-toxic comment received a
# strictly lower score (ties count as misses).
np.mean(p1 < p2)

In [None]:
# Score both sides of every validation pair with `l_model`.
p1l, p2l = l_model.predict(X_less_toxic), l_model.predict(X_more_toxic)
# Validation Accuracy (ties count as misses)
np.mean(p1l < p2l)

In [None]:
# Score both sides of every validation pair with `d_model`.
p1d, p2d = d_model.predict(X_less_toxic), d_model.predict(X_more_toxic)
# Validation Accuracy (ties count as misses)
np.mean(p1d < p2d)

In [None]:
# Score both sides of every validation pair with `s_model`.
p1s, p2s = s_model.predict(X_less_toxic), s_model.predict(X_more_toxic)
# Validation Accuracy (ties count as misses)
np.mean(p1s < p2s)

In [None]:
# Score both sides of every validation pair with `a_model`.
p1a, p2a = a_model.predict(X_less_toxic), a_model.predict(X_more_toxic)
# Validation Accuracy (ties count as misses)
np.mean(p1a < p2a)

In [None]:
# Score both sides of every validation pair with `b_model`.
p1b, p2b = b_model.predict(X_less_toxic), b_model.predict(X_more_toxic)
# Validation Accuracy (ties count as misses)
np.mean(p1b < p2b)

In [None]:
# Ensemble score = unweighted sum of the l/d/s/b model predictions
# (`model` and `a_model` are not part of the sum).
comb1 = p1l + p1d + p1s + p1b
comb2 = p2l + p2d + p2s + p2b

# Validation accuracy of the summed scores (ties count as misses).
np.mean(comb1 < comb2)

### Intersection investigation

In [None]:
# wrong = set(df_val[p1 > p2].index)
# wrong_l = set(df_val[p1l > p2l].index)
# wrong_d = set(df_val[p1d > p2d].index)
# wrong_s = set(df_val[p1s > p2s].index)

In [None]:
# len(wrong), len(wrong_l), len(wrong_d), len(wrong_s)

In [None]:
# intersection_table = pd.DataFrame(index=["model", "model_l", "model_d", "model_s"],
#                                   columns=["model", "model_l", "model_d", "model_s"])

# intersection_table["model"]["model"] = len(wrong.intersection(wrong))
# intersection_table["model"]["model_l"] = len(wrong.intersection(wrong_l))
# intersection_table["model"]["model_d"] = len(wrong.intersection(wrong_d))
# intersection_table["model"]["model_s"] = len(wrong.intersection(wrong_s))

# intersection_table["model_l"]["model"] = len(wrong_l.intersection(wrong))
# intersection_table["model_l"]["model_l"] = len(wrong_l.intersection(wrong_l))
# intersection_table["model_l"]["model_d"] = len(wrong_l.intersection(wrong_d))
# intersection_table["model_l"]["model_s"] = len(wrong_l.intersection(wrong_s))

# intersection_table["model_d"]["model"] = len(wrong_d.intersection(wrong))
# intersection_table["model_d"]["model_l"] = len(wrong_d.intersection(wrong_l))
# intersection_table["model_d"]["model_d"] = len(wrong_d.intersection(wrong_d))
# intersection_table["model_d"]["model_s"] = len(wrong_d.intersection(wrong_s))

# intersection_table["model_s"]["model"] = len(wrong_s.intersection(wrong))
# intersection_table["model_s"]["model_l"] = len(wrong_s.intersection(wrong_l))
# intersection_table["model_s"]["model_d"] = len(wrong_s.intersection(wrong_d))
# intersection_table["model_s"]["model_s"] = len(wrong_s.intersection(wrong_s))

# intersection_table

In [None]:
# ! pip install matplotlib-venn

In [None]:
# import matplotlib.pyplot as plt
# from matplotlib_venn import venn3

# plt.figure(figsize=(20, 10))
# venn3([wrong, wrong_d, wrong_s], ("model", "model_d", "model_s"))

# Prepare submission data 

In [None]:
# Comments to score; later cells read `text` and write `comment_id` + `score`.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

<h2>Text cleaning</h2>

In [None]:
# Register `progress_apply` on pandas objects (repeated so this cell works on its own).
tqdm.pandas()
# Overwrite the raw text with its cleaned form, same cleaning as the validation data.
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)

Preparing Submission data

In [None]:
# One feature row per submission comment (no progress bar on this loop).
X_sub_temp = [vectorizer(comment) for comment in df_sub.text]

# Stack to a dense array with EMB_DIM columns, convert to CSR, and drop the
# dense copy.
X_sub_temp = np.reshape(np.array(X_sub_temp), (-1, EMB_DIM))
X_test = sparse.csr_matrix(X_sub_temp)

del X_sub_temp

In [None]:
# Predictions from `model`. NOTE(review): `pt` is not part of the ensemble sum
# written to df_sub['score'] below.
pt = model.predict(X_test)

In [None]:
# Submission predictions from the four models that make up the ensemble.
ptl, ptd, pts, ptb = (
    member.predict(X_test)
    for member in (l_model, d_model, s_model, b_model)
)

Ensembling the Ridge Regression models


In [None]:
# Final score = unweighted sum of the four model predictions (same combination
# as comb1/comb2 on the validation data).
df_sub['score'] = ptl + ptd + pts + ptb

In [None]:
# Number of non-null scores.
df_sub['score'].count()

In [None]:
# Number of distinct scores; compare with the count above to see how many ties exist.
df_sub['score'].nunique()

<h2>Prepare submission file</h2>

In [None]:
# Write only the id and score columns, without the DataFrame index.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)