In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import re 
from scipy import sparse
import time
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.linear_model import Ridge


# Loading Data 

In [None]:
# Load the training table; later cells read its 'text' and 'y' columns.
df = pd.read_csv('../input/jigsaw-regression-based-data/train_data_version2.csv')
df.head()

<h3>Text Cleaning</h3>

In [None]:
# Patterns used by text_cleaning, compiled once at definition time.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_SPACE_RUN_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Reduce a raw comment to a plain alphanumeric string for NLP.

    Steps, in the order they are applied:
    1. Strip embedded URLs (http(s)://... and www....).
    2. Strip HTML markup, keeping only the text content (BeautifulSoup / lxml).
    3. Strip characters in the emoji code-point ranges.
    4. Replace every character that is not a letter a-z/A-Z or a digit with a space.
    5. Collapse runs of spaces and trim both ends.

    text - Text piece to be cleaned (a str).

    Returns the cleaned string.
    '''
    without_urls = _URL_PATTERN.sub(r'', text)
    plain_text = BeautifulSoup(without_urls, 'lxml').get_text()
    without_emoji = _EMOJI_PATTERN.sub(r'', plain_text)
    alnum_only = _NON_ALNUM_PATTERN.sub(" ", without_emoji)
    single_spaced = _SPACE_RUN_PATTERN.sub(' ', alnum_only)
    return single_spaced.strip()

# TF-IDF

In [None]:
# Drop every row that has a missing value in any column (rebinds df in place of the raw frame).
df = df.dropna(axis = 0)

In [None]:
# Column dtypes and non-null counts after dropping incomplete rows.
df.info()

In [None]:
# Character n-gram TF-IDF (3- to 5-grams within word boundaries), capped at 46000 features.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
    max_features=46000,
)
vec.fit(df['text'])

In [None]:
from gensim.models import KeyedVectors, FastText

# Load the saved gensim FastText model; its word vectors are read through fmodel.wv below.
# NOTE(review): the file name suggests 256-dimensional vectors - confirm against the model.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

# Preparing Data For training 

We combine up to 46,000 character n-gram TF-IDF features (3- to 5-grams, `char_wb`) with the mean of the 256-dimensional FastText word embeddings (trained on toxic comments) of each comment, and concatenate the two to form the final feature matrix used for training the model.

In [None]:
from scipy.sparse import hstack

def splitter(text):
    """Split ``text`` on single space characters and return the pieces as a list.

    Consecutive spaces are not merged, so they produce empty-string tokens,
    and an empty input yields ``['']``.
    """
    return text.split(' ')

def vectorizer(text):
    """Turn one comment into a single dense feature row.

    The row is the TF-IDF vector of ``text`` (from the fitted module-level
    ``vec``) followed by the mean of the FastText vectors (``fmodel.wv``) of
    its space-separated tokens, cast to float16.

    text - the comment string to encode.

    Returns a 2-D array with one row.
    """
    tfidf_row = vec.transform([text]).toarray()
    token_vectors = fmodel.wv[splitter(text)]
    mean_embedding = np.mean(token_vectors, axis=0).reshape(1, -1)
    return np.concatenate([tfidf_row, mean_embedding], axis=-1).astype(np.float16)

In the code cell below we build the feature rows that will be fed to the model.

In [None]:
# One dense feature row per training comment, in the row order of df.
X_list = [vectorizer(comment) for comment in df.text]

In [None]:
!pip install -U sentence-transformers

In [None]:
# Imported here rather than in the first cell because the package is only
# installed by the pip cell directly above.
from sentence_transformers import SentenceTransformer

# Named sentence_encoder (not `model`) so it is not confused with the Ridge
# estimator that is bound to `model` further down.
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Sentences to encode: every training comment, in the row order of df.
sentences = df['text'].values

# Sentences are encoded by calling encode(); the result has one row per sentence.
embeddings = sentence_encoder.encode(sentences.tolist())
print(embeddings.shape)

In [None]:
# Width of one feature row: TF-IDF vocabulary size plus the FastText vector width.
# The embedding width is read from the loaded model instead of being hard-coded,
# so the reshape(-1, EMB_DIM) calls stay correct if a different model is loaded.
EMB_DIM = len(vec.vocabulary_) + fmodel.wv.vector_size

In [None]:
# Stack the per-comment rows into a single (n_rows, EMB_DIM) array.
X_np = np.reshape(np.array(X_list), (-1, EMB_DIM))

In [None]:
from scipy import sparse

# Store the training features as a CSR sparse matrix; this is what the Ridge models are fitted on.
X = sparse.csr_matrix(X_np)
# Drop the dense copy now that the sparse version exists.
del X_np

In [None]:
# Dense copy of the sparse feature matrix, so it can be concatenated with dense arrays.
# NOTE(review): this undoes the memory saving of the CSR conversion - confirm it fits in RAM.
fast_X = X.todense()
fast_X.shape

In [None]:
# Append the sentence embeddings as extra columns to the dense feature matrix.
final = np.hstack((fast_X, embeddings))
final.shape

<h1>Fit Ridge</h1>

In [None]:
%%time
# Ridge regression on the sparse features with alpha=0.5; fit() returns the fitted estimator.
model = Ridge(alpha=0.5).fit(X, df['y'])
model

In [None]:
%%time
# Ridge regression on the sparse features with alpha=1.0; fit() returns the fitted estimator.
l_model = Ridge(alpha=1.).fit(X, df['y'])
l_model

In [None]:
%%time
# Ridge regression on the sparse features with alpha=1.5; fit() returns the fitted estimator.
d_model = Ridge(alpha=1.5).fit(X, df['y'])
d_model

In [None]:
%%time
# Ridge regression on the sparse features with alpha=2.0; fit() returns the fitted estimator.
s_model = Ridge(alpha=2.).fit(X, df['y'])
s_model

# Prepare validation data

In [None]:
# Validation pairs; later cells read the 'less_toxic' and 'more_toxic' text columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Preview the first validation rows.
df_val.head()

<h2>Text cleaning</h2>

In [None]:
# Register progress_apply on pandas objects, then clean both sides of every validation pair.
tqdm.pandas()
for pair_column in ('less_toxic', 'more_toxic'):
    df_val[pair_column] = df_val[pair_column].progress_apply(text_cleaning)

In [None]:
def _sparse_features(comments):
    """Vectorize each comment, stack the rows to (n, EMB_DIM) and return them as CSR."""
    rows = [vectorizer(comment) for comment in comments]
    dense = np.array(rows).reshape(-1, EMB_DIM)
    return sparse.csr_matrix(dense)


# Same feature pipeline as the training data, applied to each side of the pairs.
X_less_toxic = _sparse_features(df_val.less_toxic)
X_more_toxic = _sparse_features(df_val.more_toxic)

In [None]:
# Score both sides of every validation pair with `model`.
p1, p2 = (model.predict(features) for features in (X_less_toxic, X_more_toxic))

In [None]:
# Validation accuracy: fraction of pairs whose 'less_toxic' score is strictly below the 'more_toxic' score.
np.mean(p1 < p2)

In [None]:
# Score both sides of every validation pair with `l_model`.
p1, p2 = (l_model.predict(features) for features in (X_less_toxic, X_more_toxic))
# Validation accuracy: fraction of pairs ranked in the expected order.
np.mean(p1 < p2)

In [None]:
# Score both sides of every validation pair with `d_model`.
p1, p2 = (d_model.predict(features) for features in (X_less_toxic, X_more_toxic))
# Validation accuracy: fraction of pairs ranked in the expected order.
np.mean(p1 < p2)

In [None]:
# Score both sides of every validation pair with `s_model`.
p1, p2 = (s_model.predict(features) for features in (X_less_toxic, X_more_toxic))
# Validation accuracy: fraction of pairs ranked in the expected order.
np.mean(p1 < p2)

# Prepare submission data 

In [None]:
# Comments to score for the submission; later cells read 'text' and 'comment_id'.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

<h2>Text cleaning</h2>

In [None]:
# Register progress_apply on pandas objects, then clean the submission comments
# with the same text_cleaning function used for the validation pairs.
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)

Preparing Submission data

In [None]:
# One dense feature row per comment, stacked to (n_comments, EMB_DIM), then stored as CSR.
X_sub_temp = np.array([vectorizer(comment) for comment in df_sub.text]).reshape(-1, EMB_DIM)
X_test = sparse.csr_matrix(X_sub_temp)

# The dense intermediate is no longer needed once the sparse matrix exists.
del X_sub_temp

## Prediction

In [None]:
# Submission scores from `model`; combined with the other models' scores when the ensemble is built.
p3 = model.predict(X_test)

In [None]:
# Submission scores from the remaining three Ridge models (note the order: l, s, d).
p4, p5, p6 = (ridge.predict(X_test) for ridge in (l_model, s_model, d_model))

Ensembling the Ridge Regression models


In [None]:
# Unweighted average of the four Ridge predictions.
df_sub['score'] = 0.25 * (p3 + p4 + p5 + p6)

In [None]:
# Number of non-null scores.
df_sub['score'].count()

In [None]:
# NOTE(review): self-assignment leaves the scores unchanged; looks like a leftover
# from a removed post-processing step - confirm and drop this cell.
df_sub['score'] = df_sub['score'] 

In [None]:
# Number of distinct score values; compare with the count of scores to spot ties.
df_sub['score'].nunique()

<h2>Prepare submission file</h2>

In [None]:
# Write only the id and score columns, without the DataFrame index.
df_sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)