# In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 25 11:37:52 2021

@https://www.kaggle.com/toru59er/0-86-tfidf-ridge-simple-baseline?scriptVersionId=82701355
"""

import pandas as pd
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

import re 
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.linear_model import Ridge

# Load the labelled training comments.
# NOTE(review): the code below expects a 'comment_text' column plus one
# column per key of cat_mtpl -- confirm against this dataset's schema.
df_train = pd.read_csv("../input/train-csv-for-use/train.csv")

# Weight applied to each label column; the weighted labels are summed into a
# single score that measures how toxic a comment is.
cat_mtpl = {'obscene': 18, 'toxic': 41, 'threat': 121, 'insult': 83, 'severe_toxic': 129, 'identity_hate': 157}

for category, weight in cat_mtpl.items():
    df_train[category] = df_train[category] * weight

# Sum exactly the weighted label columns. Selecting them by name (instead of
# the label slice 'toxic':'identity_hate') keeps the score correct even if
# the CSV orders its columns differently or has extra columns in between.
df_train['score'] = df_train[list(cat_mtpl)].sum(axis=1)

# 'y' is the regression target used by the rest of the script.
df_train['y'] = df_train['score']

df_train = df_train.rename(columns={'comment_text': 'text'})

# Patterns are compiled once at import time because text_cleaning is applied
# to every row of several data frames.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    """Clean a piece of text into a basic form for NLP.

    The steps run in this order:

    1. Remove embedded URL links (``http://``, ``https://`` and ``www.``).
    2. Remove HTML tags (parsed with BeautifulSoup using the lxml parser).
    3. Delete characters in the emoji ranges listed in ``_EMOJI_PATTERN``.
    4. Replace every character that is not a letter a-z/A-Z or a digit
       with a space.
    5. Collapse runs of spaces and strip leading/trailing whitespace.

    text - Text piece to be cleaned. Anything that is not a ``str`` (for
           example ``None`` or the float NaN that pandas uses for an empty
           cell) is treated as an empty comment.

    Returns the cleaned string; ``''`` for empty or non-string input.
    """
    # Missing cells would otherwise raise TypeError inside re.sub.
    if not isinstance(text, str) or not text:
        return ''

    text = _URL_PATTERN.sub('', text)
    text = BeautifulSoup(text, 'lxml').get_text()
    # Emojis are deleted outright (not replaced by a space), so the words on
    # either side of an emoji are joined together.
    text = _EMOJI_PATTERN.sub('', text)
    text = _NON_ALNUM_PATTERN.sub(' ', text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text)
    return text.strip()

# Register DataFrame/Series.progress_apply so cleaning shows a progress bar.
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)

df = df_train.copy()

##Undersampling
# Keep every comment with a positive score and draw at most the same number
# of zero-score comments, so both groups are balanced.
min_len = (df['y'] >= 0.1).sum()
df_y0 = df[df['y'] == 0]
# Cap the sample size at the pool size: sample() raises ValueError when asked
# for more rows than are available (sampling is without replacement).
df_y0_undersample = df_y0.sample(n=min(min_len, len(df_y0)), random_state=201)
df = pd.concat([df[df['y'] >= 0.1], df_y0_undersample])

## TF-IDF features
# Character n-grams (analyzer 'char_wb') of length 3 to 5, fitted on the
# cleaned, undersampled training text.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
X = vec.fit_transform(df['text'])

## Ridge regression on the TF-IDF matrix
# fit() returns the estimator itself, so `model` is the fitted regressor.
model = Ridge(alpha=0.5).fit(X, df['y'])

##Prepare validation data
# Each validation row holds a pair of comments in the 'less_toxic' and
# 'more_toxic' columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# progress_apply is already registered by the earlier tqdm.pandas() call.
df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)

# Reuse the vectorizer fitted on the training text (transform only, no refit).
X_less_toxic = vec.transform(df_val['less_toxic'])
X_more_toxic = vec.transform(df_val['more_toxic'])

p1 = model.predict(X_less_toxic)
p2 = model.predict(X_more_toxic)

# Validation accuracy: fraction of pairs where the 'more_toxic' comment gets
# a strictly higher predicted score (ties count as wrong). Printed because a
# bare expression is silently discarded when this runs as a script.
val_accuracy = (p1 < p2).mean()
print("Validation accuracy: {:.4f}".format(val_accuracy))

##Prepare submission data
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

# progress_apply is already registered by the earlier tqdm.pandas() call.
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)

##Prediction
X_test = vec.transform(df_sub['text'])
p3 = model.predict(X_test)
df_sub['score'] = p3

# Comments that share an identical score cannot be ranked against each other.
# Report how many scores duplicate another one (the original note at this
# point mentioned 9 such comments -- presumably from the author's own run).
n_tied_scores = df_sub['score'].count() - df_sub['score'].nunique()
print("Scored comments: {}, tied scores: {}".format(df_sub['score'].count(), n_tied_scores))

##Prepare submission file
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)