<h2>【0.860】TFIDF_Ridge_simple_baseline</h2>

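Training data comes from the [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge); the validation pairs and the comments to score come from the `jigsaw-toxic-severity-rating` dataset.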


# Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Notebook-wide settings: silence every warning and raise pandas' display limits
# (column text width and number of columns shown).
warnings.filterwarnings("ignore")
pd.set_option("display.max_colwidth", 300)
pd.set_option("display.max_columns", 100)

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge

# Prepare train data

In [None]:
# Labelled training comments from the Toxic Comment Classification Challenge.
df_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
# Disabled alternative data sources, kept for reference:
# df_test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
# df_test_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv").replace(-1,0)
# NOTE: comments_to_score.csv is read in the "Prepare submission data" section,
# right before it is first used, so it is not loaded a second time here.

In [None]:
# Peek at the first two training rows.
df_train.iloc[:2]

In [None]:
# Build one severity target per comment: each label column is multiplied by its
# weight from `cat_mtpl` and the six weighted columns are summed.
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Weight a *copy* of the label columns instead of overwriting them in df_train:
# the cell can then be re-run without compounding the weights, and the original
# labels stay intact. Column order is kept so the row sums are unchanged.
label_cols = df_train.loc[:, 'toxic':'identity_hate']
label_weights = [cat_mtpl[col] for col in label_cols.columns]  # KeyError on an unweighted column
weighted_labels = label_cols.mul(label_weights, axis='columns')

df_train['score'] = weighted_labels.sum(axis=1)
df_train['y'] = df_train['score']

# Class balancing is done in the "Undersampling" section, after text cleaning;
# the unused pre-cleaning undersample that used to be built here was dropped.
df_train[df_train['y'] > 0].head(2)

In [None]:
# Use the same column name ('text') as the submission data.
df_train = df_train.rename({'comment_text': 'text'}, axis='columns')

<h3>Text Cleaning</h3>

In [None]:
def text_cleaning(text):
    '''
    Reduce a raw comment to ASCII letters and digits separated by single spaces.

    Steps, in the order they are applied:
    1. Remove URLs (http://..., https://... and www. ... forms).
    2. Parse with BeautifulSoup (lxml parser) and keep only the text, dropping HTML tags.
    3. Delete characters that fall in the emoji/pictograph ranges listed below.
    4. Replace every character that is not a-z, A-Z or a digit (\\d) with a space.
    5. Collapse runs of spaces and strip leading/trailing spaces.

    text - Text piece to be cleaned.
    Returns the cleaned string.
    '''
    # 1. URLs
    without_urls = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # 2. HTML tags
    plain = BeautifulSoup(without_urls, 'lxml').get_text()

    # 3. Emojis: one character class assembled from the code-point ranges
    emoji_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    emoji_class = "[" + "".join(emoji_ranges) + "]+"
    without_emoji = re.sub(emoji_class, r'', plain, flags=re.UNICODE)

    # 4. Special characters -> spaces
    alnum_only = re.sub(r"[^a-zA-Z\d]", " ", without_emoji)

    # 5. Whitespace normalisation
    single_spaced = re.sub(' +', ' ', alnum_only)
    return single_spaced.strip()

In [None]:
# Register tqdm's progress_apply on pandas objects, then clean every training comment.
tqdm.pandas()
df_train = df_train.assign(text=df_train['text'].progress_apply(text_cleaning))

In [None]:
# Work on an independent (deep) copy so df_train keeps the full cleaned data.
df = df_train.copy(deep=True)

In [None]:
# Absolute frequency of each distinct target value.
df['y'].value_counts(normalize=False)

# Undersampling

In [None]:
# Relative frequency of each distinct target value (before balancing).
df.loc[:, 'y'].value_counts(normalize=True)

In [None]:
# Balance the data: keep every comment with y >= 0.1 and draw an equally sized,
# seeded random sample of the y == 0 comments.
is_toxic = df['y'] >= 0.1
min_len = is_toxic.sum()
df_y0_undersample = df.loc[df['y'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df.loc[is_toxic], df_y0_undersample])
df['y'].value_counts()

# TF-IDF

In [None]:
# Character n-gram TF-IDF restricted to word boundaries ('char_wb').
# Earlier variants used ngram_range (3,5) and (3,6) without a max_features cap.
vec = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 5),
    min_df=3,
    max_df=0.5,
    max_features=46000,
)
X = vec.fit_transform(df['text'])
X

<h1>Fit Ridge</h1>

In [None]:
# Ridge regression of the severity target on the TF-IDF features.
# (alpha=0.485 was an earlier setting.)
model = Ridge(alpha=0.5)
model.fit(X=X, y=df['y'])

# Prepare validation data

In [None]:
# Validation pairs from the severity-rating dataset.
df_val = pd.read_csv(filepath_or_buffer="../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# First five validation rows.
df_val.head(n=5)

<h2>Text cleaning</h2>

In [None]:
# tqdm.pandas()
# df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
# df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)

In [None]:
# X_less_toxic = vec.transform(df_val['less_toxic'])
# X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
# p1 = model.predict(X_less_toxic)
# p2 = model.predict(X_more_toxic)

In [None]:
# Validation accuracy: share of pairs in which the 'more_toxic' comment gets a
# strictly higher predicted score than its 'less_toxic' counterpart.
# This cell is self-contained: it cleans, vectorises and scores both columns
# without modifying df_val. Plain .apply is used so it does not depend on
# tqdm.pandas() having been called.
p1 = model.predict(vec.transform(df_val['less_toxic'].apply(text_cleaning)))
p2 = model.predict(vec.transform(df_val['more_toxic'].apply(text_cleaning)))
(p1 < p2).mean()

# Prepare submission data 

In [None]:
# Comments that must receive a severity score.
df_sub = pd.read_csv(filepath_or_buffer="../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


<h2>Text cleaning</h2>

In [None]:
# Apply the same cleaning as for the training text, with a progress bar.
tqdm.pandas()
df_sub = df_sub.assign(text=df_sub['text'].progress_apply(text_cleaning))

<h2>Prediction</h2>

In [None]:
# Vectorise with the already-fitted TF-IDF vocabulary (transform only, no refit),
# then score with the fitted Ridge model.
X_test = vec.transform(df_sub.loc[:, 'text'])
p3 = model.predict(X=X_test)

In [None]:
# Store the predictions next to the comments they belong to.
df_sub = df_sub.assign(score=p3)

In [None]:
# Number of non-missing scores.
df_sub.loc[:, 'score'].count()

In [None]:
# The former self-assignment here had no effect; check instead that every
# comment received a finite score before the submission file is written.
assert np.isfinite(df_sub['score']).all(), "non-finite predicted scores"

In [None]:
# Number of distinct scores. Any gap to the row count means tied comments, which
# cannot be ordered against each other (the author noted 9 such comments in an
# earlier run -- not re-verified here).
df_sub['score'].nunique(dropna=True)

<h2>Prepare submission file</h2>

In [None]:
# Write only the id and score columns, without the index.
df_sub.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)

In [None]:
# Preview the scored comments instead of rendering the whole frame.
df_sub.head()