In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import rankdata

import re 
import scipy
from scipy import sparse
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices raised by the libraries used below).
warnings.filterwarnings("ignore")
# Show up to 300 characters per cell and up to 100 columns when a frame is displayed.
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.svm import SVR

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read the Dataset
Note: Here I have used one additional dataset from Kaggle, 'jigsaw-toxic-comment-classification-challenge'.

In [None]:
# Labelled comments from the classification challenge are the training data;
# the severity-rating file holds the comments that must be scored.
toxic_train_csv = "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
comments_to_score_csv = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"

train = pd.read_csv(toxic_train_csv)
# The challenge's test split is not loaded:
# df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
comm_score = pd.read_csv(comments_to_score_csv)

In [None]:
# Peek at the first two training rows.
train.head(2)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Peek at the first two comments that have to be scored.
comm_score.head(2)

In [None]:
# (rows, columns) of the frame holding the comments to score.
comm_score.shape

Create a score that measures how toxic a comment is

In [None]:

# Weight applied to each binary label when it is folded into one severity score.
label_score = {
    'obscene': 0.16,
    'toxic': 0.32,
    'threat': 1.5,
    'insult': 0.64,
    'severe_toxic': 1.5,
    'identity_hate': 1.5,
}

# Name the label columns explicitly (in the frame's own column order) instead of
# relying on a positional 'toxic':'identity_hate' slice.
label_columns = [col for col in train.columns if col in label_score]
missing_labels = set(label_score) - set(label_columns)
if missing_labels:
    raise KeyError(f"label columns missing from train: {sorted(missing_labels)}")

# Build the weighted labels in a temporary frame rather than overwriting the
# label columns of `train`. Multiplying in place made this cell non-idempotent:
# every re-run multiplied the already-weighted columns again and changed `y`.
weighted_labels = train[label_columns].mul(
    [label_score[col] for col in label_columns], axis='columns'
)

train['score'] = weighted_labels.sum(axis=1)
train['y'] = train['score']

# Balanced preview sample: every toxic comment plus an equal number of
# randomly drawn non-toxic comments.
is_toxic = train['y'] > 0
min_len = is_toxic.sum()  # number of toxic comments
df_non_toxic = train[train['y'] == 0].sample(n=min_len, random_state=201)
train_new = pd.concat([train[is_toxic], df_non_toxic])
train_new.head(2)

In [None]:
# (rows, columns) of the balanced sample built above.
train_new.shape

In [None]:
# How often each severity score occurs in the balanced sample.
train_new['y'].value_counts()

In [None]:
# First ten training rows, now including the `score` and `y` columns.
train.head(10)

Text Cleaning...

In [None]:
# Shorten the text column name to `comment`.
train = train.rename({'comment_text': 'comment'}, axis='columns')

In [None]:
# The patterns are compiled once here instead of on every call, because
# text_cleaning is applied row by row to whole DataFrame columns.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    """Normalise one raw comment into space-separated alphanumeric tokens.

    Steps, in order:
      1. delete web links (``http://``, ``https://`` and ``www.`` prefixes);
      2. strip HTML markup, keeping only the text content;
      3. delete characters matched by the emoji pattern;
      4. replace every character other than ASCII letters and ``\\d`` digits
         with a space;
      5. collapse runs of spaces and trim both ends.

    Parameters
    ----------
    text : str
        Raw comment. ``None`` and NaN (what pandas yields for empty cells) are
        treated as an empty comment; any other non-string value is converted
        with ``str()`` first.

    Returns
    -------
    str
        The cleaned comment, possibly empty.
    """
    # Missing values would otherwise make the regex calls raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub('', text)  # remove website links

    # Remove HTML tags, keeping only the text content.
    text = BeautifulSoup(text, 'lxml').get_text()

    # NOTE: the last range of the emoji pattern (U+24C2..U+1F251) is very wide
    # and also covers non-emoji characters such as CJK ideographs. These are
    # deleted outright, not replaced by a space, so neighbouring words fuse.
    text = _EMOJI_PATTERN.sub('', text)

    text = _NON_ALNUM_PATTERN.sub(' ', text)    # remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # remove extra spaces
    return text.strip()  # drop leading and trailing spaces

In [None]:
# Register tqdm's progress_apply on pandas objects, then clean every training comment.
tqdm.pandas()
cleaned_train_comments = train['comment'].progress_apply(text_cleaning)
train['comment'] = cleaned_train_comments

In [None]:
# Continue with an independent copy of the cleaned training frame.
df = train.copy()

In [None]:
# First five rows of the working copy.
df.head()

In [None]:
# Absolute frequency of each severity score.
df['y'].value_counts()

In [None]:
# Relative frequency (share of rows) of each severity score.
df['y'].value_counts(normalize=True)

In [None]:
# Re-balance the training data: keep every comment scoring at least 0.1 and
# add twice as many randomly drawn comments with a score of exactly 0.
scored_mask = df['y'] >= 0.1
min_len = scored_mask.sum()
df_non_toxic = df.loc[df['y'] == 0].sample(n=2 * min_len, random_state=402)
df = pd.concat([df.loc[scored_mask], df_non_toxic])
df['y'].value_counts()

TF-IDF

In [None]:
# Character n-gram TF-IDF features. An earlier variant used
# ngram_range=(3, 5) and no cap on the number of features.
tfidf_options = dict(
    analyzer='char_wb',
    ngram_range=(2, 5),
    min_df=3,
    max_df=0.5,
    max_features=46000,
)
vec = TfidfVectorizer(**tfidf_options)
X = vec.fit_transform(df['comment'])
X


Ridge Ensemble Modelling 

1st Model

In [None]:
# First ensemble member: ridge regression with alpha = 0.5.
model = Ridge(alpha=0.5).fit(X, df['y'])
model

2nd Model

In [None]:
# Second ensemble member: ridge regression with alpha = 1.
model_1 = Ridge(alpha=1.).fit(X, df['y'])
model_1

3rd Model

In [None]:
# Third ensemble member: ridge regression with alpha = 3.
model_2 = Ridge(alpha=3.).fit(X, df['y'])
model_2

Validation data

In [None]:
# Comment pairs with `less_toxic` and `more_toxic` columns, used for validation.
validation_csv = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
val_data = pd.read_csv(validation_csv)
val_data.head(2)

In [None]:
# Clean both sides of every validation pair with the same routine as the training text.
tqdm.pandas()
for pair_side in ('less_toxic', 'more_toxic'):
    val_data[pair_side] = val_data[pair_side].progress_apply(text_cleaning)

In [None]:
# Vectorise both sides of the pairs with the already-fitted TF-IDF vocabulary.
X_less_toxic, X_more_toxic = (
    vec.transform(val_data[pair_side])
    for pair_side in ('less_toxic', 'more_toxic')
)

In [None]:
# Score both sides of every validation pair with the first model.
p_less, p_more = (
    model.predict(pair_features)
    for pair_features in (X_less_toxic, X_more_toxic)
)

In [None]:
# Validation accuracy: share of pairs where the "more toxic" comment scores strictly higher.
np.mean(p_less < p_more)

In [None]:
# Score both sides of every validation pair with the second model.
p_less_1, p_more_1 = (
    model_1.predict(pair_features)
    for pair_features in (X_less_toxic, X_more_toxic)
)
# Validation accuracy: share of pairs where the "more toxic" comment scores strictly higher.
np.mean(p_less_1 < p_more_1)

In [None]:
# Score both sides of every validation pair with the third model.
p_less_2, p_more_2 = (
    model_2.predict(pair_features)
    for pair_features in (X_less_toxic, X_more_toxic)
)
# Validation accuracy: share of pairs where the "more toxic" comment scores strictly higher.
np.mean(p_less_2 < p_more_2)

In [None]:
# Clean the comments to be scored exactly like the training text.
tqdm.pandas()
cleaned_submission_text = comm_score['text'].progress_apply(text_cleaning)
comm_score['text'] = cleaned_submission_text

In [None]:
# Vectorise the comments to score, then predict with each of the three ridge models.
X_test = vec.transform(comm_score['text'])
p_1, p_2, p_3 = (
    ridge.predict(X_test)
    for ridge in (model, model_1, model_2)
)

In [None]:

# Turn each model's predictions into ordinal ranks and store them per model.
predictions_by_column = {'score1': p_1, "score2": p_2, 'score3': p_3}
for rank_column, predictions in predictions_by_column.items():
    comm_score[rank_column] = rankdata(predictions, method='ordinal')

# Ensemble score: the sum of the three per-model ranks.
comm_score['score'] = comm_score[['score3', 'score1', 'score2']].sum(axis=1)
# comm_score['score']=rankdata(comm_score['score'] , method='ordinal')

In [None]:
# comm_score['score'] = p_1

In [None]:
# comm_score['score'] = (p_1 + p_2 + p_3) / 3.

In [None]:
# Replace the summed ranks by their own ordinal rank, so every comment gets a
# distinct final score (ties are broken by row order).
comm_score["score"] = rankdata( comm_score["score"], method='ordinal')

In [None]:
# Number of non-missing final scores.
comm_score['score'].count()

In [None]:
# Write only the id and final score columns to the submission file.
comm_score.to_csv("submission.csv", columns=['comment_id', 'score'], index=False)

In [None]:
# First three scored comments, including the per-model rank columns.
comm_score.head(3)