In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK helpers: English stop words, a WordNet lemmatizer and a Porter stemmer.
# None of these three objects is referenced by the other cells visible here.
import nltk
from nltk.corpus import stopwords
# NOTE(review): this rebinds `stopwords`, which the previous line imported as
# the corpus reader, to the value returned by `.words('english')`.
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.porter import PorterStemmer
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

import re 
import scipy
from scipy import sparse
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Let pandas show up to 300 characters per cell and up to 100 columns, so
# comment text is readable in the frame previews below.
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
import xgboost

In [None]:
# Training comments with their per-label toxicity columns.
train = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
)

# Comments that must receive a severity score for the submission file.
comm_score = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Preview the first rows of the comments that have to be scored.
comm_score.head()

In [None]:
# (rows, columns) of the comments-to-score frame.
comm_score.shape

In [None]:

# Severity weight of each toxicity label. A comment's target score is the
# weighted sum of the labels it carries.
label_score = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
               'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Fail loudly (as direct column access would) if a weighted label is absent.
missing_labels = [name for name in label_score if name not in train.columns]
if missing_labels:
    raise KeyError(f"label columns missing from train: {missing_labels}")

# Pick the label columns by name, keeping the frame's own column order, rather
# than relying on a positional 'toxic':'identity_hate' slice.
label_cols = [col for col in train.columns if col in label_score]
label_weights = pd.Series(label_score).reindex(label_cols)

# The label columns are left untouched, so re-running this cell gives the same
# score. Multiplying them in place would apply the weights again on every run.
train['score'] = train[label_cols].mul(label_weights, axis='columns').sum(axis=1)
train['y'] = train['score']

# Balanced preview set: every comment with a positive score plus an equally
# sized, seeded random sample of comments whose score is zero.
min_len = (train['y'] > 0).sum()  # number of comments with a positive score
df_non_toxic = train[train['y'] == 0].sample(n=min_len, random_state=201)
train_new = pd.concat([train[train['y'] > 0], df_non_toxic])
train_new.head(2)

In [None]:
# (rows, columns) of the balanced frame built above.
train_new.shape

In [None]:
# How many comments fall on each distinct score value in the balanced frame.
train_new['y'].value_counts()

In [None]:
# Preview the training frame again, now with the `score` and `y` columns.
train.head()

In [None]:
# Shorten the text column's name; the cleaning and vectorising cells below
# read it as `comment`.
train = train.rename({'comment_text': 'comment'}, axis='columns')

In [None]:
# Patterns are compiled once here instead of on every call; the function is
# applied row by row to whole DataFrame columns.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_SPACE_RUN_PATTERN = re.compile(' +')


def text_cleaning(text):
    """Normalise one raw comment into space-separated alphanumeric tokens.

    Steps, in order:
      1. drop web links (``http(s)://...`` and ``www....``);
      2. strip HTML markup with BeautifulSoup (``lxml`` parser);
      3. replace emoji / symbol runs with a space;
      4. replace every character that is not a letter a-z/A-Z or a digit
         with a space;
      5. collapse runs of spaces and trim both ends.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # NOTE(review): links are removed before the markup is parsed, and the link
    # pattern runs up to the next whitespace, so a URL inside an HTML attribute
    # takes the rest of that token with it. Confirm this order is intended.
    text = _URL_PATTERN.sub('', text)

    text = BeautifulSoup(text, 'lxml').get_text()

    # Substitute a space, not the empty string: the last range of the class
    # also covers separators such as U+3000 (ideographic space), and deleting
    # them would fuse the words on either side into one token.
    text = _EMOJI_PATTERN.sub(' ', text)

    text = _NON_ALNUM_PATTERN.sub(' ', text)
    text = _SPACE_RUN_PATTERN.sub(' ', text)
    return text.strip()

In [None]:
# Register tqdm's `progress_apply` on pandas objects, then clean every training
# comment while showing a progress bar.
tqdm.pandas()
cleaned_comments = train['comment'].progress_apply(text_cleaning)
train['comment'] = cleaned_comments

In [None]:
# Work on an independent copy so the resampling below leaves `train` intact.
df = train.copy(deep=True)

In [None]:
# Preview the cleaned working copy.
df.head()

In [None]:
# NOTE(review): same preview as the previous cell; one of the two is redundant.
df.head()

In [None]:
# Number of comments per distinct score value before resampling.
df['y'].value_counts()

In [None]:
# Same distribution expressed as fractions of all rows.
df['y'].value_counts(normalize=True)

In [None]:
# Keep every comment scoring at least 0.1 and pair them with twice as many
# zero-score comments, drawn with a fixed seed.
toxic_mask = df['y'] >= 0.1
min_len = toxic_mask.sum()
df_non_toxic = df.loc[df['y'] == 0].sample(n=min_len * 2, random_state=42)
df = pd.concat([df.loc[toxic_mask], df_non_toxic])
df['y'].value_counts()

In [None]:
# Preview the resampled training frame.
df.head()

In [None]:
# TF-IDF settings: sublinear term frequency, terms must appear in at least 3
# documents and in at most half of them, vocabulary capped at 50 000 terms.
tfidf_params = dict(
    sublinear_tf=True,
    min_df=3,
    max_df=0.5,
    max_features=50000,
)
vec = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary on the cleaned training comments and vectorise them.
X = vec.fit_transform(df['comment'])
X

In [None]:
# Ridge regression on the TF-IDF features against the severity score.
# NOTE(review): the following cell rebinds `model` to an XGBRegressor, so this
# fitted estimator is not the one used for the predictions further down.
model = Ridge(alpha=0.5).fit(X, df['y'])
model

In [None]:
# Gradient-boosted regressor trained on the same TF-IDF features and target.
# This rebinds `model`, replacing the Ridge estimator fitted in the previous
# cell.
#
# Row and column subsampling make training stochastic, so the seed is stated
# explicitly. 0 is XGBoost's documented default seed, so results should be
# unchanged. NOTE(review): confirm against the installed XGBoost version.
model = xgboost.XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
    random_state=0,
)
model.fit(X, df['y'])

In [None]:
# Validation pairs: each row holds a `less_toxic` and a `more_toxic` comment.
val_data = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/validation_data.csv"
)
val_data.head()

In [None]:
# Apply the same cleaning used for training to both sides of each pair.
tqdm.pandas()
for pair_col in ('less_toxic', 'more_toxic'):
    val_data[pair_col] = val_data[pair_col].progress_apply(text_cleaning)

In [None]:
# Vectorise both sides of the validation pairs with the fitted vocabulary.
X_less_toxic, X_more_toxic = (
    vec.transform(val_data[pair_col])
    for pair_col in ('less_toxic', 'more_toxic')
)

In [None]:
# Score both sides of every validation pair with the current `model`.
p_less = model.predict(X_less_toxic)
p_more = model.predict(X_more_toxic)

# Fraction of pairs in which the `less_toxic` comment received the lower
# score. Without this comparison the two prediction arrays are never used and
# the validation step reports nothing.
val_accuracy = (p_less < p_more).mean()
val_accuracy

In [None]:
# Clean the comments to be scored the same way as the training text.
tqdm.pandas()
cleaned_test_text = comm_score['text'].progress_apply(text_cleaning)
comm_score['text'] = cleaned_test_text

In [None]:
# Vectorise the cleaned test comments with the fitted vocabulary and predict a
# severity score for each of them with the current `model`.
X_test = vec.transform(comm_score['text'])
p_test = model.predict(X_test)

In [None]:
# Attach the predicted severity scores to the comments they belong to.
comm_score['score'] = p_test

In [None]:
# Number of non-null predicted scores.
comm_score['score'].count()

In [None]:
# Write only the id and the predicted score, without the index column.
submission = comm_score[['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)

In [None]:
# Preview the scored comments.
comm_score.head()