# Import Dependencies

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
import scipy
# Show up to 300 characters per cell so long comment texts are not truncated
# when DataFrames are displayed.
pd.set_option("display.max_colwidth", 300)

# Read the Jigsaw Toxic Comment Classification and Toxic Severity Rating Datasets

In [None]:
# Load the three input files:
#   df      - labelled training comments (toxic comment classification challenge)
#   df_val  - validation pairs with `less_toxic` / `more_toxic` columns
#   df_sub  - comments to score for the submission
df = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/train.csv"
)
df_val = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/validation_data.csv"
)
df_sub = pd.read_csv(
    "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
)

# Report the (rows, columns) shape of each frame, one per line.
print(df.shape, df_val.shape, df_sub.shape, sep="\n")

In [None]:
# df_temp = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

# # Scale each label column: severe_toxic, identity_hate and threat x1.5; toxic x1; obscene x0.16; insult x0.1
# df_temp['severe_toxic'] = df_temp.severe_toxic * 1.5
# df_temp['insult'] = df_temp.insult * 0.1
# df_temp['identity_hate'] = df_temp.identity_hate * 1.5
# df_temp['toxic'] = df_temp.toxic * 1
# df_temp['threat'] = df_temp.threat * 1.5
# df_temp['obscene'] = df_temp.obscene * 0.16

# df_temp['y'] = df_temp[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].mean(axis=1)

# print("severe_toxic",np.mean(df_temp[df_temp['severe_toxic']==1.5]['y']))
# print("toxic",np.mean(df_temp[df_temp['toxic']==1]['y']))
# print("insult",np.mean(df_temp[df_temp['insult']==0.1]['y']))
# print("identity_hate",np.mean(df_temp[df_temp['identity_hate']==1.5]['y']))
# print("threat",np.mean(df_temp[df_temp['threat']==1.5]['y']))
# print("obscene",np.mean(df_temp[df_temp['obscene']==0.16]['y']))

In [None]:
# Scale each binary label column by its weight:
#   severe_toxic, identity_hate, threat -> x1.5
#   toxic                               -> x1
#   obscene                             -> x0.16
#   insult                              -> x0.1
# NOTE: this overwrites the label columns, so re-running the cell would apply
# the weights a second time.
df = df.assign(
    toxic=df['toxic'] * 1,
    severe_toxic=df['severe_toxic'] * 1.5,
    obscene=df['obscene'] * 0.16,
    threat=df['threat'] * 1.5,
    insult=df['insult'] * 0.1,
    identity_hate=df['identity_hate'] * 1.5,
)

# Regression target: row-wise mean of the six weighted label columns.
df['y'] = df[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].mean(axis=1)

In [None]:
# For every label, print the mean target `y` over the rows that carry that
# label. A row carries a label when its column equals the label's weight,
# i.e. the original flag was 1.
for label, weight in [
    ("severe_toxic", 1.5),
    ("toxic", 1),
    ("insult", 0.1),
    ("identity_hate", 1.5),
    ("threat", 1.5),
    ("obscene", 0.16),
]:
    flagged = df[df[label] == weight]
    print(label, np.mean(flagged['y']))

In [None]:
# Keep only the comment text (renamed to `text`) and the target `y`;
# every other column is dropped.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]

In [None]:
# Reduce rows with no toxicity, as our key focus area is the comments with
# toxicity: keep every toxic row (y > 0) plus a random sample of non-toxic
# rows (y == 0) that is 1.5x the number of toxic rows, then shuffle.
toxic_rows = df[df.y > 0]
clean_rows = df[df.y == 0]

# Cap the sample at the number of available non-toxic rows; asking
# DataFrame.sample for more rows than exist (without replacement) raises.
n_clean = min(len(clean_rows), int(len(toxic_rows) * 1.5))

# Fixed seeds make the sampled subset and the shuffle order reproducible, so
# validation scores can be compared between runs.
df = pd.concat(
    [toxic_rows, clean_rows.sample(n=n_clean, random_state=42)],
    axis=0,
).sample(frac=1, random_state=42)

print(df.shape)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the target.
df.y.describe()

# Modeling

In [None]:
# Text-regression pipeline:
#   "vect" - TF-IDF over character n-grams of length 3 to 5, built within word
#            boundaries ('char_wb'); terms must occur in at least 3 documents
#            and in at most 50% of them.
#   "clf"  - Ridge regression with default settings.
pipeline = Pipeline(
    steps=[
        (
            "vect",
            TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=(3, 5),
                min_df=3,
                max_df=0.5,
            ),
        ),
        ("clf", Ridge()),
    ]
)

In [None]:
# Fit the vectorizer and the regressor on the training text against the
# target `y`.
pipeline.fit(X=df['text'], y=df['y'])

# Validate the pipeline 

In [None]:
# Score both sides of every validation pair:
#   p1 - predictions for the `less_toxic` comments
#   p2 - predictions for the `more_toxic` comments
p1, p2 = (
    pipeline.predict(df_val[side]) for side in ('less_toxic', 'more_toxic')
)

In [None]:
# Prev Version: 65.82
# Percentage of validation pairs in which the `more_toxic` comment received a
# strictly higher score than the `less_toxic` one (ties count as wrong).
f'Validation Accuracy is {np.round(np.mean(p1 < p2) * 100, 2)}'

Experiment: Assign fixed target scores to the misranked validation comments and add them to the training data

In [None]:
# df_val['p1'] = p1
# df_val['p2'] = p2
# df_val['diff'] = np.abs(p2 - p1)

# df_val['correct'] = (p1 < p2).astype('int')

# df_val_incorrect_preds_l = pd.DataFrame()
# df_val_incorrect_preds_l['text'] = df_val[df_val['correct']==0]['less_toxic']
# df_val_incorrect_preds_l['y'] = 0.2

# df_val_incorrect_preds_m = pd.DataFrame()
# df_val_incorrect_preds_m['text'] = df_val[df_val['correct']==0]['more_toxic']
# df_val_incorrect_preds_m['y'] = 0.7

# df_v2 = pd.concat([df , df_val_incorrect_preds_l, df_val_incorrect_preds_m])

# print(df_v2.shape)

# Train the pipeline with v2 data
# pipeline.fit(df['text'], df['y'])

# p1 = pipeline.predict(df_val['less_toxic'])
# p2 = pipeline.predict(df_val['more_toxic'])

# f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}'

# Predict on test data 

In [None]:
# Score the submission comments with the fitted pipeline and attach the raw
# predictions as the `score` column.
sub_preds = pipeline.predict(df_sub['text'])
df_sub = df_sub.assign(score=sub_preds)

## Correct the rank ordering

In [None]:
# Rank the predictions
#
# Import the stats subpackage explicitly. On older SciPy releases a bare
# `import scipy` does not load `scipy.stats`; the attribute is only present
# if some other library happened to import it first.
from scipy.stats import rankdata

# Replace the raw scores with their ordinal ranks (1..N). With
# method='ordinal' tied scores receive distinct ranks, so every comment ends
# up with a unique score.
df_sub['score'] = rankdata(df_sub['score'], method='ordinal')

# Sanity check: number of distinct ranks in the final score column.
print(df_sub['score'].rank().nunique())

In [None]:
# Write the comment ids and their rank scores to the submission file,
# without the DataFrame index.
submission_columns = ['comment_id', 'score']
df_sub.loc[:, submission_columns].to_csv("submission.csv", index=False)