## 0.8+ score by simple TF-Idf and Ridge regression

## Analysis of bad predictions



## Built on top of the amazing notebook here : 
https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768


# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
import scipy
pd.options.display.max_colwidth=300

# Read Jigsaw Toxic Comment Classification DataBase

In [2]:
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

(159571, 8)


In [3]:
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)

(30108, 3)


In [4]:
# Give 2X Weight To Toxic Comments, followed by 3X Weight to Severe Toxic Comments
df['severe_toxic'] = df.severe_toxic * 3
df['toxic'] = df.toxic * 2
df['y'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) ).astype(int)

In [5]:
print(np.mean(df[df['severe_toxic']==3]['y']))
print(np.mean(df[df['toxic']==2]['y']))

7.077115987460815
3.425787890676082


In [6]:
# Remove All Columns Leaving Comment Tex and Y
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

In [7]:
# Reduce Rows with No Toxicity as our key focus area is the reviews with toxicity
df = pd.concat([df[df.y>0] , 
                df[df.y==0].sample(int(len(df[df.y>0])*1.5)) ], axis=0).sample(frac=1)

print(df.shape)

(40562, 2)


In [8]:
df['y'].value_counts()

0    24337
2     5883
4     4003
3     3242
7     1008
5      793
1      694
8      329
6      242
9       31
Name: y, dtype: int64

# Modeling

In [9]:
pipeline = Pipeline(
    [
        ("vect", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),
        ("clf", Ridge()),
    ]
)

In [10]:
# Train the pipeline
pipeline.fit(df['text'], df['y'])

Pipeline(steps=[('vect',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3,
                                 ngram_range=(3, 5))),
                ('clf', Ridge())])

# Validate the pipeline 

In [11]:
p1 = pipeline.predict(df_val['less_toxic'])
p2 = pipeline.predict(df_val['more_toxic'])

In [12]:
f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}'

'Validation Accuracy is 68.46'

In [13]:
df_val['p1'] = p1
df_val['p2'] = p2
df_val['diff'] = np.abs(p2 - p1)

df_val['correct'] = (p1 < p2).astype('int')

Experiment : Assing More Toxic Comments Weight to 

In [14]:
df_val_incorrect_preds_l = pd.DataFrame()
df_val_incorrect_preds_l['text'] = df_val[df_val['correct']==0]['less_toxic']
df_val_incorrect_preds_l['y'] = 3.4

df_val_incorrect_preds_m = pd.DataFrame()
df_val_incorrect_preds_m['text'] = df_val[df_val['correct']==0]['more_toxic']
df_val_incorrect_preds_m['y'] = 7.07

In [15]:
df_v2 = pd.concat([df , df_val_incorrect_preds_l, df_val_incorrect_preds_m])

print(df_v2.shape)

(59556, 2)


In [16]:
# Train the pipeline with v2 data
pipeline.fit(df['text'], df['y'])

Pipeline(steps=[('vect',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.5, min_df=3,
                                 ngram_range=(3, 5))),
                ('clf', Ridge())])

In [17]:
p1 = pipeline.predict(df_val['less_toxic'])
p2 = pipeline.predict(df_val['more_toxic'])

In [18]:
f'Validation Accuracy is { np.round((p1 < p2).mean() * 100,2)}'

'Validation Accuracy is 68.46'

# Predict on test data 

In [19]:
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


In [20]:
# Predict using pipeline

sub_preds = pipeline.predict(df_sub['text'])

df_sub['score'] = sub_preds

## Correct the rank ordering

In [21]:
# Rank the predictions 

df_sub['score']  = scipy.stats.rankdata(df_sub['score'], method='ordinal')

print(df_sub['score'].rank().nunique())

7537


In [22]:
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)