## 0.816 score with a single TF-IDF + Ridge regression model on __CLEANED__ data

### Some cleaning patterns shown here - 
https://www.kaggle.com/samarthagarwal23/y-patterns-in-nlp-data


#### Built on top of the amazing notebook here : 
- https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768


# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.pipeline import Pipeline
import scipy
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library deprecation/FutureWarning notices.
warnings.filterwarnings("ignore")

# Let pandas display up to 300 characters per cell so long comment text stays readable.
pd.options.display.max_colwidth=300

# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

# Give more weight to severe toxic: the flag is doubled before the labels are summed
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['severe_toxic'] = 2 * df['severe_toxic']

# Target y = row-wise sum of the (weighted) label columns
df['y'] = df[label_cols].sum(axis=1).astype(int)

# Keep only the comment text (renamed to 'text') and the target
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)

In [None]:
# How many comments fall on each value of the summed label
df.y.value_counts()

## Reduce the rows with 0 toxicity 

In [None]:
# Rebalance the training set: keep every toxic row (y > 0) and down-sample the
# non-toxic rows (y == 0) to 1.5x the number of toxic rows, then shuffle.
# A fixed random_state makes both the down-sampling and the shuffle repeatable,
# so the fitted model and its validation score are identical on every
# Restart & Run All.
SAMPLE_SEED = 42
toxic_rows = df[df.y > 0]
non_toxic_rows = df[df.y == 0]

# Never ask for more rows than exist: sample() without replacement would raise.
n_non_toxic = min(int(len(toxic_rows) * 1.5), len(non_toxic_rows))

df = pd.concat([toxic_rows,
                non_toxic_rows.sample(n_non_toxic, random_state=SAMPLE_SEED)],
               axis=0).sample(frac=1, random_state=SAMPLE_SEED)

print(df.shape)

In [None]:
# Label counts again, now on the reduced frame
df.y.value_counts()

In [None]:
def clean(data, col):
    """Normalise punctuation and elongated words in one text column.

    The column is rewritten in place on ``data`` and the same frame is
    returned, so both ``df = clean(df, 'text')`` and a bare
    ``clean(df, 'text')`` leave ``df[col]`` cleaned.

    Steps, applied in this order:
      1. pad every newline with a space on each side;
      2. split ``word<punct>word`` (punct is one of ``/ ! ? .``) into three tokens;
      3. shorten runs of 4+ identical ``* ! ? '`` characters to 3;
      4. put spaces around each run of ``* ! ? '``;
      5. shorten a letter repeated 3+ times at the end of a word to 2;
      6. shorten a letter repeated 4+ times inside a word to 3;
      7. collapse runs of spaces and strip leading/trailing whitespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; modified in place.
    col : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with ``data[col]`` replaced by the cleaned text.
    """
    # Literal replacement: no regex needed to pad newlines.
    text = data[col].str.replace('\n', ' \n ', regex=False)

    # Every pattern below is a regular expression. pandas >= 2.0 treats the
    # pattern of Series.str.replace as a literal string unless regex=True is
    # passed, which would silently turn all of these substitutions into no-ops.
    substitutions = [
        # Clean some punctuations: "word.word" -> "word . word"
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Replace characters repeated more than 3 times with a run of 3
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        # Add space around repeating characters
        (r'([*!?\']+)', r' \1 '),
        # Letters repeated at the end of a word: keep 2
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        # Letters repeated inside a word: keep 3
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        # Collapse the extra spaces introduced above
        (r'[ ]{2,}', ' '),
    ]
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)

    data[col] = text.str.strip()

    return data

# Create Sklearn Pipeline with 
## TFIDF - Take 'char_wb' as analyzer to capture subwords well
## Ridge - Ridge is linear regression with L2 regularization, which helps reduce overfitting

In [None]:
# Character n-gram TF-IDF (3 to 5 characters, word-boundary aware, case preserved)
# with min_df=3 and max_df=0.5 as document-frequency cut-offs.
tfidf_step = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    lowercase=False,
    min_df=3,
    max_df=0.5,
)

# Ridge regressor with default settings on top of the TF-IDF features
pipeline = Pipeline(steps=[("vect", tfidf_step), ("clf", Ridge())])

In [None]:
# Train the pipeline: clean the text first, then fit vectoriser + regressor together
df = clean(df, 'text')
train_text, train_target = df['text'], df['y']
pipeline.fit(train_text, train_target)

# Validate the pipeline 

In [None]:
# Validation pairs; the 'less_toxic' and 'more_toxic' columns are used below
val_path = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
df_val = pd.read_csv(val_path)

In [None]:
# Apply the same cleaning used for training to both sides of every pair
for side in ('less_toxic', 'more_toxic'):
    df_val = clean(df_val, side)

# Score each side of the pair with the fitted pipeline
p1 = pipeline.predict(df_val['less_toxic'])
p2 = pipeline.predict(df_val['more_toxic'])

In [None]:
# Percentage of pairs where the 'less_toxic' side gets the strictly lower score
pair_accuracy = np.round((p1 < p2).mean() * 100, 2)
f'Validation Accuracy is {pair_accuracy}'

# Analyze bad predictions

 
### Incorrect predictions with similar scores
### Incorrect predictions with different scores

In [None]:
# Store both scores, their absolute gap, and a 0/1 flag for correctly ordered pairs
is_correct = p1 < p2
df_val['p1'], df_val['p2'] = p1, p2
df_val['diff'] = np.abs(p2 - p1)
df_val['correct'] = is_correct.astype('int')


In [None]:
# Distribution of the score gap among the mis-ordered pairs
df_val.loc[df_val.correct == 0, 'diff'].hist(bins=100)

In [None]:
# vect_an = pipeline['vect'].build_analyzer()
# vocab = pipeline['vect'].vocabulary_
# [v for v in vect_an(df_val.more_toxic[5247]) if (v not in vocab) & (v.strip() not in pipeline['vect'].stop_words_)]

In [None]:

### Incorrect predictions with similar scores
# The 20 mis-ordered pairs whose two scores are closest together

df_val.loc[df_val.correct == 0].sort_values(by='diff').head(20)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dissimilar scores
# The 20 mis-ordered pairs whose two scores are furthest apart

df_val.loc[df_val.correct == 0].sort_values(by='diff', ascending=False).head(20)

# Predict on test data 

In [None]:
# Load the comments to score and clean them the same way as the training text
df_sub = clean(
    pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv"),
    'text',
)


In [None]:
# Predict using pipeline and attach the raw scores as a 'score' column
sub_preds = pipeline.predict(df_sub.text)
df_sub = df_sub.assign(score=sub_preds)

## Correct the rank ordering

In [None]:
# Cases with duplicate scores: total scores minus distinct scores
scores = df_sub['score']
scores.count() - scores.nunique()

In [None]:
# First ten rows of the score frequency table
df_sub['score'].value_counts().reset_index().head(10)

In [None]:
# Number of distinct ranks when pandas ranks the raw scores with its default settings
df_sub.score.rank().nunique()

In [None]:
# Rank the predictions
# Import the function from its submodule explicitly: a bare `import scipy`
# does not guarantee that `scipy.stats` has been loaded.
from scipy.stats import rankdata

# method='ordinal' gives every value a distinct rank, breaking ties by order of appearance
df_sub['score'] = rankdata(df_sub['score'], method='ordinal')

# Number of distinct ranks after the conversion
print(df_sub['score'].rank().nunique())

In [None]:
# Write the comment ids and their scores to submission.csv, without the index column
submission = df_sub[['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)