# 0.81+ score by simple TF-IDF and Ridge regression

## Built as an ensemble of 2 models using data from past Jigsaw competitions (the model for the second competition is currently commented out below)

### Analysis of bad predictions for additional insights


#### Built on top of the amazing notebook here : 
https://www.kaggle.com/julian3833/jigsaw-incredibly-simple-naive-bayes-0-768


# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge,RidgeCV, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, MaxAbsScaler , FunctionTransformer
import scipy
import gc
import re
pd.options.display.max_colwidth=300
pd.options.display.min_rows=300


# Training data 

## Convert the label to SUM of all toxic labels (This might help with maintaining toxicity order of comments)

In [None]:
# Load the Toxic Comment Classification training set.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

# Up-weight two of the labels before summing: severe_toxic x3, threat x2.
df['severe_toxic'] = 3 * df['severe_toxic']
df['threat'] = 2 * df['threat']

# Regression target = (weighted) sum of the six label columns.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['y'] = df[label_cols].sum(axis=1).astype(int)

# Keep only the comment text (renamed to 'text') and the target.
df = df.rename(columns={'comment_text': 'text'})[['text', 'y']]
df.sample(5)

In [None]:
# Distribution of the summed toxicity target before down-sampling.
df['y'].value_counts()

## Reduce the rows with 0 toxicity 

In [None]:
# Keep every toxic row, draw 1.5x as many zero-toxicity rows, then shuffle.
# NOTE: no random_state is passed, so the drawn sample differs between runs.
toxic_rows = df[df.y > 0]
n_zero_rows = int(len(toxic_rows) * 1.5)
zero_rows = df[df.y == 0].sample(n_zero_rows)
df = pd.concat([toxic_rows, zero_rows], axis=0).sample(frac=1)

print(df.shape)

In [None]:
# Distribution of the target after down-sampling the zero-toxicity rows.
df['y'].value_counts()

## Text cleaning 

In [None]:
def clean(data, col):
    """Normalise punctuation and repeated characters in a text column.

    The column is overwritten in place on ``data`` and the same DataFrame is
    returned, so both ``clean(df, c)`` and ``df = clean(df, c)`` work.

    Every pattern-based step passes ``regex=True`` explicitly: pandas 2.0
    changed the default of ``Series.str.replace`` to ``regex=False``, under
    which these patterns would be searched for literally and the cleaning
    would silently do nothing.

    Args:
        data: pandas DataFrame holding the text.
        col: name of the string column to clean.

    Returns:
        ``data`` itself, with ``data[col]`` cleaned and stripped.
    """
    # Newlines are a plain literal replacement.
    text = data[col].str.replace('\n', ' ', regex=False)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        # Clean some punctuation: "word/word" -> "word / word"
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        # Collapse runs of double quotes to a single one
        (r'(")\1+', r'\1'),
        # Runs of 3+ of * ! ? ' (not followed by a word boundary) -> 2
        (r'([*!?\'])\1\1+\B', r'\1\1'),
        # Runs of 3+ of a word character inside a word -> 2
        (r'(\w)\1\1+\B', r'\1\1'),
        # Runs of a repeated word character at the end of a word -> 1
        (r'(\w)\1+\b', r'\1'),
    )
    for pattern, repl in substitutions:
        text = text.str.replace(pattern, repl, regex=True)

    data[col] = text.str.strip()
    return data


In [None]:
#df.text.str.extractall(r'([a-zA-Z]+[/!?.][a-zA-Z]+)')
#df.text.str.extractall(r'([^\w ]{3,})')[[0]].value_counts()#.reset_index()#[:20]

In [None]:
# Try each cleaning regex on a toy string: (pattern, replacement, sample).
regex_demos = (
    (r'(")\1+', r'\1', 'gooo"""""od """" "" " brooo goodoo'),
    (r'(\w)\1+\b', r'\1', 'gooood brooo goodoo'),
    (r'([*!?\'])\1+\B', r'\1\1', "g*******d  g**d br!!!!! g''''oodoo"),
    (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3', 'gooood/brooo 4.3 df'),
)
for demo_pattern, demo_repl, demo_text in regex_demos:
    print(re.sub(demo_pattern, demo_repl, demo_text))

# Create Sklearn Pipeline with 
## TFIDF - Take 'char_wb' as analyzer to capture subwords well
## Ridge - Ridge is a simple regression algorithm that will reduce overfitting 

In [None]:
def get_text_length(x):
    """Return the length of every item of ``x`` as an (n, 1) column array."""
    lengths = np.array(list(map(len, x)))
    return lengths.reshape(-1, 1)


In [None]:
# Character n-grams (3 to 5 chars, word-boundary aware) to capture subwords.
char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
# Whole-word tokens, restricted by the token pattern to 6+ word characters.
word_tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'(?u)\b\w{6,}\b',
    min_df=2,
    max_df=0.5,
)

# Both TF-IDF views are concatenated side by side.
# (A text-length feature via get_text_length was tried here and left disabled.)
features = FeatureUnion([
    ("vect1", char_tfidf),
    ("vect2", word_tfidf),
])

# Model 1: TF-IDF features -> Ridge regression.
# (MaxAbsScaler and RandomForestRegressor were tried as alternatives and left disabled.)
pipeline = Pipeline([
    ("vect", features),
    ("clf", Ridge(alpha=1)),
])

In [None]:
# Train the pipeline
# clean() rewrites df['text'] in place and returns the same frame.
df = clean(df, 'text')

# Fit the TF-IDF + Ridge pipeline on the cleaned text against the summed-label target.
pipeline.fit(df['text'], df['y'])

In [None]:
# What are the important features for toxicity

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back for older releases.
# The names are fetched once and reused for both the count and the pairing.
_union = pipeline['vect']
_names = (_union.get_feature_names_out()
          if hasattr(_union, 'get_feature_names_out')
          else _union.get_feature_names())

print('Total number of features:', len(_names))

# (feature name, Ridge coefficient rounded to 2 dp), largest coefficient first.
feature_wts = sorted(zip(_names, np.round(pipeline['clf'].coef_, 2)),
                     key=lambda x: x[1],
                     reverse=True)
del _names  # the names now live inside feature_wts; drop the extra reference

feature_wts[:50]

In [None]:
# Extract top features from Ridge model

# Split the 2000 highest-weighted features back into one vocabulary per
# vectorizer, removing the FeatureUnion name prefix.
f1_lst = []
f2_lst = []
for feat_name, _weight in feature_wts[:2000]:
    if feat_name.startswith('vect1__'):
        ngram = feat_name.replace('vect1__', '')
        # Keep only n-grams with more than one character left after stripping whitespace.
        if len(ngram.strip()) > 1:
            f1_lst.append(ngram)
    elif feat_name.startswith('vect2__'):
        f2_lst.append(feat_name.replace('vect2__', ''))

print(len(f1_lst), len(f2_lst))
print(f2_lst[:10])

In [None]:
# Same two TF-IDF views as model 1, but with each vocabulary fixed to the
# feature lists f1_lst / f2_lst instead of being learned from the data.
char_tfidf_1b = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    vocabulary=f1_lst,
)
word_tfidf_1b = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'(?u)\b\w{6,}\b',
    vocabulary=f2_lst,
)

features1b = FeatureUnion([
    ("vect1", char_tfidf_1b),
    ("vect2", word_tfidf_1b),
])

# Second model: reduced TF-IDF features -> random forest regressor.
# (MaxAbsScaler and RidgeCV were tried as alternatives and left disabled.)
pipeline1b = Pipeline([
    ("vect", features1b),
    ("clf", RandomForestRegressor(n_estimators=50, min_samples_leaf=3)),
])

In [None]:
# Fit the fixed-vocabulary TF-IDF + random forest model on the same cleaned text and target.
pipeline1b.fit(df['text'], df['y'])

In [None]:
# Number of features the second model sees.
# `get_feature_names()` was removed in scikit-learn 1.2; use the `_out`
# variant when it exists and fall back for older releases.
_vect1b = pipeline1b['vect']
print(len(_vect1b.get_feature_names_out()
          if hasattr(_vect1b, 'get_feature_names_out')
          else _vect1b.get_feature_names()))

In [None]:
#df_pred=pipeline.predict(df.text)
# df['pred'] = df_pred
# df['diff'] = np.abs(df['pred'] - df['y'])
# df.sort_values('diff',ascending=False).head(30)

# Pair each random-forest importance with the feature it belongs to.
# The names must come from pipeline1b's own vectorizer: pipeline (model 1) has
# a different, unrestricted vocabulary, and zip() would silently truncate and
# mispair names with importances.
# `get_feature_names()` was removed in scikit-learn 1.2, hence the fallback.
_vect1b = pipeline1b['vect']
_names1b = (_vect1b.get_feature_names_out()
            if hasattr(_vect1b, 'get_feature_names_out')
            else _vect1b.get_feature_names())

feature_wts1b = sorted(zip(_names1b,
                           np.round(pipeline1b['clf'].feature_importances_, 2)),
                       key=lambda x: x[1],
                       reverse=True)

feature_wts1b[:40]


In [None]:

# Drop the training frame and the feature-weight lists, then force a garbage-collection pass.
del df, feature_wts, feature_wts1b
gc.collect()


## Create model 2 from Unintended Bias competition (currently disabled: every cell in this section is commented out)

In [None]:
# df2 = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
# print(df2.shape)

# df2['y'] = df2[[ 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']].sum(axis=1)
# df2 = df2[['comment_text', 'y']].rename(columns={'comment_text': 'text'})
# df2.sample(5)

In [None]:
#df2.y.value_counts()

In [None]:
# df2 = pd.concat([df2[df2.y>0] , 
#                  df2[df2.y==0].sample(int(len(df2[df2.y>0])*0.5)) ], axis=0)\
#     .sample(frac=0.2)

# print(df2.shape)

In [None]:
# features2 = FeatureUnion(
# [
#         ("vect1", TfidfVectorizer(min_df= 3, max_df=0.5, analyzer = 'char_wb', ngram_range = (3,5))),
#         ("vect2", TfidfVectorizer(min_df= 2, max_df=0.5, analyzer = 'word', token_pattern=r'(?u)\b\w{6,}\b')),
#         #('count', FunctionTransformer(get_text_length, validate=False)),
     
# ])

# pipeline2 = Pipeline(
#     [
#         ("vect", features2),
#         ("sca", MaxAbsScaler()),
#         #("vect", TfidfVectorizer(min_df= 5, max_df=0.3, analyzer = 'char_wb', ngram_range = (4,6))),
#         #("clf", RandomForestRegressor(n_estimators = 5, min_sample_leaf=3)),
#         #("clf", Ridge(alpha=1)),
#         ("clf", RidgeCV(alphas=(1,3), cv = 3 )),
#         #("clf",LinearRegression())
#     ]
# )

In [None]:
# # Train the pipeline
# df2 = clean(df2, 'text')

# pipeline2.fit(df2['text'], df2['y'])

In [None]:
# # What are the important features for toxicity

# feature_wts2 = sorted(list(zip(pipeline2['vect'].get_feature_names(), 
#                                np.round(pipeline2['clf'].coef_,2) )), 
#                       key = lambda x:x[1], 
#                       reverse=True)

# feature_wts2[:50]

In [None]:

# del df2, feature_wts2
# gc.collect()


# Validate the pipeline 

In [None]:
# Validation data: the 'less_toxic' and 'more_toxic' columns are the two comments of each pair.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
print(df_val.shape)

# Apply the same cleaning used on the training text to both comment columns.
df_val = clean(df_val, 'less_toxic')
df_val = clean(df_val, 'more_toxic')


### Model 1

In [None]:

# Score both sides of every validation pair with model 1 (Ridge).
p1 = pipeline.predict(df_val['less_toxic'])
p2 = pipeline.predict(df_val['more_toxic'])

# A pair counts as correct only when the less-toxic side scores strictly lower.
acc_model1 = np.round(np.mean(p1 < p2) * 100, 2)
f'Validation Accuracy from Model 1 is {acc_model1}'

### Model 2

In [None]:
# Score both sides of every validation pair with model 2 (random forest).
p3 = pipeline1b.predict(df_val['less_toxic'])
p4 = pipeline1b.predict(df_val['more_toxic'])

# A pair counts as correct only when the less-toxic side scores strictly lower.
acc_model2 = np.round(np.mean(p3 < p4) * 100, 2)
f'Validation Accuracy from Model 2 is {acc_model2}'

## Model1 + Model2
### Start with scaling the predictions first before combining

In [None]:
# One scaler per model, each fitted on that model's validation predictions
# for both sides of the pairs stacked into a single column.
scale1, scale2 = StandardScaler(), StandardScaler()

scale1.fit(np.concatenate((p1, p2)).reshape(-1, 1))
scale2.fit(np.concatenate((p3, p4)).reshape(-1, 1))


In [None]:
# Ensemble score = sum of the two models' standardised predictions.
p5 = scale1.transform(p1.reshape(-1,1)) + scale2.transform(p3.reshape(-1,1))  # less_toxic side
p6 = scale1.transform(p2.reshape(-1,1)) + scale2.transform(p4.reshape(-1,1))  # more_toxic side

# Label corrected: this figure is for the combined ensemble, not Model 2 alone.
f'Validation Accuracy from Model 1 + Model 2 is { np.round((p5 < p6).mean() * 100,2)}'

## Analyze bad predictions 
### Incorrect predictions with similar scores
### Incorrect predictions with different scores

In [None]:
# Attach the ensemble scores to the validation frame for error analysis.
# NOTE(review): p5/p6 are built from StandardScaler.transform on (-1, 1)-reshaped
# input, so they are presumably (n, 1) arrays rather than flat vectors — confirm
# the pandas version in use accepts them as single columns.
df_val['p1'] = p5  # ensemble score of the less_toxic comment
df_val['p2'] = p6  # ensemble score of the more_toxic comment
df_val['diff'] = np.abs(p6 - p5)  # absolute score gap within the pair

# 1 when the less_toxic comment scored strictly lower than the more_toxic one, else 0.
df_val['correct'] = (p5 < p6).astype('int')


### Where does most misclassification happen

#### Most come from similar predictions from less toxic comments

In [None]:
# Comparing prediction differences between less toxic and more toxic sentences

# Bucket the score gap and the less-toxic score by rounding up to the next integer.
df_val = df_val.assign(
    diff_grp=np.ceil(df_val['diff']),
    s1_grp=np.ceil(df_val['p1']),
)

# For each grouping, count the misclassified pairs per bucket and show the
# 15 most frequent buckets.
for group_cols in (['s1_grp'], ['diff_grp'], ['s1_grp', 'diff_grp']):
    bucket_counts = (
        df_val[df_val.correct == 0]
        .groupby(group_cols).size().reset_index()
        .set_axis(group_cols + ['cnt'], axis='columns')
        .sort_values('cnt', ascending=False)[:15]
    )
    display(bucket_counts)

In [None]:

### Incorrect predictions with similar scores
# The 20 misclassified pairs with the smallest ensemble score gap.

df_val[df_val.correct == 0].sort_values('diff', ascending=True).head(20)

#### Some of these just look incorrectly tagged 


In [None]:
### Incorrect predictions with dissimilar scores
# The 20 misclassified pairs with the largest ensemble score gap.

df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)

## Analyze the vocabulary of misclassified sentences 

In [None]:
# #df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20).more_toxic.tolist()
# comm=df_val.more_toxic[29057]
# print(comm)
# print(vect_an(comm))
# [v for v in vect_an(comm) if (v not in vocab) & (v.strip() not in obj.stop_words_)]
# vect_an(comm)


In [None]:
# Load TFIDF vectorizer
# First transformer of model 1's FeatureUnion ("vect1", char n-grams).
obj = pipeline["vect"].transformer_list[0][1]
print(obj)
vect_an = obj.build_analyzer()
vocab = obj.vocabulary_
print(len(vocab))

# Second transformer ("vect2", word tokens).
obj2 = pipeline["vect"].transformer_list[1][1]
print(obj2)  # previously printed `obj`, i.e. the first vectorizer a second time
vect_an2 = obj2.build_analyzer()
vocab2 = obj2.vocabulary_

# Show how each vectorizer tokenises the more_toxic comment of the 20
# misclassified pairs with the largest score gap.
worst_pairs = df_val[df_val.correct == 0].sort_values('diff', ascending=False).head(20)
tmp = [(comm, vect_an(comm), vect_an2(comm)) for comm in worst_pairs.more_toxic.tolist()]
pd.DataFrame(tmp, columns = ["comment", "tokenized1", "tokenized2"])

# Predict on test data 

In [None]:
# Comments that need a score for the submission.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


In [None]:
# Predict using pipeline
# Same cleaning as applied to the training and validation text.
df_sub = clean(df_sub, 'text')

m1_preds = pipeline.predict(df_sub['text'])
m2_preds = pipeline1b.predict(df_sub['text'])

# Combine as in validation: standardise each model's output with its scaler
# (fitted on the validation predictions), then add the two.
m1_scaled = scale1.transform(m1_preds.reshape(-1, 1))
m2_scaled = scale2.transform(m2_preds.reshape(-1, 1))
df_sub['score'] = m1_scaled + m2_scaled

## Correct the rank ordering

In [None]:
# Cases with duplicate scores
# (non-null row count minus number of distinct score values)

df_sub['score'].count() - df_sub['score'].nunique()

In [None]:
# First ten rows of the per-score occurrence counts.
df_sub['score'].value_counts().reset_index()[:10]

In [None]:
# Rank the predictions

# method='ordinal' gives every row a distinct rank, so tied scores no longer collide.
df_sub['score']  = scipy.stats.rankdata(df_sub['score'], method='ordinal')

# Sanity check: number of distinct ranks after re-ranking.
print(df_sub['score'].rank().nunique())

In [None]:
# Write only the id and the ranked score, without the index column.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
# Re-read the same file without calling clean(), so the text in `test` is the raw comment.
# Scores are copied over positionally, which relies on both frames having the same row order.
test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
test['score'] = df_sub.score.tolist()

In [None]:
# The 20 comments with the highest rank (scored as most toxic).
test.sort_values('score',ascending=False).head(20)

In [None]:
# The 20 comments with the lowest rank (scored as least toxic).
test.sort_values('score',ascending=True).head(20)