In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
# Load the validation file; later cells read its `worker`, `less_toxic`
# and `more_toxic` columns.
validation_csv = "../input/jigsaw-toxic-severity-rating/validation_data.csv"
df_val = pd.read_csv(validation_csv)
df_val.shape

### As seen previously, the validation data contains pairs with contradictory ratings from annotators
### Let's look at such cases

In [None]:
gp1 = df_val.copy()

# Build an order-independent key for every row so that (A, B) and (B, A)
# refer to the same comment pair. Sorting the two texts removes the ordering;
# a NUL separator is used so the joined key cannot be confused with a space
# that is part of one of the comments.
pair_keys = [
    "\x00".join(sorted(pair))
    for pair in zip(gp1['less_toxic'], gp1['more_toxic'])
]

# Give each distinct key a compact id. `pd.factorize` numbers keys in order of
# first appearance, so the ids are reproducible across kernel restarts and two
# different pairs can never share an id. The previous
# `abs(hash(x)) % 10**8` was neither: Python salts string hashes per process,
# and folding ~10k keys into 1e8 buckets has a real chance of a collision that
# would silently merge unrelated pairs.
pair_codes, _ = pd.factorize(pd.Series(pair_keys))
gp1['pair_hash'] = pair_codes.astype(str)  # kept as strings, like the original column

f'No. of rows in val_data: {len(gp1)} and no. of unique sentence pairs: {len(gp1.pair_hash.drop_duplicates())}'


## No. of cases per pair

In [None]:
# How many rows each unique pair has, then how often each row-count occurs.
rows_per_pair = gp1.groupby(['pair_hash']).size().reset_index()[0]
rows_per_pair.value_counts()

- Only 108 pairs were rated by a single worker
- 10000 pairs were rated by 3 workers

## Counts per unique pair
### A (less_toxic, more_toxic) ordering that occurs 3 times is unanimous (3/3 workers gave the same rating)


In [None]:
# pair_cnt: number of rows (non-null `worker` entries) recorded for the
# unordered pair, regardless of which comment was marked more toxic.
gp1['pair_cnt'] = gp1.groupby(['pair_hash'])['worker'].transform('count')

# cnt: number of rows that share this exact (less_toxic, more_toxic) ordering
# within the pair.
ordering_cols = ['pair_hash', 'less_toxic', 'more_toxic']
gp1['cnt'] = gp1.groupby(ordering_cols)['worker'].transform('count')

# Distribution of `cnt` over the distinct orderings.
orderings = gp1[['less_toxic', 'more_toxic', 'cnt']].drop_duplicates()
print(orderings.cnt.value_counts())


### Max possible score on val data is ~0.824

In [None]:
# By this logic, the max possible score is the share of rows that agree with
# the majority ordering of their pair: a scorer that ranks a pair the same way
# every time can match at most the largest group of annotators for that pair.
#
# `cnt` is the number of rows sharing a row's exact ordering, so the largest
# `cnt` inside a pair is its majority size (3 for unanimous pairs, 2 for 2-vs-1
# pairs, 1 for single-worker pairs). Computing it from `gp1` replaces the
# earlier hand-copied form (4698*3 + 5302*2 + (5410-5302)) / len(df_val) and
# stays correct if the data changes.
majority_rows = gp1.groupby('pair_hash')['cnt'].max().sum()
float(majority_rows) / len(df_val)

- 4698 cases are fine - 3/3 same rating
- 5302 cases are being rated as same by 2/3 workers

### Rows from pairs rated by 3 workers where this row is the single dissenting rating

In [None]:
# Widen text columns so the comments are readable in the rendered table.
pd.options.display.max_colwidth = 200

# Rows whose pair has three rows in total and whose own ordering occurs once.
has_three_rows = gp1.pair_cnt == 3
ordering_seen_once = gp1.cnt == 1
gp1[has_three_rows & ordering_seen_once]

## Since LB scores are already ~0.85, val and test data must be quite different.
### It is possible that the test data is cleaner and doesn't contain pairs with contradictory ratings

- In that case, remove the rows with contradictory ratings:

In [None]:
# Drop every row whose pair has three rows and whose ordering occurs only once,
# keeping just the original validation columns.
is_lone_dissent = (gp1.pair_cnt == 3) & (gp1.cnt == 1)
df_val2 = gp1.loc[~is_lone_dissent, ['worker', 'less_toxic', 'more_toxic']]

In [None]:
# (rows, columns) remaining after removing rows with pair_cnt == 3 and cnt == 1.
df_val2.shape

### Test data could also be unique cases of these pairs

In [None]:
# Keep one row per distinct (less_toxic, more_toxic) ordering.
pair_cols = ['less_toxic', 'more_toxic']
df_val3 = df_val2.loc[:, pair_cols].drop_duplicates()
df_val3.shape

### Validation scores

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge,RidgeCV, LinearRegression
from sklearn.pipeline import Pipeline, FeatureUnion


In [None]:
# Fixed seed so the down-sampling and shuffle below give the same training set
# (and therefore the same validation scores) on every Restart & Run All.
SEED = 42

df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
print(df.shape)

# Give more weight to severe toxic 
df['severe_toxic'] = df.severe_toxic * 3
df['threat'] = df.threat * 2

# Regression target: sum of the (re-weighted) label columns.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['y'] = df[label_cols].sum(axis=1).astype(int)
df = df[['comment_text', 'y']].rename(columns={'comment_text': 'text'})

# Reduce rows with 0 toxicity: keep every row with y > 0 and at most 1.5x as
# many rows with y == 0. The `min` guards against asking `.sample` for more
# rows than exist, which would raise.
toxic_rows = df[df.y > 0]
clean_rows = df[df.y == 0]
n_clean = min(len(clean_rows), int(len(toxic_rows) * 1.5))

df = pd.concat(
    [toxic_rows, clean_rows.sample(n_clean, random_state=SEED)],
    axis=0,
).sample(frac=1, random_state=SEED)  # shuffle the combined rows

print(df.shape)

In [None]:
# Character n-gram (3-5, word-boundary aware) TF-IDF features.
char_tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
features = FeatureUnion([("vect1", char_tfidf)])

# Vectorise the text, then fit a ridge regression on the toxicity score.
pipeline = Pipeline(
    steps=[
        ("vect", features),
        ("clf", Ridge(alpha=1)),
    ]
)

In [None]:
# Fit the TF-IDF + Ridge pipeline on the prepared training frame.
train_text, train_target = df['text'], df['y']
pipeline.fit(train_text, train_target)


### Validate on __actual__ val data

In [None]:
# Score both sides of every validation row.
p1, p2 = (pipeline.predict(df_val[side]) for side in ('less_toxic', 'more_toxic'))

# Percentage of rows where `less_toxic` is scored strictly below `more_toxic`.
acc_val = np.round(np.mean(p1 < p2) * 100, 2)
f'Validation Accuracy from Model 1 is {acc_val}'

### Validate on __modified__ val2 data

In [None]:
# Score both sides of every row in df_val2.
p1_m, p2_m = (pipeline.predict(df_val2[side]) for side in ('less_toxic', 'more_toxic'))

# Percentage of rows where `less_toxic` is scored strictly below `more_toxic`.
acc_val2 = np.round(np.mean(p1_m < p2_m) * 100, 2)
f'Validation Accuracy from Model 1 is {acc_val2}'

### Validate on __modified__ val3 data

In [None]:
# Score both sides of every row in df_val3.
p1_m3, p2_m3 = (pipeline.predict(df_val3[side]) for side in ('less_toxic', 'more_toxic'))

# Percentage of rows where `less_toxic` is scored strictly below `more_toxic`.
acc_val3 = np.round(np.mean(p1_m3 < p2_m3) * 100, 2)
f'Validation Accuracy from Model 1 is {acc_val3}'

## Predict on test data

In [None]:
# Load the comments to be scored and attach the pipeline's predictions.
comments_csv = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"
df_sub = pd.read_csv(comments_csv)

m1_preds = pipeline.predict(df_sub['text'])
df_sub = df_sub.assign(score=m1_preds)

In [None]:
# Write only the id and score columns, without the index.
submission = df_sub.loc[:, ['comment_id', 'score']]
submission.to_csv("submission.csv", index=False)

## The validation score is much better aligned with LB now