In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.manifold import TSNE
from sklearn.model_selection import StratifiedKFold

import warnings
# Silence every warning for the rest of the session; no category or module
# filter is given, so this also hides warnings emitted by pandas / sklearn.
warnings.filterwarnings('ignore')

In [None]:
# Path of the CSV with out-of-fold predictions that is read into `df` below.
OOF = '../input/clrp-roberta-base-task-fineturning-train/oof_df.csv'

# Seed passed as `random_state` to TSNE and StratifiedKFold in this notebook.
SEED = 28

In [None]:
def plot_target_vs_se(df, fold=0):
    """Scatter-plot ``standard_error`` against ``target`` for a single fold.

    Only rows whose ``fold`` equals ``fold`` and whose ``standard_error`` is
    non-zero are drawn. The figure is shown immediately; nothing is returned.

    Args:
        df: DataFrame with ``target``, ``standard_error`` and ``fold`` columns.
        fold: fold value to select.
    """
    in_fold = df.fold == fold
    nonzero_se = df.standard_error != 0
    subset = df.loc[nonzero_se & in_fold]

    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        x='target',
        y='standard_error',
        hue='fold',
        palette='bright',
        data=subset,
    )
    plt.show()

def add_features(df):
    """Add signed and absolute prediction-error columns to ``df``.

    ``error`` is ``oof - target`` and ``abs_error`` is its absolute value.
    The columns are written into ``df`` itself and the same object is returned.

    Args:
        df: DataFrame with ``oof`` and ``target`` columns.

    Returns:
        The input ``df`` with ``error`` and ``abs_error`` columns set.
    """
    signed = df['oof'] - df['target']
    df['error'] = signed
    df['abs_error'] = signed.abs()
    return df

In [None]:
# Load the out-of-fold predictions and derive the error / abs_error columns.
df = pd.read_csv(OOF)
df = add_features(df)
# Keep only rows with a non-zero standard_error. `.copy()` makes the result an
# independent frame: later cells add columns to `df` (tsne_0 / tsne_1), and
# doing that on a bare boolean-mask slice can trigger pandas'
# SettingWithCopyWarning (which this notebook's global warning filter hides).
df = df.loc[df.standard_error != 0].copy()

In [None]:
# Display the row with the largest absolute error; the label is wrapped in a
# list so the selection comes back as a one-row DataFrame.
df.loc[[df['abs_error'].idxmax()], :]

In [None]:
# Display only the 'excerpt' value of the row with the largest absolute error.
df.loc[[df['abs_error'].idxmax()], 'excerpt']


In [None]:
# Index label (not position) of the row with the largest absolute error.
df['abs_error'].idxmax()

In [None]:
# Inspect the index of df. Rows with standard_error == 0 were dropped when df
# was loaded, so the labels may no longer be contiguous.
df.index

In [None]:
# Copy of df without the row(s) carrying the index label of the largest
# absolute error; used to see how much that one sample affects the RMSE.
temp_df = df[~(df.index == df['abs_error'].idxmax())]

In [None]:
from sklearn.metrics import mean_squared_error

def RMSE_(y_pred, y_gt):
    """Return the root-mean-squared error between ``y_pred`` and ``y_gt``.

    Both arguments are forwarded unchanged to ``mean_squared_error`` and the
    square root of its result is returned.

    Args:
        y_pred: first array-like of values.
        y_gt: second array-like of values.

    Returns:
        ``np.sqrt`` of the mean squared error.
    """
    return np.sqrt(mean_squared_error(y_pred, y_gt))

# RMSE between the targets and the out-of-fold predictions over all rows of df.
print(RMSE_(df['target'], df['oof']))

In [None]:
# Same RMSE computed on temp_df, i.e. with the largest-absolute-error row removed.
print(RMSE_(temp_df['target'], temp_df['oof']))

In [None]:
# target vs. standard_error for the rows whose fold is 1.
plot_target_vs_se(df, fold=1)

In [None]:
# target vs. standard_error for the rows whose fold is 3.
plot_target_vs_se(df, fold=3)

In [None]:
# Columns whose pairwise correlations are inspected; the list is reused by the
# heatmap cell that follows.
cols_corr = [
    'target',
    'oof',
    'error',
    'abs_error',
    'standard_error',
]
df[cols_corr].corr()

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the cols_corr columns.
plt.figure(figsize=(10, 8))
sns.heatmap(
    df[cols_corr].corr(),
    annot=True,
)
plt.show()

In [None]:
# abs_error against standard_error, one colour per fold.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='standard_error',
    y='abs_error',
    hue='fold',
    palette='bright',
    data=df,
)
plt.show()

In [None]:
# abs_error against target, one colour per fold.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='target',
    y='abs_error',
    hue='fold',
    palette='bright',
    data=df,
)
plt.show()

In [None]:
# Histogram of standard_error in 10 bins, with the folds drawn side by side.
plt.figure(figsize=(15, 8))
sns.histplot(
    x='standard_error',
    hue='fold',
    palette='bright',
    multiple='dodge',
    bins=10,
    shrink=.8,
    data=df,
)
plt.show()

In [None]:
# Same histogram restricted to rows with a non-zero standard_error.
# NOTE(review): df was already filtered on standard_error != 0 when it was
# loaded, so this mask selects every row if that cell ran unchanged.
plt.figure(figsize=(15, 8))
sns.histplot(
    x='standard_error',
    hue='fold',
    palette='bright',
    multiple='dodge',
    bins=10,
    shrink=.8,
    data=df[df.standard_error != 0],
)
plt.show()

In [None]:
# Embed the (target, standard_error) pairs into two t-SNE dimensions and store
# the two coordinates as the tsne_0 / tsne_1 columns of df.
tsne = TSNE(random_state=SEED, n_components=2)
tsne_target_se = tsne.fit_transform(df[['target', 'standard_error']])
df['tsne_0'], df['tsne_1'] = tsne_target_se.T

In [None]:
# t-SNE embedding coloured by target.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='tsne_0',
    y='tsne_1',
    hue='target',
    data=df,
)
plt.show()

In [None]:
# t-SNE embedding coloured by standard_error.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='tsne_0',
    y='tsne_1',
    hue='standard_error',
    data=df,
)
plt.show()

In [None]:
# t-SNE embedding coloured by fold.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='tsne_0',
    y='tsne_1',
    hue='fold',
    palette='bright',
    data=df,
)
plt.show()

In [None]:
def get_bin_stratified(df, n_bins=15, n_splits=5):
    """Assign cross-validation folds stratified on equal-width bins of ``target``.

    ``target`` is cut into ``n_bins`` equal-width intervals with ``pd.cut`` and
    the bin label is used as the stratification class of a shuffled
    ``StratifiedKFold`` seeded with the module-level ``SEED``.

    The frame is modified in place: a ``bin`` column and an ``int8`` ``fold``
    column are added (or overwritten), and the same object is returned.

    Args:
        df: DataFrame with at least ``id`` and ``target`` columns. Its index
            may be arbitrary; folds are assigned by row position.
        n_bins: number of equal-width target bins.
        n_splits: number of folds.

    Returns:
        ``df`` itself, with the ``bin`` and ``fold`` columns set.
    """
    df['bin'] = pd.cut(df.target, n_bins, labels=list(range(n_bins)))

    # StratifiedKFold.split yields *positional* row indices. Collect the fold
    # ids in a positional array rather than writing through df.loc (which is
    # label-based and would hit the wrong rows, or fail, whenever df does not
    # have a default 0..n-1 index, e.g. after filtering).
    folds = np.full(len(df), -1, dtype='int8')

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    for fold, (_, idx_val) in enumerate(skf.split(df.id, y=df.bin)):
        folds[idx_val] = fold

    df['fold'] = folds
    return df

In [None]:
# Path of the train.csv file read below.
TRAIN = '../input/commonlitreadabilityprize/train.csv'
# NOTE: this rebinds `df`; from here on it holds the frame read from TRAIN,
# not the out-of-fold frame loaded from OOF earlier in the notebook.
df= pd.read_csv(TRAIN)

In [None]:
def get_double_stratified(df, n_splits=5, n_bins_outer=12, n_bins_inner=5):
    """Assign folds stratified on ``standard_error`` within bins of ``target``.

    ``target`` is first cut into ``n_bins_outer`` equal-width bins. Inside each
    outer bin, ``standard_error`` is cut into ``n_bins_inner`` equal-width bins
    and a shuffled ``StratifiedKFold`` (seeded with the module-level ``SEED``)
    splits that bin's rows using the inner-bin label as the class.

    Side effect: ``outer_bin``, ``inner_bin`` and ``fold`` columns are added to
    (or overwritten on) the input ``df``; only ``outer_bin`` is populated
    there. The fold assignment lives in the returned frame.

    NOTE(review): rows with ``standard_error == 0`` are not excluded here; the
    plotting cells filter them separately.
    NOTE(review): an outer bin with fewer rows than ``n_splits`` is passed to
    ``StratifiedKFold`` unchanged -- confirm how sklearn handles that case.

    Args:
        df: DataFrame with ``id``, ``target`` and ``standard_error`` columns.
        n_splits: number of folds.
        n_bins_outer: number of equal-width bins over ``target``.
        n_bins_inner: number of equal-width bins over ``standard_error``
            inside each outer bin.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, rows ordered by outer bin
        and then by fold, and an ``int16`` ``fold`` column.
    """
    df['outer_bin'] = pd.cut(df.target, n_bins_outer, labels=list(range(n_bins_outer)))
    df['inner_bin'] = np.nan
    df['fold'] = np.nan

    skf = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
    inner_labels = list(range(n_bins_inner))

    # Collect the per-fold slices and concatenate once at the end instead of
    # growing a DataFrame with pd.concat on every iteration.
    pieces = []

    for outer_bin in range(n_bins_outer):
        # reset_index makes labels equal positions, so the positional indices
        # returned by skf.split can be used with .loc below.
        temp_df = df.loc[df.outer_bin == outer_bin].reset_index(drop=True)
        if temp_df.empty:
            # pd.cut cannot derive equal-width edges from an empty column and
            # there is nothing to assign for this bin anyway.
            continue

        temp_df['inner_bin'] = pd.cut(temp_df.standard_error, n_bins_inner, labels=inner_labels)

        for fold, (_, idx_val) in enumerate(skf.split(temp_df.id, y=temp_df.inner_bin)):
            temp_df.loc[idx_val, 'fold'] = fold
            pieces.append(temp_df.loc[idx_val])

    skf_df = pd.concat(pieces, axis=0).reset_index(drop=True)
    skf_df['fold'] = skf_df['fold'].astype('int16')
    return skf_df

In [None]:
# Replace df with the fold-assigned frame returned by get_double_stratified
# (it carries outer_bin, inner_bin and fold columns).
df = get_double_stratified(df)

In [None]:
# standard_error against target for every fold at once, skipping rows whose
# standard_error is exactly 0.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x='target',
    y='standard_error',
    hue='fold',
    palette='bright',
    data=df[df.standard_error != 0],
)
plt.show()

In [None]:
# target vs. standard_error for the rows whose fold is 4.
plot_target_vs_se(df, fold=4)

In [None]:
# target vs. standard_error for the rows whose fold is 2.
plot_target_vs_se(df, fold=2)