# Stratified k-fold for validation and training

## References
1. https://www.kaggle.com/ruchi798/commonlit-readability-prize-eda-baseline
2. https://www.kaggle.com/c/commonlitreadabilityprize/discussion?sort=votes

# Key idea
1. How to fine-tune a transformer: https://huggingface.co/transformers/quicktour.html

## Suspect
1. tokenizer cased or uncased
2. (x - target) should be divided by the standard error
3. Why is there a text whose standard error equals 0?

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 
from pandas import DataFrame

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer as CV
from wordcloud import WordCloud,STOPWORDS

import transformers
from transformers import (
    AutoTokenizer,
)

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train_df = pd.read_csv("../input/commonlitreadabilityprize/train.csv")
test_df = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Preview the first rows of the test table.
test_df.head()

In [None]:
# Count the distinct values in each training column.
train_df.nunique()

In [None]:
# Excerpt(s) whose target equals the largest / smallest target in the training set.
target_values = train_df["target"]
max_text = train_df.loc[target_values == target_values.max(), 'excerpt']
min_text = train_df.loc[target_values == target_values.min(), 'excerpt']

In [None]:
# Show the raw text of the excerpt(s) with the highest target.
max_text.values

In [None]:
# Show the raw text of the excerpt(s) with the lowest target.
min_text.values

In [None]:
# Side-by-side density plots of the target and standard_error columns.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# Colour palette for the plots in this notebook.
palette = ["#7209B7", "#3F88C5", "#136F63", "#F72585", "#FFBA08"]
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(train_df['target'], color=palette[0], fill=True, ax=ax[0])
sns.kdeplot(train_df['standard_error'], color=palette[1], fill=True, ax=ax[1])
plt.show()

In [None]:
# Scatter standard_error against target to see how the two columns relate.
plt.scatter(train_df['target'], train_df['standard_error'])
plt.show()

In [None]:
# Show the excerpt text of every row whose target is exactly 0.
train_df[train_df["target"] == 0]['excerpt'].values

# Tokenizer and N-gram

In [None]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
_tokenizer_name = 'roberta-base'
# When True, the non-fast tokenizer implementation is requested below.
_use_slow_tokenizer = True
# `use_fast` is the negation of the slow-tokenizer flag.
tokenizer = AutoTokenizer.from_pretrained(_tokenizer_name, use_fast=not _use_slow_tokenizer)

In [None]:
# Size of the tokenizer's vocabulary.
len(tokenizer)

In [None]:
# Tokenize a short sample sentence to inspect the returned fields,
# including the special-tokens mask.
tokenizer(["you and me"], return_special_tokens_mask=True)

In [None]:
# Decode a hand-written list of token ids back to text
# (presumably the ids printed for the sample sentence above; confirm).
tokenizer.decode([0, 6968, 8, 162, 2])

In [None]:
# Tokenize every training excerpt in one batched call.
tokens = tokenizer(train_df.excerpt.values.tolist(), return_special_tokens_mask=True)
# Number of tokenized excerpts (one id list per input text).
len(tokens.input_ids)

In [None]:
# Keep the token ids of each excerpt alongside the raw text.
train_df['token_preprocessed'] = tokens.input_ids
# Round-trip the ids back to text. Special tokens (such as <s> and </s>) are
# skipped so the decoded excerpt holds only passage text, not tokenizer markers.
train_df['excerpt_preprocessed'] = [
    tokenizer.decode(ids, skip_special_tokens=True) for ids in tokens.input_ids
]

In [None]:
# Preview the frame with the token-id and decoded-text columns added.
train_df.head()

In [None]:
def _top_ngrams(corpus, n, ngram_range):
    """Return the ``n`` most frequent n-grams in ``corpus``.

    Args:
        corpus: Iterable of text documents.
        n: Number of entries to keep; ``None`` keeps every n-gram.
        ngram_range: ``(min_n, max_n)`` tuple passed to the count vectorizer.

    Returns:
        List of ``(term, count)`` tuples sorted by count, highest first.
    """
    vec = CV(ngram_range=ngram_range)
    # fit_transform learns the vocabulary and counts in a single pass instead
    # of analysing the whole corpus twice (fit, then transform).
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the corpus-wide count of each vocabulary entry.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(word, totals[idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words as ``(word, count)`` tuples."""
    return _top_ngrams(corpus, n, (1, 1))


def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences as ``(bigram, count)`` tuples."""
    return _top_ngrams(corpus, n, (2, 2))


def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences as ``(trigram, count)`` tuples."""
    return _top_ngrams(corpus, n, (3, 3))

In [None]:
def plot_bt(x, w, p, n=20):
    """Plot the most frequent n-grams of the preprocessed excerpts as a bar chart.

    Reads the module-level ``train_df['excerpt_preprocessed']`` column.

    Args:
        x: Callable ``x(corpus, n)`` returning ``(term, count)`` pairs,
            e.g. ``get_top_n_bigram``.
        w: Label appended to the plot title (e.g. ``"bigrams"``).
        p: Seaborn palette specification used for the bar edge colours.
        n: Number of top terms to plot. Defaults to 20.

    Returns:
        DataFrame with columns ``word`` and ``freq`` holding the plotted terms.
    """
    common_words = x(train_df['excerpt_preprocessed'], n)
    common_words_df = DataFrame(common_words, columns=['word', 'freq'])

    plt.figure(figsize=(16, 8))
    # Transparent bars with coloured outlines: one edge colour per bar.
    sns.barplot(
        x='freq',
        y='word',
        data=common_words_df,
        facecolor=(0, 0, 0, 0),
        linewidth=3,
        edgecolor=sns.color_palette(p, n),
    )
    plt.title(f"Top {n} {w}", font='Serif')
    plt.xlabel("Frequency", fontsize=14)
    plt.yticks(fontsize=13)
    plt.xticks(rotation=45, fontsize=13)
    plt.ylabel("")
    return common_words_df

In [None]:
# All three n-gram charts share one layout, so the unigram chart goes through
# plot_bt like the other two instead of repeating its plotting code inline.
common_words_df1 = plot_bt(get_top_n_words, "unigrams", "ch:start=3, rot=.1")
common_words_df2 = plot_bt(get_top_n_bigram, "bigrams", "ch:rot=-.5")
common_words_df3 = plot_bt(get_top_n_trigram, "trigrams", "ch:start=-1, rot=-.6")

In [None]:
# color function for the wordcloud
def color_wc(word=None,font_size=None,position=None, orientation=None,font_path=None, random_state=None):
    """Return an HSL colour string for one word-cloud word.

    Hue and saturation are fixed; only the lightness varies, driven by
    ``random_state.randint(80, 120)``. The other parameters are accepted
    but not used.
    """
    hue = int(360.0 * 150.0 / 255.0)
    saturation = int(100.0 * 255.0 / 255.0)
    shade = float(random_state.randint(80, 120))
    lightness = int(100.0 * shade / 255.0)
    return f"hsl({hue}, {saturation}%, {lightness}%)"

# Render a word cloud over all decoded excerpts, coloured by color_wc.
plt.subplots(figsize=(16, 16))
wc = WordCloud(
    stopwords=STOPWORDS,
    background_color="white",
    contour_width=2,
    contour_color='blue',
    width=1500,
    height=750,
    color_func=color_wc,
    max_words=150,
    max_font_size=256,
    random_state=42,
)
# The cloud is generated from every excerpt joined into one string.
corpus_text = ' '.join(train_df['excerpt_preprocessed'])
wc.generate(corpus_text)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

# Max and min length

In [None]:
# Token count of each excerpt (length of its token-id list).
train_df['length'] = train_df['token_preprocessed'].map(len)

In [None]:
# Rows whose token count equals the largest / smallest token count.
token_counts = train_df["length"]
max_len = train_df.loc[token_counts == token_counts.max()]
min_len = train_df.loc[token_counts == token_counts.min()]

In [None]:
# Row(s) with the longest token sequence.
max_len

In [None]:
# Row(s) with the shortest token sequence.
min_len

In [None]:
# Density of excerpt token counts; `fill=True` is the current spelling of the
# deprecated `shade=True` keyword.
sns.kdeplot(train_df['length'], color=palette[0], fill=True)

In [None]:
# Hexbin joint plot of target against token count.
sns.jointplot(x=train_df['target'], y=train_df['length'], kind='hex',height=10,edgecolor=palette[4])
# Adjust the top margin of the figure before showing it.
plt.subplots_adjust(top=0.95)
plt.show()

# K-fold

In [None]:
def getBin(x):
    """Map a target value to its bin id (1-5).

    Bins: ``x < -3`` -> 1, ``[-3, -2)`` -> 2, ``[-2, 0)`` -> 3,
    ``[0, 1)`` -> 4, ``x >= 1`` -> 5. A value that satisfies none of the
    comparisons (e.g. NaN) gets bin 1.
    """
    if x < -3:
        return 1
    if x < -2:
        return 2
    if x < 0:
        return 3
    if x < 1:
        return 4
    if x >= 1:
        return 5
    # Only reached when every comparison above is false.
    return 1
            
# Bucket every target value into its bin id.
train_df["bin"] = train_df["target"].apply(getBin)

In [None]:
# Preview the frame with the bin column added.
train_df.head()

In [None]:
# Assign every row to one of 5 folds, stratified on the target bin.
X = train_df
y = train_df['bin']
# A fixed seed makes the shuffled split reproducible across notebook runs.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
# split() yields row positions, not index labels, so fold ids are collected
# positionally in an integer array. Assigning piecewise via .loc into a missing
# column would depend on the index labels and would produce a float column.
fold_ids = np.full(len(train_df), -1, dtype=int)
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print("TRAIN:", len(train_index), "TEST:", len(test_index))
    # A row's fold is the split in which it lands in the held-out part.
    fold_ids[test_index] = fold
train_df['fold'] = fold_ids

In [None]:
# Preview the frame with the fold column added.
train_df.head()

In [None]:
# For each fold, print its share of all rows, then for each bin the share of
# the fold's rows in that bin and the mean target of those rows.
total_rows = len(train_df)
for fold_id in range(5):
    fold_rows = train_df[train_df["fold"] == fold_id]
    print(fold_id)
    print(len(fold_rows) / total_rows)
    print("")
    for bin_id in range(1, 6):
        bin_rows = fold_rows[fold_rows["bin"] == bin_id]
        print(len(bin_rows) / len(fold_rows))
        print(bin_rows["target"].mean())
    print("-----")

In [None]:
# Write the augmented training frame (including the token, length, bin and fold columns) to disk.
# NOTE(review): the DataFrame index is written as an extra unnamed column; confirm readers expect it.
train_df.to_csv("./updated_train.csv")