In [None]:
# packages
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read files
# All three competition CSVs live in the same directory; keeping that directory
# in one place means relocating the data is a one-line change.
INPUT_DIR = '../input/commonlitreadabilityprize'

df_train = pd.read_csv(INPUT_DIR + '/train.csv')            # training data
df_test = pd.read_csv(INPUT_DIR + '/test.csv')              # test data
sub = pd.read_csv(INPUT_DIR + '/sample_submission.csv')     # sample submission

In [None]:
# first five rows of the training data
df_train.head(n=5)

In [None]:
# show (full) test set: the whole frame is displayed here rather than a head() preview
df_test

In [None]:
# more details for training data: print pandas' concise summary of the frame
df_train.info()

In [None]:
# basic descriptive statistics for the numerical columns
df_train.describe()

# Target Exploration

In [None]:
# distribution of the target: histogram (top) and horizontal boxplot (bottom)
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))

# upper panel
ax1.hist(df_train['target'], bins=50)
ax1.set_title('Target')
ax1.grid()

# lower panel
ax2.boxplot(df_train['target'], vert=False)
ax2.set_title('Target - Boxplot')
ax2.grid()

plt.show()

In [None]:
# distribution of the standard error: histogram (top) and horizontal boxplot (bottom)
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))

# upper panel
ax1.hist(df_train['standard_error'], bins=50)
ax1.set_title('Standard Error')
ax1.grid()

# lower panel
ax2.boxplot(df_train['standard_error'], vert=False)
ax2.set_title('Standard Error - Boxplot')
ax2.grid()

plt.show()

#### Not sure what the purpose of the standard error column is here. Maybe it could be used as a weight in training (higher standard error ~ lower weight)?

In [None]:
# standard error (y) plotted against the target (x)
plt.scatter(x=df_train['target'], y=df_train['standard_error'])
plt.xlabel('Target')
plt.ylabel('Standard Error')
plt.title('Standard Error vs Target')
plt.grid()
plt.show()

#### Nice smile :-)

In [None]:
# inspect the "outlier": every row whose standard error is exactly zero
df_train.loc[df_train['standard_error'] == 0]

#### We can probably delete this row.

In [None]:
# remove "outlier" row: keep only rows whose standard error is non-zero
# .copy() makes the filtered frame an independent object, so columns added to
# df_train afterwards are never treated as writes to a slice of the original
# frame (which pandas may flag with SettingWithCopyWarning).
df_train = df_train[df_train.standard_error != 0].copy()

# Feature Engineering

In [None]:
# add a few length-based features derived from the excerpt text
#   n_char        - number of characters in the excerpt
#   n_word        - number of whitespace-separated tokens; the vectorised
#                   .str.len() on the split lists replaces a Python-level lambda
#                   and yields NaN (instead of raising) for a missing excerpt
#   char_per_word - n_char divided by n_word
# assign() returns a new frame, so no column is written into a frame that may
# itself be a filtered slice of an earlier one.
df_train = df_train.assign(
    n_char=df_train['excerpt'].str.len(),
    n_word=df_train['excerpt'].str.split().str.len(),
    char_per_word=lambda frame: frame['n_char'] / frame['n_word'],
)

In [None]:
# histograms (25 bins each) of the three engineered features, one figure per feature

# characters per excerpt
df_train['n_char'].plot.hist(bins=25)
plt.grid()
plt.title('Number of characters')
plt.show()

# words per excerpt
df_train['n_word'].plot.hist(bins=25)
plt.grid()
plt.title('Number of words')
plt.show()

# average word length
df_train['char_per_word'].plot.hist(bins=25)
plt.grid()
plt.title('Characters per word')
plt.show()

# Correlations

In [None]:
# Pearson correlation matrix of the engineered features, the target and its
# standard error, drawn as an annotated heatmap on a fixed [-1, 1] colour scale
corr_pearson = df_train.loc[
    :, ['n_char', 'n_word', 'char_per_word', 'target', 'standard_error']
].corr(method='pearson')

fig = plt.figure(figsize=(6, 5))
sns.heatmap(data=corr_pearson, vmin=-1, vmax=1, cmap='RdYlGn', annot=True)
plt.title('Pearson Correlation')
plt.show()

In [None]:
# joint distribution of target and character count, with a regression fit
sns.jointplot(
    x='n_char',
    y='target',
    data=df_train,
    kind='reg',
    scatter_kws=dict(alpha=0.25),
)
plt.show()

In [None]:
# joint distribution of target and word count, with a regression fit
sns.jointplot(
    x='n_word',
    y='target',
    data=df_train,
    kind='reg',
    scatter_kws=dict(alpha=0.25),
)
plt.show()

In [None]:
# joint distribution of target and characters per word, with a regression fit
sns.jointplot(
    x='char_per_word',
    y='target',
    data=df_train,
    kind='reg',
    scatter_kws=dict(alpha=0.25),
)
plt.show()

### We see a fair amount of correlation between the target and our new features. This could be used to build a first simple baseline model.

# Extreme cases

### Most readable examples:

In [None]:
# the five rows with the highest target value
top5 = df_train.nlargest(n=5, columns='target')
top5

In [None]:
# show full text of every excerpt in top5, each followed by a blank line
# Iterating over the column directly avoids rebuilding the index on every pass
# and does not depend on a hard-coded row count, so the loop also works when
# top5 holds fewer than five rows.
for excerpt in top5['excerpt']:
    print(excerpt)
    print()

### Most difficult examples:

In [None]:
# the five rows with the lowest target value
bot5 = df_train.nsmallest(n=5, columns='target')
bot5

In [None]:
# show full text of every excerpt in bot5, each followed by a blank line
# Iterating over the column directly avoids rebuilding the index on every pass
# and does not depend on a hard-coded row count, so the loop also works when
# bot5 holds fewer than five rows.
for excerpt in bot5['excerpt']:
    print(excerpt)
    print()