In [None]:
# Visualization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing
import string
from nltk.corpus import stopwords

import numpy as np

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the sample submission file and display it.
sample_submission = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')
sample_submission

In [None]:
# Read the training split, then report its shape and per-column info before previewing it.
df_train = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
print(f'Train shape: {df_train.shape}\n', 'Train info:', sep='\n')
df_train.info()
print()
df_train.head()

In [None]:
# Read the test split, then report its shape and per-column info before displaying it.
df_test = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
print(f'Test shape: {df_test.shape}\n', 'Test info:', sep='\n')
df_test.info()
print()
df_test

In [None]:
# Summary statistics for all columns of the training set (include='all' covers non-numeric columns too).
df_train.describe(include='all')

In [None]:
# Summary statistics for all columns of the test set (include='all' covers non-numeric columns too).
df_test.describe(include='all')

Let's check the cardinality (number of distinct values) of each feature.

In [None]:
# Number of distinct non-null values per column, first for train and then for test.
for feature, cardinality in df_train.nunique().items():
    print(f"Cardinality of {feature.upper()} in train dataset: {cardinality}")
print()
for feature, cardinality in df_test.nunique().items():
    print(f"Cardinality of {feature.upper()} in test dataset: {cardinality}")

Let's impute the missing values in `url_legal` and `license` with a `'Missing'` placeholder.

In [None]:
# Replace missing licensing metadata with an explicit 'Missing' category in both splits.
for frame in (df_train, df_test):
    for column in ('url_legal', 'license'):
        frame[column] = frame[column].fillna('Missing')

In [None]:
# Target values in ascending order (the default for sort_values).
df_train['target'].sort_values()

The value of the target feature corresponds to the complexity of the text. A larger value means the text is easier: the text with target value -3.676268 is the most complex, and the text with target value 1.711390 is the easiest.

Let's check this

In [None]:
# Show the first 300 characters of each text cell (use None instead of 300 to see the whole text).
pd.options.display.max_colwidth = 300

In [None]:
# Excerpts with the five lowest target values.
df_train.sort_values(by='target').loc[:, 'excerpt'].head(5)

In [None]:
# Excerpts with the five highest target values.
df_train.sort_values(by='target').loc[:, 'excerpt'].tail(5)

As you can see, the assumption is confirmed.

In [None]:
# Shrink the displayed text-cell width back to 50 characters.
pd.options.display.max_colwidth = 50

# Feature engineering

In [None]:
# Work on copies so the loaded frames stay untouched by feature engineering.
df_train_new, df_test_new = df_train.copy(), df_test.copy()

In [None]:
# Reorder the working copy by descending target value.
df_train_new = df_train_new.sort_values('target', ascending=False)
df_train_new

By creating new features we can improve our understanding of the independent features.

In [None]:
# Built once: stopwords.words('english') re-reads the corpus on every call, and the
# original evaluated it for every single token. A set also gives O(1) membership tests.
english_stopwords = set(stopwords.words('english'))

# I guess that in texts for elementary school number of pronouns is more because sentences are easier
my_stopwords = ['i', 'me', 'my', 'mine', 'you', 'your', 'yours', 'he', 'him', 'his', 'she', 'her', 'it', 'its', 'we', 'our', 'they', 'their']
pronoun_set = set(my_stopwords)


def mean_word_length(text):
    """Return the average number of characters per whitespace-separated token.

    Returns 0.0 for a text without tokens so that np.mean is never called on an
    empty list (which would yield NaN and a RuntimeWarning).
    """
    words = text.split()
    if not words:
        return 0.0
    return float(np.mean([len(word) for word in words]))


def add_meta_features(df):
    """Add text meta-features computed from ``df['excerpt']`` to ``df`` and return it.

    Columns are assigned in place, so the same code path produces identically
    named columns for the train and the test frame. Tokens are obtained by
    splitting on whitespace, so punctuation stays attached to the neighbouring word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``excerpt`` column; values are converted with ``str``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the meta-feature columns added or overwritten.
    """
    text = df['excerpt'].apply(str)
    tokens = text.apply(lambda s: s.split())
    lower_tokens = text.apply(lambda s: s.lower().split())

    df['word_count'] = tokens.apply(len)
    df['unique_word_count'] = tokens.apply(lambda ws: len(set(ws)))
    df['stop_words_count'] = lower_tokens.apply(lambda ws: sum(w in english_stopwords for w in ws))
    # Previously np.mean(len(tokens)), which simply duplicated word_count.
    df['mean_word_length'] = text.apply(mean_word_length)
    df['char_count'] = text.apply(len)
    df['punctuation_count'] = text.apply(lambda s: sum(c in string.punctuation for c in s))

    # Individual punctuation marks can also be important
    df['question_mark_count'] = text.apply(lambda s: s.count('?'))
    df['exclamation_mark_count'] = text.apply(lambda s: s.count('!'))
    df['comma_mark_count'] = text.apply(lambda s: s.count(','))
    # Same column name for both frames (the test frame used to get 'pointCount').
    df['point_count'] = text.apply(lambda s: s.count('.'))
    # Count the three-character substring; comparing single characters to '...'
    # can never match and always produced 0.
    df['ellipsis_count'] = text.apply(lambda s: s.count('...'))

    df['pronoun_count'] = lower_tokens.apply(lambda ws: sum(w in pronoun_set for w in ws))
    return df


df_train_new = add_meta_features(df_train_new)
df_test_new = add_meta_features(df_test_new)

In [None]:
# Histogram of the target values, drawn through the explicit Axes interface.
bins = 100
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df_train_new['target'], bins, alpha=0.5, label='target')
ax.set_title('Target distribution')
ax.legend(loc='upper right')
plt.show()

The target feature looks approximately normally distributed.

In [None]:
meta_features = ['standard_error', 'word_count', 'unique_word_count', 'stop_words_count', 'mean_word_length', 'char_count', 'punctuation_count', 'question_mark_count',
                'exclamation_mark_count', 'comma_mark_count', 'point_count', 'ellipsis_count', 'pronoun_count']

# One row per feature: its distribution on the left, its relationship with the target on the right.
fig, axs = plt.subplots(ncols=2, nrows=len(meta_features), figsize=(20, 50), dpi=100)

for i, feature in enumerate(meta_features):
    dist_ax, rel_ax = axs[i]
    sns.histplot(df_train_new[feature], label=f'{feature} distribution in Training dataset', ax=dist_ax, kde=True)
    sns.regplot(data=df_train_new, x='target', y=feature, ax=rel_ax)

    for ax in (dist_ax, rel_ax):
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)

    # The histogram's title already names the feature, so its x label is redundant.
    # The regression panel keeps its 'target' x label so the axes stay readable.
    dist_ax.set_xlabel('')
    # Only the histogram carries a labelled artist; calling legend() on the
    # regression axes just triggers a "no labelled artists" warning per row.
    dist_ax.legend()

    dist_ax.set_title(f'{feature} distribution in Training dataset', fontsize=13)
    # This panel shows the feature against the target, not the target distribution.
    rel_ax.set_title(f'{feature} vs target', fontsize=13)

plt.show()

As seen, word_count, unique_word_count, stop_words_count, mean_word_length, char_count and comma_mark_count decrease as the target feature increases, but **word_count, unique_word_count and char_count decrease** more than the others. This is logical, because the vocabulary in primary school is small and the words are simple, not very long, and often repeated.
Features such as **point_count and pronoun_count** increase as the target feature increases. That's because in primary school sentences are short -> more sentences -> more periods. As I suggested, pronoun_count is higher in primary school than in high school, because when a text consists of short sentences the number of nouns is small and the author has to replace nouns with pronouns.
**standard_error** is higher for primary and high school.
All other features (those I haven't highlighted in bold) are not representative.