<h1> CommonLit Readability Prize EDA </h1>
<br>

In this notebook I perform some exploratory data analysis on the competition data; I'll try to update it often.

<h4 style="background-color:#e6f7ff;" align = 'center'><i>Table of Contents</i></h4>

- [files available and some info](#files)
- [train.csv](#train)
- [test.csv](#test)


<a id = "files"></a>

<h5 style="background-color:#e6f7ff;" align = 'center'><i>Files available and some info</i></h5>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.linear_model import LinearRegression
import spacy
# Load the small English spaCy pipeline once; later cells use it for part-of-speech tagging.
nlp = spacy.load("en_core_web_sm")
import time
import matplotlib
import matplotlib.pyplot as plt
# Default plotting style; several later cells set the style again explicitly.
plt.style.use('ggplot')

import os
# Directory that holds train.csv, test.csv and sample_submission.csv.
root_path = '/kaggle/input/commonlitreadabilityprize/'
# Last expression of the cell: list the files available in the input directory.
os.listdir(root_path)

In [None]:
# Load the three competition files. os.path.join avoids the doubled "/" that
# plain string concatenation produced (root_path already ends with a slash).
train = pd.read_csv(os.path.join(root_path, "train.csv"))
test = pd.read_csv(os.path.join(root_path, "test.csv"))
sample_submission = pd.read_csv(os.path.join(root_path, "sample_submission.csv"))

# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only appended a stray "None" after each summary. Call it directly.
train.info()
print("\n\n")
test.info()
print("\n\n")
sample_submission.info()

In [None]:
# Show one randomly drawn row from each of the three frames.
for frame in (train, test, sample_submission):
    display(frame.sample())

<a id ="train"></a>
<h5 style="background-color:#e6f7ff;" align = 'center'> <i>train.csv</i> </h5>

In [None]:
original_cols = ['id', 'url_legal', 'license', 'excerpt', 'target', 'standard_error']

# Per-column summary of train: number of distinct values and number of non-missing values.
# `axis = 1` is passed by keyword because pandas 2.0 no longer accepts it positionally.
train_stats = (pd.concat([train[original_cols].nunique().rename("distinct_values"),
                          train[original_cols].notna().sum().rename("not_nan_values")],
                         axis = 1)
              .reset_index().rename({'index': 'variable'}, axis = 1))

train_stats['distinct_over_notnan_percentage'] = (train_stats['distinct_values']/train_stats['not_nan_values']).round(2)

fig, ax = plt.subplots(1, 2, figsize = (14, 6), gridspec_kw={'width_ratios': [1.2, 1]})

plt.style.use('fivethirtyeight')

sns.set_context(rc = {'patch.linewidth': 2.0})

# One bar chart per statistic; both panels share the same formatting.
panels = [('distinct_values', 'Distinct values for each variable'),
          ('not_nan_values', 'Not NaN values for each variable')]

for panel_ax, (stat_col, panel_title) in zip(ax, panels):
    sns.barplot(x = 'variable',
                y = stat_col,
                data = train_stats,
                edgecolor = 'black',
                linewidth = 2,
                palette = "pastel",
                ax = panel_ax)

    # Write the value just above each bar (train_stats has a 0..n-1 index,
    # which matches the categorical bar positions).
    for position, value in enumerate(train_stats[stat_col]):
        panel_ax.text(position, value+20, value, color='black', ha="center",
                      fontsize = 15, fontweight = 'bold')

    panel_ax.grid(True)
    # No legend here: the bars carry no labels, so legend() only emitted a warning.
    panel_ax.set_title(panel_title, fontsize = 18, fontweight = 'bold')
    panel_ax.tick_params(axis='both', which='major', labelsize=14)
    panel_ax.tick_params(axis='both', which='minor', labelsize=14)
    panel_ax.set_xlabel('')
    # Each panel restyles its own tick labels (the right panel used to copy the left one's).
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation = 35, fontsize = 13, color = 'black')
    panel_ax.set_ylabel(stat_col, fontsize = 18, color ='black')

plt.subplots_adjust(hspace = 0.3)

# Second figure: the same statistics rendered as a table.
fig,ax = plt.subplots(1, 1, figsize = (20, 8))

bbox=[-0.2, 0, 1.2, 0.9]
ax.axis('off')
ax.title.set_text('')
ccolors = plt.cm.BuPu(np.full(len(train_stats.columns), 0.1))

mpl_table = ax.table(cellText = train_stats.values, bbox=bbox, colLabels=train_stats.columns, colColours=ccolors)
mpl_table.auto_set_font_size(False)
mpl_table.auto_set_column_width(col=list(range(len(train_stats.columns))))
mpl_table.set_fontsize(18)

In [None]:
# Sanity check: no row has a url_legal without also having a license.
url_without_license = train.url_legal.notna() & train.license.isna()
assert not url_without_license.any()

url_legal and license will be missing in the private test set. 

In [None]:
plt.style.use('ggplot')
fig, ax = plt.subplots(2, 1, figsize = (20, 12))

fig.suptitle("Distribution when url_legal is NaN or not", fontsize = 20, fontweight = 'bold')
# The subtitle used to be set with plt.title(), which targets the current axes
# (the bottom panel) and was then overwritten by that panel's own set_title().
# Figure-level text keeps it visible.
fig.text(0.5, 0.93, "in private set we won't have url_legal nor license",
         ha = 'center', fontsize = 12)

# Split train once by whether url_legal is present.
with_url = train.loc[train.url_legal.notna()]
without_url = train.loc[train.url_legal.isna()]

# Top panel: target; bottom panel: standard_error. Both overlay the two groups.
for panel_ax, column in zip(ax, ['target', 'standard_error']):
    sns.histplot(data = with_url, x = column,
                 ax = panel_ax, kde=True, bins = 50,
                 stat = 'density', color = 'red', edgecolor = 'black',
                 alpha = 0.3, label = 'url_legal not NaN',
                 linewidth = 3, line_kws= {'linewidth': 3})

    sns.histplot(data = without_url, x = column,
                 ax = panel_ax, kde=True, bins = 50, stat = 'density',
                 alpha = 0.3,  label = 'url_legal NaN', edgecolor = 'black',
                 linewidth = 3, line_kws= {'linewidth': 3})

    panel_ax.legend(fontsize=18)
    panel_ax.set_xlabel(column, fontsize = 18)
    panel_ax.set_ylabel('Density', fontsize = 18)
    panel_ax.set_title('{} distribution with url_legal NaN or not'.format(column), fontsize = 15)
    panel_ax.tick_params(axis='both', which='major', labelsize=14)
    panel_ax.tick_params(axis='both', which='minor', labelsize=14)

# Zoom the standard_error panel on the bulk of the distribution.
ax[1].set_xlim(0.4, 0.7)

plt.subplots_adjust(hspace = 0.3)

The `target` distribution is shifted slightly to the right when `url_legal` is not NaN.

In [None]:
# Distribution of `target`: histogram + KDE with percentile markers (left) and a violin plot (right).
# (original note, translated: add text with percentiles and a violin plot)

plt.style.use('ggplot')

fig, ax = plt.subplots(1, 2, figsize = (16, 6), gridspec_kw={'width_ratios': [1.5, 0.6]})
hist_ax, violin_ax = ax

percentiles_asked = [0.1, 0.25, 0.5, 0.75, 0.9]
percentiles = train['target'].quantile(percentiles_asked).tolist()

sns.histplot(data = train, x = 'target', ax = hist_ax, kde = False, bins = 50,
             stat = 'density', alpha = 0.5, fill = True, linewidth = 3,
             edgecolor = 'black', color = 'red')

sns.kdeplot(data = train, x = 'target', ax = hist_ax, alpha = 0.01, fill = True,
            linewidth = 5, color = 'blue')

# Dotted vertical line at each requested percentile, labelled with its level.
for level, percentile in zip(percentiles_asked, percentiles):
    hist_ax.axvline(percentile, alpha = 0.35, ymin = 0, ymax = 1, linestyle = ":", color = 'blue')
    hist_ax.text(percentile-0.16, 0.43, "{}".format(level), size = 12, alpha = 1)

mean = train.target.mean().round(2)
median = train.target.median().round(2)
st_dev = train.target.std().round(2)

# Summary statistics written in the upper-left corner of the histogram.
summary_lines = [(0.4, "mean: {}", mean),
                 (0.36, "median: {}", median),
                 (0.32, "std deviation: {}", st_dev)]
for text_y, template, statistic in summary_lines:
    hist_ax.text(-4.4, text_y, template.format(statistic), size = 12, alpha = 1)

#https://stackoverflow.com/questions/49926147/how-to-modify-edge-color-of-violinplot-using-seaborn/55131881
# (reference for changing the colour of the violin's outer line)
sns.violinplot(y='target', data = train, ax=violin_ax, inner = 'quartile',)
for quartile_line in violin_ax.lines:
    quartile_line.set_linestyle('--')
    quartile_line.set_color('yellow')
    quartile_line.set_alpha(0.2)

hist_ax.set_ylabel('Density', fontsize = 15)
violin_ax.set_ylabel('target', fontsize = 15)
hist_ax.set_xlabel('target', fontsize = 15)
hist_ax.set_title('hist-kde plot', fontsize = 16)
violin_ax.set_title('violin plot with quartiles', fontsize = 16)
hist_ax.tick_params(axis='both', which='major', labelsize=14)
hist_ax.tick_params(axis='both', which='minor', labelsize=14)
fig.suptitle('Distribution of variable target', fontsize = 20, fontweight = 'bold')

In [None]:
# Distribution of `standard_error`: histogram + KDE (left) and a violin plot (right).
# (original note, translated: add text with percentiles and a violin plot)
plt.style.use('ggplot')

fig, ax = plt.subplots(1, 2, figsize = (16, 6), gridspec_kw={'width_ratios': [1.5, 0.6]})
hist_ax, violin_ax = ax

percentiles_asked = [0.1, 0.25, 0.5, 0.75, 0.9]
percentiles = train['standard_error'].quantile(percentiles_asked).tolist()
print(percentiles)

sns.histplot(data = train, x = 'standard_error', ax = hist_ax, kde = False, bins = 50,
             stat = 'density', alpha = 0.5, fill = True, linewidth = 3,
             edgecolor = 'black', color = 'red')

sns.kdeplot(data = train, x = 'standard_error', ax = hist_ax, alpha = 0.01, fill = True,
            linewidth = 3, color = 'blue')

# Percentile markers are intentionally not drawn on this plot.

sns.violinplot(y='standard_error', data = train, ax=violin_ax, inner = 'quartile')
for quartile_line in violin_ax.lines:
    quartile_line.set_linestyle('--')
    quartile_line.set_color('yellow')
    quartile_line.set_alpha(0.3)

mean = train.standard_error.mean().round(2)
median = train.standard_error.median().round(2)
st_dev = train.standard_error.std().round(2)

# Summary statistics written at the left edge of the histogram.
summary_lines = [(16, "mean: {}", mean),
                 (14, "median: {}", median),
                 (12, "std deviation: {}", st_dev)]
for text_y, template, statistic in summary_lines:
    hist_ax.text(0.0, text_y, template.format(statistic), size = 12, alpha = 1)

hist_ax.set_ylabel('Density', fontsize = 15)
violin_ax.set_ylabel('standard_error', fontsize = 15)
hist_ax.set_xlabel('standard_error', fontsize = 15)
hist_ax.set_title('hist-kde plot', fontsize = 16)
violin_ax.set_title('violin plot with quartiles', fontsize = 16)
hist_ax.tick_params(axis='both', which='major', labelsize=14)
hist_ax.tick_params(axis='both', which='minor', labelsize=14)
hist_ax.set_ylim(0, 17)
fig.suptitle('Distribution of variable standard_error', fontsize = 20, fontweight = 'bold')
plt.subplots_adjust(hspace = 0.6)

# Point an arrow at the isolated observation at standard_error = 0.
hist_ax.annotate('crazy outlier in standard_error', xy=(0.0, 0),  xycoords='data',
                 xytext=(0.4, 0.2), textcoords='axes fraction', fontsize = 14,
                 arrowprops=dict(facecolor='black', shrink=0.05),
                 horizontalalignment='right', verticalalignment='top',
                 )

In [None]:
from sklearn.preprocessing import StandardScaler

# Compare target and standard_error on a common scale by standardizing each one.
st_sc = StandardScaler()

plt.style.use('ggplot')

fig, ax = plt.subplots(1, 1, figsize = (16, 6))

# (column, legend label, colour). The scaler is refitted for each column in turn,
# so it ends up fitted on the last column listed.
kde_curves = [('standard_error', 'standard_error_normalized', 'blue'),
              ('target', 'target_normalized', 'red')]

for column, legend_label, curve_color in kde_curves:
    standardized = st_sc.fit_transform(train[[column]])[:, 0]
    sns.kdeplot(x = standardized,
                ax = ax, alpha = 0.25, fill = True, label = legend_label,
                linewidth = 3, color = curve_color)

ax.legend(fontsize = 20, loc = 'upper left')

# Arrow towards the far-left tail of the standardized standard_error curve.
ax.annotate('crazy outlier in standard_error', xy=(-14, 0),  xycoords='data',
            xytext=(0.3, 0.1), textcoords='axes fraction', fontsize = 14,
            arrowprops=dict(facecolor='black', shrink=0.05),
            horizontalalignment='right', verticalalignment='top',
            )

ax.set_ylabel('Density', fontsize = 15)
fig.suptitle('Distribution of target and standard_error after normalization',
             fontsize = 20, fontweight = 'bold')

plt.subplots_adjust(hspace = 0.6)

In [None]:
# Scatter plots of target against standard_error: raw values (top) and standardized values (bottom).
# Relies on the StandardScaler instance `st_sc` created in an earlier cell.
fig, ax = plt.subplots(2, 1, figsize = (20, 14))
raw_ax, scaled_ax = ax

sns.scatterplot(y = 'target', x = 'standard_error', data = train,
                ax = raw_ax, color = 'blue', sizes = [5], alpha = 0.3)

raw_ax.set_title("target vs standard_error")
raw_ax.set_ylabel('target', fontsize = 15)
raw_ax.set_xlabel('standard_error', fontsize = 15)
raw_ax.tick_params(axis='both', which='major', labelsize=14)
raw_ax.tick_params(axis='both', which='minor', labelsize=14)

fig.suptitle("target vs standard_error", fontsize = 20, fontweight = 'bold')

raw_ax.set_ylim(-4, 2)
raw_ax.set_xlim(0, 1)

# Arrow towards the point at the origin of the raw plot.
raw_ax.annotate('crazy outlier in standard_error', xy=(0, 0),  xycoords='data',
                xytext=(0.4, 0.95), textcoords='axes fraction',fontsize = 14,
                arrowprops=dict(facecolor='black', shrink=0.05),
                horizontalalignment='right', verticalalignment='top',
                )

# Standardize both variables (standard_error first, then target) for the second panel.
standard_error_scaled = st_sc.fit_transform(train[['standard_error']])[:, 0]
target_scaled = st_sc.fit_transform(train[['target']])[:, 0]

sns.scatterplot(y = target_scaled, x = standard_error_scaled,
                ax = scaled_ax, color = 'blue', sizes = [5], alpha = 0.3)

scaled_ax.set_title("target vs standard_error after normalization")
scaled_ax.set_ylabel('target', fontsize = 15)
scaled_ax.set_xlabel('standard_error', fontsize = 15)
scaled_ax.tick_params(axis='both', which='major', labelsize=14)
scaled_ax.tick_params(axis='both', which='minor', labelsize=14)

**Higher standard error for more extreme values of target**

In [None]:
# Linear fit of target on the number of "letters" (characters in [a-zA-Z0-9-]) per excerpt.
# LinearRegression is imported in the first cell of the notebook.

# Compute the feature once so that the model and the plots use the same values.
# regex=True is required: without it pandas >= 2.0 treats the pattern as a
# literal string, nothing is stripped, and the model would be fitted on the raw
# string length rather than on the feature that is plotted.
n_letters = train['excerpt'].str.replace("[^a-zA-Z0-9-]", "", regex = True).str.len()

lr = LinearRegression(fit_intercept = True)

lr.fit(X = n_letters.to_frame(), y = train['target'])
fitted = lr.predict(X = n_letters.to_frame())

fig, ax = plt.subplots(1, 1, figsize = (16, 8))
sns.scatterplot(y = train['target'], x = n_letters,
                ax = ax, color= 'blue', sizes= [5], alpha= 0.3)

# Overlay the fitted regression line on the scatter.
sns.lineplot(x = n_letters,
             y = fitted, ax = ax, linewidth = 5, color = 'red')

ax.set_ylabel('target', fontsize = 15)
ax.set_xlabel('number of letters in excerpt', fontsize = 15)
ax.set_title('Negative correlation between number of letters in excerpt and target', fontsize = 13)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)

fig.suptitle("Number of letters in excerpt vs target (fitted with Sklearn)", fontsize = 20, fontweight = 'bold')


In [None]:
# 3 x 2 grid of regression plots: rows are excerpt-size features (letters, words,
# sentences), columns are the response (target, standard_error).
fig, axes = plt.subplots(3, 2, figsize = (16, 16))

ax = axes.ravel()

# regex=True is required for the letter count: without it pandas >= 2.0 treats
# the pattern as a literal string and the "letter" count is the raw string length.
excerpt_features = [
    ('letters', train['excerpt'].str.replace("[^a-zA-Z0-9-]", "", regex = True).str.len()),
    ('words', train['excerpt'].str.split().str.len()),
    ('sentences', train['excerpt'].str.split(pat='[.!?]+').str.len()),
]

scatter_style = {'color': 'blue', 'sizes': [5], 'alpha': 0.3}
line_style = {'color': 'red', 'linewidth': 3, 'alpha': 0.5}

for row_idx, (unit, feature_values) in enumerate(excerpt_features):
    for col_idx, response in enumerate(['target', 'standard_error']):
        panel_ax = axes[row_idx, col_idx]

        # Copies of the style dicts are passed so the panels never share state.
        sns.regplot(y = train[response], x = feature_values, ax = panel_ax,
                    scatter_kws = dict(scatter_style), line_kws = dict(line_style))

        panel_ax.set_ylabel(response, fontsize = 15)
        panel_ax.set_xlabel('Number of {} in Excerpt'.format(unit), fontsize = 15)
        panel_ax.set_title('Number of {} in excerpt vs {}'.format(unit, response), fontsize = 16)
        panel_ax.tick_params(axis='both', which='major', labelsize=14)
        panel_ax.tick_params(axis='both', which='minor', labelsize=14)

        # Zoom the standard_error panels on the bulk of the values.
        if response == 'standard_error':
            panel_ax.set_ylim(0.425, 0.65)

plt.subplots_adjust(hspace = 0.3)

fig.suptitle("Number of letters/words/sentences in excerpt vs target/standard_error (using sns.regplot)",
             fontsize = 20, fontweight = 'bold')

In [None]:
# Correlation between target / standard_error and three excerpt-size features.
# regex=True is required for the letter count: without it pandas >= 2.0 treats
# the pattern as a literal string and nothing is stripped.
corr_df = train[['target', 'standard_error']].assign(
    n_letters_excerpt = train['excerpt'].str.replace("[^a-zA-Z0-9-]", "", regex = True).str.len(),
    n_words_excerpt = train['excerpt'].str.split().str.len(),
    n_sentences_excerpt = train['excerpt'].str.split(pat = '[.!?]+').str.len(),
)

corr_matrix = round(corr_df.corr(), 2)
# Hide the upper triangle (including the diagonal) of the symmetric matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(1, 1, figsize = (10, 10))
# Discrete colour map: 21 'rocket' colours over 20 equal bins in [-1, 1] plus one overflow colour.
colors = sns.color_palette('rocket', 21)
levels = np.linspace(-1, 1, 21)
cmap_plot, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")
sns.heatmap(corr_matrix, mask=mask, annot=True, ax = ax,
            cmap = cmap_plot, norm = norm, annot_kws={"size": 15, "color": 'black'})

# Black grid lines between the cells.
grid_lines = list(range(len(corr_matrix)))
ax.hlines(grid_lines, *ax.get_xlim(), color = 'black')
ax.vlines(grid_lines, *ax.get_ylim(), color = 'black')
ax.xaxis.set_ticks_position('bottom')
# The axes title used to be set twice: first to a copy-pasted, unrelated
# 'Distinct values for each variable' and then overwritten through plt.title().
# Only the text that was actually displayed is kept.
ax.set_title("Just to sum up: as seen before there's a negative correlation between number of letters and target", fontsize = 12)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 35, fontsize = 15, color = 'black')
ax.set_yticklabels(ax.get_yticklabels(), rotation = 35, fontsize = 15, color = 'black')
ax.xaxis.label.set_size(14)

# White circle around the (n_letters_excerpt, target) cell.
circle_rad = 25  # This is the radius, in points
ax.plot(0.5, 2.5, 'o',
        ms=circle_rad * 2, mec='w', mfc='none', mew=4)

fig.suptitle('Correlation Matrix for {}'.format('train.csv'),
             fontsize = 20, color = 'black', fontweight = 'bold')
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()
    

In [None]:
# Add two ratio features to corr_df (built in the previous cell) and redraw the correlation matrix.
corr_df['average_word_length'] = (corr_df['n_letters_excerpt']/corr_df['n_words_excerpt']).round(3)
# Average sentence length is words per sentence; the ratio used to be inverted
# (sentences per word). str.split always yields at least one piece, so the
# denominator is never zero.
corr_df['average_sentence_length'] = (corr_df['n_words_excerpt']/corr_df['n_sentences_excerpt']).round(3)

corr_matrix = round(corr_df.corr(), 2)
# Hide the upper triangle (including the diagonal) of the symmetric matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(1, 1, figsize = (8, 8))
colors = sns.color_palette('rocket', 21)
levels = np.linspace(-1, 1, 21)
cmap_plot, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")
sns.heatmap(corr_matrix, mask=mask, annot=True, ax = ax,
            cmap = cmap_plot, norm = norm, annot_kws={"size": 15, "color": 'black'})

# Black grid lines between the cells.
grid_lines = list(range(len(corr_matrix)))
ax.hlines(grid_lines, *ax.get_xlim(), color = 'black')
ax.vlines(grid_lines, *ax.get_ylim(), color = 'black')
ax.xaxis.set_ticks_position('bottom')
# The axes title used to be set twice: first to a copy-pasted, unrelated
# 'Distinct values for each variable' and then overwritten through plt.title().
# Only the text that was actually displayed is kept.
ax.set_title("An even more negative correlation between average_length and target", fontsize = 12)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, fontsize = 12, color = 'black')
ax.set_yticklabels(ax.get_yticklabels(), rotation = 35, fontsize = 15, color = 'black')
ax.xaxis.label.set_size(14)

# White circle around the (average_word_length, target) cell.
circle_rad = 25  # This is the radius, in points
ax.plot(0.5, 5.5, 'o',
        ms=circle_rad * 2, mec='w', mfc='none', mew=4)

fig.suptitle('Correlation Matrix for {}, after creating average_word_length'.format('train.csv'),
             fontsize = 20, color = 'black', fontweight = 'bold')
# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()
    

I expect excerpts to be scored as more difficult when they contain a higher number of rare words...

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# binary = True records presence/absence of each word per excerpt instead of raw counts.
cvect = CountVectorizer(binary = True)
bag_of_words = cvect.fit_transform(train['excerpt'])

# One column per distinct word in the fitted vocabulary.
vocabulary_size = bag_of_words.shape[1]
print("Total Number of words (no stemming nor lemmatization): {}".format(vocabulary_size))

In [None]:
# Column sums of the binary bag of words: the number of excerpts containing each word.
counts = bag_of_words.sum(axis = 0)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older releases.
if hasattr(cvect, 'get_feature_names_out'):
    feature_names = cvect.get_feature_names_out()
else:
    feature_names = cvect.get_feature_names()

df_counts = pd.DataFrame({'counts': np.squeeze(np.asarray(counts)), 'word': feature_names})
# Only the displayed copy is sorted; df_counts itself keeps the vectorizer's feature order.
display(df_counts.sort_values('counts', ascending = False, ignore_index = True))

I select the 10% rarest words, i.e. the words that appear in the fewest excerpts.

In [None]:
# Rank the words from most to least frequent and keep the bottom 10%.
# The sort has to happen here: df_counts is in the vectorizer's feature order,
# not in frequency order, so slicing its tail directly did not select rare words.
# mergesort is stable, which makes the selection among tied counts reproducible.
ranked_words = df_counts.sort_values('counts', ascending = False, ignore_index = True, kind = 'mergesort')
most_rare = ranked_words.iloc[int(0.9*len(ranked_words)):]['word'].tolist()
print("Some 'rare' words: \n")
np.random.choice(most_rare, 5).tolist()

In [None]:
# Helper that counts how many words of an excerpt belong to a given collection of rare words.

def how_many_rare(excerpt, most_rare, list_like = True):
    """Count the whitespace-separated tokens of `excerpt` that are in `most_rare`.

    Parameters
    ----------
    excerpt : str
        Text to inspect; it is split on whitespace only.
    most_rare : iterable of str
        Words considered rare. A set/frozenset is used as is; any other
        iterable is converted to a set once per call.
    list_like : bool, default True
        If True, every occurrence is counted (a rare word appearing twice
        counts twice). If False, each distinct rare word counts once.

    Returns
    -------
    int
        Number of matching tokens (or of distinct matching tokens).

    Notes
    -----
    Matching is exact: tokens are neither lower-cased nor stripped of
    punctuation, so "Word," does not match "word".
    NOTE(review): if `most_rare` comes from a lower-casing tokenizer, many
    tokens will therefore never match - confirm whether that is intended.
    """
    # Set membership is O(1); testing against a list made each lookup scan the
    # whole collection, which is what made this helper slow.
    if isinstance(most_rare, (set, frozenset)):
        rare_lookup = most_rare
    else:
        rare_lookup = set(most_rare)

    tokens = excerpt.split()
    if list_like:
        return sum(1 for token in tokens if token in rare_lookup)
    return len(rare_lookup.intersection(tokens))

# Number of rare words per excerpt, and the same number relative to the excerpt's word count.
train['rare_words'] = train['excerpt'].apply(lambda text: how_many_rare(text, most_rare))
train['rare_words_over_nwords'] = train['rare_words']/train['excerpt'].str.split().apply(len)


fig, ax = plt.subplots(2, 1, figsize = (12, 12))

# Top panel: target; bottom panel: standard_error - both against the rare-word count.
panels = [('target', 'Number of rare words in excerpt vs target'),
          ('standard_error', 'Number of rare words in excerpt vs standard_error')]

for panel_ax, (response, panel_title) in zip(ax, panels):
    sns.regplot(y = train[response], x = train['rare_words'], ax = panel_ax,
                scatter_kws = {'color': 'blue', 'sizes': [5], 'alpha': 0.3},
                line_kws = {'color': 'red', 'linewidth': 3, 'alpha': 0.5})

    panel_ax.set_ylabel(response, fontsize = 15)
    panel_ax.set_xlabel('', fontsize = 15)
    panel_ax.set_title(panel_title, fontsize = 16)
    panel_ax.tick_params(axis='both', which='major', labelsize=14)
    panel_ax.tick_params(axis='both', which='minor', labelsize=14)

# Zoom the standard_error panel on the bulk of the values.
ax[1].set_ylim(0.4, 0.7)

fig.suptitle("Number of 'rare words' vs target", fontsize = 20, fontweight = 'bold')

<a id = "test"></a>

<h5 style="background-color:#e6f7ff;" align = 'center'> <i>test.csv</i> </h5>

In [None]:
# Display the test frame.
test

In [None]:
# Excerpt-size features for the test set: letters, words and sentences.
# regex=True is required for the letter count: without it pandas >= 2.0 treats
# the pattern as a literal string and the result is the raw string length.
test['n_letters_excerpt'] = test['excerpt'].str.replace("[^a-zA-Z0-9-]", "", regex = True).str.len()
test['n_words_excerpt'] = test['excerpt'].str.split().str.len()
test['n_sentences_excerpt'] = test['excerpt'].str.split(pat = '[.!?]+').str.len()


In [None]:
# Display the test frame again (the preceding cell adds the n_*_excerpt columns).
test

In [None]:
# Train distribution of n_letters_excerpt (histogram + KDE, taken from corr_df) with every
# test excerpt overlaid as a dashed vertical line labelled with its id.
plt.style.use('ggplot')

fig, ax = plt.subplots(1, 1, figsize = (16, 6))

percentiles_asked = [0.1, 0.25, 0.5, 0.75, 0.9]
percentiles = corr_df['n_letters_excerpt'].quantile(percentiles_asked).tolist()

sns.histplot(data = corr_df, x = 'n_letters_excerpt', ax = ax, kde = False, bins = 50,
             stat = 'density', alpha = 0.2, fill = True, linewidth = 3,
             edgecolor = 'black', color = 'red')

sns.kdeplot(data = corr_df, x = 'n_letters_excerpt', ax = ax, alpha = 0.01, fill = True,
            linewidth = 3, color = 'blue')

# Hand-tuned (x shift, y position) of the id label, keyed by the rank of the
# test row once sorted by n_letters_excerpt; ranks above 4 use the default.
label_placement = {0: (-40, 0.0047),
                   1: (-40, 0.0047),
                   2: (-40, 0.0046),
                   3: (-30, 0.00475),
                   4: (-30, 0.0046)}
default_placement = (0, 0.0047)

test_by_letters = test.sort_values('n_letters_excerpt', ignore_index=True)
for idx, row in test_by_letters.iterrows():
    n_letters = row['n_letters_excerpt']

    ax.vlines(x = n_letters, ymin = 0, ymax = 0.0045, colors = 'red', linewidth = 2,
              label = row.id, linestyles = 'dashed')

    x_shift, label_y = label_placement.get(idx, default_placement)
    ax.text(x = n_letters + x_shift, y = label_y, s=row.id, fontsize = 9)


ax.set_ylabel('Density', fontsize = 15)
ax.set_xlabel('n_letters_excerpt', fontsize = 15)
ax.set_ylim(0.0, 0.005)
ax.set_title('hist-kde plot', fontsize = 16)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)

fig.suptitle('Distribution of number of letters in excerpt: train vs test with corresponding ids', fontsize = 20, fontweight = 'bold')
plt.subplots_adjust(hspace = 0.6)



In [None]:
# Train distribution of n_words_excerpt (histogram + KDE, taken from corr_df) with every
# test excerpt overlaid as a dashed vertical line labelled with its id.
plt.style.use('ggplot')

fig, ax = plt.subplots(1, 1, figsize = (16, 6))

percentiles_asked = [0.1, 0.25, 0.5, 0.75, 0.9]
percentiles = corr_df['n_words_excerpt'].quantile(percentiles_asked).tolist()

sns.histplot(data = corr_df, x = 'n_words_excerpt', ax = ax, kde = False, bins = 25,
             stat = 'density', alpha = 0.2, fill = True, linewidth = 3,
             edgecolor = 'black', color = 'red')

sns.kdeplot(data = corr_df, x = 'n_words_excerpt', ax = ax, alpha = 0.01, fill = True,
            linewidth = 3, color = 'blue')

# Hand-tuned (x shift, y position) of the id label, keyed by the rank of the
# test row once sorted by n_words_excerpt; rows ranked above 6 get no label.
label_placement = {0: (-5, 0.026),
                   1: (-2, 0.028),
                   2: (-3, 0.026),
                   3: (-3, 0.026),
                   4: (-3, 0.026),
                   5: (-3, 0.026),
                   6: (-1, 0.028)}

test_by_words = test.sort_values('n_words_excerpt', ignore_index=True)
for idx, row in test_by_words.iterrows():
    n_words = row['n_words_excerpt']

    ax.vlines(x = n_words, ymin = 0, ymax = 0.025, colors = 'red', linewidth = 2,
              label = row.id, linestyles = 'dashed')

    if idx in label_placement:
        x_shift, label_y = label_placement[idx]
        ax.text(x = n_words + x_shift, y = label_y, s=row.id, fontsize = 9)

ax.set_ylabel('Density', fontsize = 15)
ax.set_xlabel('n_words_excerpt', fontsize = 15)
ax.set_ylim(0.0, 0.03)
ax.set_title('hist-kde plot', fontsize = 16)
ax.tick_params(axis='both', which='major', labelsize=14)
ax.tick_params(axis='both', which='minor', labelsize=14)

fig.suptitle('Distribution of number of words in excerpt: train vs test with corresponding ids', fontsize = 20, fontweight = 'bold')
plt.subplots_adjust(hspace = 0.6)



In [None]:
# Split every excerpt into pieces at runs of '.', '!' or '?' (one list of pieces per row).
sentence_lists = train['excerpt'].str.split(pat = '[.!?]+')
train['sentence'] = sentence_lists

In [None]:
def tokenize_sentences(x: list):
    """Return the part-of-speech tags of all tokens in a list of sentences.

    Each sentence is run through the spaCy pipeline `nlp` loaded at the top of
    the notebook, and the coarse tag (`token.pos_`) of every token is collected.

    An earlier NLTK-based definition with the same name used to precede this
    one. It was dead code: this definition replaced it as soon as the cell
    ran, and it referenced `nltk`, which the notebook never imports.

    Parameters
    ----------
    x : list of str
        Sentences to tag.

    Returns
    -------
    list of str
        The tags of all tokens, flattened across sentences in input order.
    """
    tags = []
    for sentence in x:
        tags.extend(token.pos_ for token in nlp(sentence))
    return tags

In [None]:
# Tag every excerpt and report how long the tagging took (in seconds).
start_time = time.time()
train['pos_tag'] = train['sentence'].apply(tokenize_sentences)
elapsed_seconds = time.time()-start_time
print("Time Elapsed: {}".format(elapsed_seconds))

In [None]:
# Share of tokens tagged 'VERB' in each excerpt (despite the column name this
# is a fraction, not a count). An empty tag list yields NaN instead of raising
# ZeroDivisionError.
train['number_of_verbs'] = train['pos_tag'].apply(
    lambda tags: tags.count('VERB')/len(tags) if len(tags) > 0 else np.nan)

In [None]:
# Share of verbs vs target (left) next to the matching correlation matrix (right).

fig, ax = plt.subplots(1, 2, figsize = (16, 12), gridspec_kw={'width_ratios': [2, 1]})
sns.regplot(y = train['target'], x = train['number_of_verbs'],
            ax = ax[0], scatter_kws = {'color': 'blue', 'sizes': [5], 'alpha': 0.3},
            line_kws = {'color': 'red', 'linewidth': 3, 'alpha': 0.5})

ax[0].set_ylabel('target', fontsize = 15)
ax[0].set_xlabel('', fontsize = 15)
ax[0].set_title('Percentage of verbs in excerpt vs target', fontsize = 16)
ax[0].tick_params(axis='both', which='major', labelsize=14)
ax[0].tick_params(axis='both', which='minor', labelsize=14)

fig.suptitle("Percentage of verbs vs target/standard_error", fontsize = 20, fontweight = 'bold')
# The takeaway used to be set with plt.title(), which targets the current axes
# (the right-hand panel) and was then overwritten by that panel's set_title().
# Figure-level text keeps it visible.
fig.text(0.5, 0.93, "More verbs seem to make the comprehension easier",
         ha = 'center', fontsize = 12, fontweight = 'bold')

corr_df = train[['target', 'standard_error', 'number_of_verbs']].copy()

corr_matrix = round(corr_df.corr(), 2)
# Hide the upper triangle (including the diagonal) of the symmetric matrix.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

colors = sns.color_palette('rocket', 21)
levels = np.linspace(-1, 1, 21)
cmap_plot, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")
sns.heatmap(corr_matrix, mask=mask, annot=True, ax = ax[1],
            cmap = cmap_plot, norm = norm, annot_kws={"size": 15, "color": 'black'})

# Grid lines sized to the matrix instead of a hard-coded 0..5 range.
grid_lines = list(range(len(corr_matrix) + 1))
ax[1].hlines(grid_lines, *ax[1].get_xlim(), color = 'black')
ax[1].vlines(grid_lines, *ax[1].get_ylim(), color = 'black')
ax[1].xaxis.set_ticks_position('bottom')
ax[1].set_title('Correlation matrix', fontsize = 20)
ax[1].tick_params(axis='both', which='major', labelsize=14)
ax[1].tick_params(axis='both', which='minor', labelsize=14)
ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation = 35, fontsize = 15, color = 'black')
ax[1].set_yticklabels(ax[1].get_yticklabels(), rotation = 35, fontsize = 15, color = 'black')
ax[1].xaxis.label.set_size(14)

# White circle around the (number_of_verbs, target) cell.
circle_rad = 25  # This is the radius, in points
ax[1].plot(0.5, 2.5, 'o',
        ms=circle_rad * 2, mec='w', mfc='none', mew=4)


In [None]:
# Excerpt-size features for the train set: letters, words and sentences.
# regex=True is required for the letter count: without it pandas >= 2.0 treats
# the pattern as a literal string and the result is the raw string length.
train['n_letters_excerpt'] = train['excerpt'].str.replace("[^a-zA-Z0-9-]", "", regex = True).str.len()
train['n_words_excerpt'] = train['excerpt'].str.split().str.len()
train['n_sentences_excerpt'] = train['excerpt'].str.split(pat = '[.!?]+').str.len()

In [None]:
# IPython help lookup for px.scatter_3d.
# NOTE(review): `px` is only imported in the following cell, so on a fresh
# top-to-bottom run this lookup cannot resolve the name - consider removing this cell.
?px.scatter_3d

In [None]:
import plotly.express as px

# Interactive 3D scatter: number of words, share of verbs and target, coloured by target.
# The constant `size` column is passed as the marker-size variable.
scatter_data = train.assign(size = 2)
fig = px.scatter_3d(scatter_data,
                    x = 'n_words_excerpt',
                    y = 'number_of_verbs',
                    z = 'target',
                    size = 'size',
                    color = 'target',
                    opacity = 0.7)
# Remove the figure margins.
fig.update_layout(margin = {'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()