In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
from sklearn.model_selection import GridSearchCV
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.svm import SVC
import category_encoders as ce
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the Pyphen and textstat wheels from the attached "textstat-070" dataset.
# --no-index tells pip not to consult a package index, so only these local wheel
# files are used. Pyphen is installed first (presumably because textstat depends
# on it - confirm).
!pip install --no-index ../input/textstat-070/wheelhouse/Pyphen-0.10.0-py3-none-any.whl
!pip install --no-index ../input/textstat-070/wheelhouse/textstat-0.7.0-py3-none-any.whl

In [None]:
import textstat

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_df, test_df = map(pd.read_csv, (
    '../input/commonlitreadabilityprize/train.csv',
    '../input/commonlitreadabilityprize/test.csv',
))

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Inspect the single excerpt with this id (the same id is excluded from the modelling data later on).
train_df[train_df['id'].eq('436ce79fe')]


In [None]:
# Joint distribution of target and standard_error; rows whose standard_error
# is exactly 0 are left out of the plot.
sns.jointplot(
    x='target',
    y='standard_error',
    data=train_df.query('standard_error != 0'),
    color='green',
    height=8,
)
plt.suptitle("Target vs Standard error ", size=15)
# Leave room at the top of the figure for the title.
plt.subplots_adjust(top=0.95)
plt.show()

* Passages of medium difficulty have a lower spread (standard error) than passages at either extreme of difficulty or ease.

* This can be dissected further by counting the number of excerpts in various intervals (bins) of the standard error, which we can check with a histogram. If the EDA gives conclusive results, we can use the standard error as a categorical feature.


In [None]:
# Histogram (with KDE overlay) of standard_error, using 0.05-wide bins.
sns.histplot(
    train_df,
    x="standard_error",
    kde=True,
    binwidth=0.05,
    color="lime",
)

In [None]:
def apply_spread_to_categorical(row):
    """Bucket a row's ``standard_error`` into a coarse spread category.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must support ``row['standard_error']``.

    Returns
    -------
    str
        ``'low'`` for values up to 0.45, ``'medium'`` for (0.45, 0.50],
        ``'high'`` for (0.50, 0.55] and ``'very high'`` for anything larger.
        A NaN value fails every comparison and therefore also yields
        ``'very high'``.
    """
    spread_val = row['standard_error']

    # Values at or below 0.40 used to fall through to the final branch and
    # were labelled 'very high'; they now land in the lowest bucket instead.
    if spread_val <= 0.45:
        return 'low'
    if spread_val <= 0.50:
        return 'medium'
    if spread_val <= 0.55:
        return 'high'
    return 'very high'

# Modelling frame: every training row except excerpt '436ce79fe'.
# .copy() makes this an independent DataFrame, so the feature columns that are
# assigned to `data` further down do not raise pandas' SettingWithCopyWarning.
data = train_df[train_df['id'] != '436ce79fe'].copy()
# data['spread_category'] = data.apply(lambda x: apply_spread_to_categorical(x), axis = 1)
# sns.scatterplot(data=data, x="target", y="standard_error", hue="spread_category")

Nothing definite can be concluded from the above graph, but we can see that
* spread_cat 'very high' occurs mostly in the target ranges < -2 and > 0.
* spread_cat 'high' is also dense around < -1 and > 0.

Let's try to create some features and study their relationship with the target. We will create features on the basis of readability formulas. Some of the most common ones are:

1. The Dale–Chall formula
2. The Gunning fog formula
3. Fry readability graph
4. McLaughlin’s SMOG formula
5. The FORCAST formula
6. Readability and newspaper readership
7. Flesch Scores
8. Automated Readability Index


Instead of coding these formulas ourselves, we'll use textstat. Textstat is an easy-to-use library for calculating statistics from text. It helps determine readability, complexity, and grade level.

In [None]:
def apply_flesch_reading_ease(row):
    """Return ``textstat.flesch_reading_ease`` evaluated on ``row['excerpt']``."""
    excerpt_text = row['excerpt']
    return textstat.flesch_reading_ease(excerpt_text)
def apply_smog_index(row):
    """Return ``textstat.smog_index`` evaluated on ``row['excerpt']``."""
    excerpt_text = row['excerpt']
    return textstat.smog_index(excerpt_text)
def apply_flesch_kincaid_grade(row):
    """Return ``textstat.flesch_kincaid_grade`` evaluated on ``row['excerpt']``."""
    excerpt_text = row['excerpt']
    return textstat.flesch_kincaid_grade(excerpt_text)
def apply_gunning_fog(row):
    """Return ``textstat.gunning_fog`` evaluated on ``row['excerpt']``."""
    excerpt_text = row['excerpt']
    return textstat.gunning_fog(excerpt_text)
def apply_dale_chall_readability_score(row):
    """Return ``textstat.dale_chall_readability_score`` evaluated on ``row['excerpt']``."""
    excerpt_text = row['excerpt']
    return textstat.dale_chall_readability_score(excerpt_text)

In [None]:
# Column name -> row-wise scoring function, in the order the columns are added.
readability_scorers = {
    'readability_flesch_ease': apply_flesch_reading_ease,
    'readability_smog': apply_smog_index,
    'readability_flesch_kincaid': apply_flesch_kincaid_grade,
    'readability_gunning_fog': apply_gunning_fog,
    'readability_dale_chall': apply_dale_chall_readability_score,
}
for feature_name, scorer in readability_scorers.items():
    data[feature_name] = data.apply(scorer, axis=1)

# These columns are not used as features from here on.
data = data.drop(columns=['url_legal', 'license', 'standard_error'])
data.head()

In [None]:
# One regression panel per readability feature, each plotted against the target.
readability_features = [
    "readability_flesch_ease",
    "readability_smog",
    "readability_flesch_kincaid",
    "readability_gunning_fog",
    "readability_dale_chall",
]
g = sns.PairGrid(data, x_vars=readability_features, y_vars=["target"], height=4)
g.map(sns.regplot, color="blue")
g.set(ylim=(-5, 5), yticks=[-3, -2, -1, 0, 1, 2])

* We have created new columns using traditional readability formulas; let's standardize these columns before any further EDA.


In [None]:
# Shared StandardScaler instance; apply_standard_scaler calls fit_transform on it
# once per column, so it is re-fitted for every column that is scaled.
scaler = StandardScaler()

def apply_standard_scaler(df, columns):
    """Scale the given columns of ``df`` in place and return ``df``.

    Each listed column is handed on its own, as an (n, 1) array, to the
    notebook-level ``scaler``'s ``fit_transform``, and the result overwrites
    the column in ``df``.

    NOTE(review): the scaler is re-fitted on every call, so a frame scaled in
    a later call is transformed with its own statistics rather than those of
    an earlier frame - confirm this is intended for train vs. test data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their scaled values.
    columns : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in columns:
        # Double-bracket selection yields the 2-D (n, 1) layout the scaler is given.
        column_matrix = df[[name]].to_numpy()
        df[name] = scaler.fit_transform(column_matrix)
    return df


# Scale every readability_* column, then keep only the numeric columns
# (the excerpt text and the id are dropped).
readibility_columns = [name for name in data.columns if 'readability_' in name]
transformed_df = apply_standard_scaler(data, readibility_columns).drop(columns=['excerpt', 'id'])
transformed_df.head()

In [None]:
# Annotated heatmap of the pairwise correlations in transformed_df.
fig, ax = plt.subplots(figsize=(12, 10))
heatmap = sns.heatmap(data=transformed_df.corr(), annot=True, vmin=-1, vmax=1, ax=ax)
heatmap.set_title(label='Correlation Heatmap', fontdict={'fontsize': 12}, pad=12);

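* Using the information available from the correlation matrix, we'll try to remove features whose correlation coefficient with another feature crosses a certain threshold. We have kept this threshold at 0.9 for now. We can see that the correlation between gunning_fog and flesch_kincaid is 0.98, so we remove one of them. (Note: the code cell below currently compares against 1.0, so in practice a feature is only removed when it is perfectly correlated with an earlier one.)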

In [None]:
# Correlation-based feature filter: a column is dropped when its correlation
# with any EARLIER column reaches the threshold.
# NOTE(review): the accompanying text mentions a 0.9 cut-off, but the comparison
# uses 1.0, so only perfectly correlated columns are removed - confirm which is
# intended before changing it (the test-time pipeline must select the same columns).
# The correlation matrix is computed once instead of on every loop iteration.
correlation_matrix = transformed_df.corr()

# Entry [i, j] of the strict upper triangle (i < j) is True when column j is
# correlated with the earlier column i at or above the threshold.
reaches_threshold = np.triu(correlation_matrix.to_numpy() >= 1.0, k=1)

# Boolean keep-mask over the columns of transformed_df.
columns = ~reaches_threshold.any(axis=0)

#selected columns after correlation matrix filtering
selected_columns = transformed_df.columns[columns]

#concat the current df along with non-integer columns that were removed before creating the matrix
feature_df = pd.concat([data[['id', 'excerpt']], transformed_df[selected_columns]], axis=1)
feature_df

In [None]:
# """
# one hot encode the categorical variable spread_category
# """
# encoded_data =pd.get_dummies(data=feature_df[['spread_category']],drop_first=True)
# feature_df_after_encoding = pd.concat([feature_df, encoded_data], axis = 1)
# feature_df_after_encoding= feature_df_after_encoding.drop(['id', 'excerpt', 'spread_category', 'standard_error'], axis = 1)
# feature_df_after_encoding.head()

In [None]:
"""
generate train and test data
"""
# Keep only the numeric columns, then hold out 20% of the rows for evaluation.
feature_df = feature_df.drop(columns=['id', 'excerpt'])
X_train, X_test, y_train, y_test = train_test_split(
    feature_df.drop(columns='target'),
    feature_df['target'].values,
    test_size=0.20,
    random_state=42,
)
# Sanity check: features and targets have matching row counts in each split.
print(X_train.shape[0], y_train.shape[0])
print(X_test.shape[0], y_test.shape[0])


In [None]:
# Baseline: LightGBM regressor with default hyper-parameters, scored on the hold-out split.
gbm = lgb.LGBMRegressor(random_state=42)
gbm.fit(X_train, y_train, eval_metric='mse')
pred_y = gbm.predict(X_test)
holdout_rmse = np.sqrt(mean_squared_error(y_test, pred_y))
print(f' Test RMSE using basic features {round(holdout_rmse, 4)}')

In [None]:
"""
apply tansformations to test data(submission purposes)
"""
def apply_test_data_transformations(data, feature_columns=None):
    """Build the feature frame for unseen excerpts (used for the submission).

    The same readability scores as in training are computed and scaled with
    ``apply_standard_scaler``. The feature columns are taken from the training
    run instead of being re-derived from correlations within ``data``, so the
    result always has the columns the model was trained on.

    NOTE(review): ``apply_standard_scaler`` re-fits the scaler on ``data``
    itself, so these features are scaled with this frame's own statistics -
    confirm whether the training statistics should be reused instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the ``id``, ``excerpt``, ``url_legal`` and
        ``license`` columns. It is not modified.
    feature_columns : iterable of str, optional
        Names of the numeric columns to keep. Defaults to the notebook-level
        ``selected_columns`` computed on the training data. Names that do not
        exist in the transformed frame (such as ``target``) are ignored.

    Returns
    -------
    pandas.DataFrame
        ``id`` and ``excerpt`` followed by the kept, scaled readability columns.
    """
    if feature_columns is None:
        # Reuse the training-time selection so train and test features match.
        feature_columns = selected_columns

    # drop() returns a new frame, so the columns added below never touch the
    # caller's DataFrame.
    data = data.drop(['url_legal', 'license'], axis=1)

    # Column name -> row-wise scoring function, in training column order.
    readability_scorers = {
        'readability_flesch_ease': apply_flesch_reading_ease,
        'readability_smog': apply_smog_index,
        'readability_flesch_kincaid': apply_flesch_kincaid_grade,
        'readability_gunning_fog': apply_gunning_fog,
        'readability_dale_chall': apply_dale_chall_readability_score,
    }
    for column_name, scorer in readability_scorers.items():
        data[column_name] = data.apply(scorer, axis=1)

    readibility_columns = [x for x in data.columns if 'readability_' in x]
    transformed_df = apply_standard_scaler(data, readibility_columns)
    transformed_df = transformed_df.drop(['excerpt', 'id'], axis=1)

    # Keep the requested columns, in the requested order, that exist here.
    kept_columns = [name for name in feature_columns if name in transformed_df.columns]

    # Re-attach the identifying columns that were removed before the selection.
    feature_df = pd.concat([data[['id', 'excerpt']], transformed_df[kept_columns]], axis=1)
    return feature_df
    

In [None]:
# Features for the competition test set (this replaces the hold-out X_test).
test_data = apply_test_data_transformations(test_df)
X_test = test_data.drop(columns=['id', 'excerpt'])

# For the final model, train on every labelled row rather than on the 80% split.
y_train = feature_df['target'].values
X_train = feature_df.drop(columns='target')

In [None]:
# Fit the final regressor on the current training data and predict the test set.
gbm = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train, eval_metric='mse')
pred_y = gbm.predict(X_test)

In [None]:
# Attach the predictions and write only the id/target columns to the submission file.
test_df['target'] = pred_y
test_df.to_csv('submission.csv', columns=['id', 'target'], index=False)

In [None]:
# #Set the minimum error arbitrarily large
# min = 99999999999999
# count = 0 #Used for keeping track of the iteration number
# #How many runs to perform using randomly selected hyperparameters
# iterations = 10
# for i in range(iterations):
#     print('iteration number', count)
#     count += 1 #increment count
# #     try:
#     d_train = lgb.Dataset(X_train, label=y_train) #Load in data
#     params = {} #initialize parameters
#     params['learning_rate'] = np.random.uniform(0, 1)
#     params['boosting_type'] = np.random.choice(['gbdt', 'dart', 'goss'])
#     params['objective'] = 'regression'
#     params['metric'] = 'mse'
#     params['sub_feature'] = np.random.uniform(0, 1)
#     params['num_leaves'] = np.random.randint(20, 300)
#     params['min_data'] = np.random.randint(10, 100)
#     params['max_depth'] = np.random.randint(5, 200)
#     iterations = np.random.randint(10, 10000)
#     print(params, iterations)
#     #Train using selected parameters
#     clf = lgb.train(params, d_train, iterations)
#     y_pred=clf.predict(X_test) #Create predictions on test set
#     mse= round(np.sqrt(mean_squared_error(y_test,pred_y)),4)
#     print('MSE:', mse)
#     if mse < min:
#         min = mse
#         pp = params 
# #     except: #in case something goes wrong
# #         print('failed with')
# #         print(params)
#     print("*" * 50)
#     print('Minimum is: ', min)
# #     print('Used params', pp)