In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv and test.csv from the commonlitreadabilityprize input folder.
train_path = '../input/commonlitreadabilityprize/train.csv'
test_path = '../input/commonlitreadabilityprize/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    Every maximal run of vowels (``a e i o u y``, case-insensitive) counts as
    one syllable. One is subtracted when the text ends in ``e``, and the
    result is floored at 1 for any non-empty input.

    Parameters
    ----------
    word : str
        Text to analyse.

    Returns
    -------
    int
        Estimated syllable count; ``0`` for an empty string.
    """
    word = word.lower()
    # An empty string has no first character; return 0 instead of raising
    # IndexError on word[0].
    if not word:
        return 0

    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # Only the first vowel of a consecutive run starts a new syllable.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    # Trailing "e" is treated as silent.
    if word.endswith("e"):
        count -= 1
    # Non-empty input always reports at least one syllable.
    return max(count, 1)

In [None]:
def _space_split_length(text):
    """Return how many pieces ``text`` yields when split on single spaces."""
    return len(text.split(' '))


# Store the per-excerpt token count in the 'words' column of both frames.
for frame in (train_df, test_df):
    frame['words'] = frame['excerpt'].apply(_space_split_length)

In [None]:
# Sentence count per excerpt.
# Each run of '.', '!' or '?' counts as one sentence end, so questions and
# exclamations are included and an ellipsis ("...") is not counted three times.
# The count is floored at 1 because later cells divide words by sentences;
# a zero here would turn those scores into inf.
for frame in (train_df, test_df):
    frame['sentences'] = frame['excerpt'].str.count(r'[.!?]+').clip(lower=1)

In [None]:
def _excerpt_syllables(text):
    """Return the summed ``syllable_count`` of every word in ``text``.

    ``syllable_count`` works on a single word (its silent-"e" rule only looks
    at the final character), so the excerpt is split on whitespace and
    surrounding punctuation is stripped from each token before counting.
    Tokens that consist only of punctuation are skipped.
    """
    total = 0
    for token in text.split():
        word = token.strip('.,;:!?"\'()[]{}-')
        if word:
            total += syllable_count(word)
    return total


# Total syllables per excerpt, stored in the 'syllables' column of both frames.
for frame in (train_df, test_df):
    frame['syllables'] = frame['excerpt'].apply(_excerpt_syllables)

In [None]:
# Flesch reading-ease style score built from words-per-sentence and
# syllables-per-word, computed identically for the train and test frames.
for frame in (train_df, test_df):
    words_per_sentence = frame['words'] / frame['sentences']
    syllables_per_word = frame['syllables'] / frame['words']
    frame['flesch_score'] = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word

In [None]:
# Second readability score (grade-level style formula) from the same two
# ratios: words-per-sentence and syllables-per-word.
for frame in (train_df, test_df):
    words_per_sentence = frame['words'] / frame['sentences']
    syllables_per_word = frame['syllables'] / frame['words']
    frame['flesch_score2'] = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

In [None]:
def _distinct_space_tokens(text):
    """Return the number of distinct pieces after splitting on single spaces."""
    pieces = text.split(' ')
    return len(set(pieces))


# Distinct-token count per excerpt, stored as 'unique_words' on both frames.
for frame in (train_df, test_df):
    frame['unique_words'] = frame['excerpt'].apply(_distinct_space_tokens)

In [None]:
# Lexical diversity: share of tokens in each excerpt that are distinct.
for frame in (train_df, test_df):
    frame['diversity'] = frame['unique_words'].div(frame['words'])

In [None]:
# Excerpt length in characters, plus the words-to-characters ratio ('w/c').
for frame in (train_df, test_df):
    frame['characters'] = frame['excerpt'].apply(len)
    frame['w/c'] = frame['words'] / frame['characters']

In [None]:
# Names of the engineered columns passed to the model, in column order.
features = [
    'words',
    'sentences',
    'syllables',
    'flesch_score',
    'flesch_score2',
    'unique_words',
    'diversity',
    'characters',
    'w/c',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Zero-filled column vectors, one row per training / test example, that the
# cross-validation loop fills with predictions.
n_train_rows, n_test_rows = len(train_df), len(test_df)
val = np.zeros((n_train_rows, 1))
test = np.zeros((n_test_rows, 1))

In [None]:
# 10-way K-fold splitter; no shuffle or random_state is passed, so the
# library defaults apply.
n_folds = 10
fold = KFold(n_splits=n_folds)

In [None]:
# K-fold cross-validation of a Ridge regressor on the engineered features.
# `val` receives each training row's out-of-fold prediction; `test` receives
# the average of every fold model's prediction on the test set.

# Take the fold count from the splitter so the averaging below stays correct
# if n_splits is changed.
n_folds = fold.get_n_splits()

# Start the accumulator from zero so re-running this cell does not add onto
# predictions left over from a previous run.
test[:] = 0

for train_index, test_index in fold.split(train_df):
    # KFold yields positional indices, so rows are selected with .iloc; this
    # stays correct even if train_df does not have a default 0..n-1 index.
    x0, x1 = train_df[features].iloc[train_index], train_df[features].iloc[test_index]
    y0, y1 = train_df['target'].iloc[train_index], train_df['target'].iloc[test_index]

    model = Ridge(alpha = 1)
    model.fit(x0, y0)

    # Predict the held-out fold once and reuse the result.
    ypred = model.predict(x1)
    val[test_index, 0] = ypred
    test[:, 0] += model.predict(test_df[features]) / n_folds

    # Per-fold RMSE.
    print(np.round(np.sqrt(mse(y1, ypred)), 2))
# Overall out-of-fold RMSE across all training rows.
print(np.round(np.sqrt(mse(train_df['target'], val[:, 0])), 3))

In [None]:
# Pair each test id with its averaged prediction and write the submission file.
res = test_df[["id"]].assign(target=test)
res.to_csv('submission.csv', index=False)