In [None]:
# A simple notebook: two readability features (Flesch, ARI) are fed to a Linear Regression baseline and to the XGBoost model used for the submission.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import spacy
import xgboost as xgb

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training CSV into the DataFrame `df`
TRAIN_CSV_PATH = '/kaggle/input/commonlitreadabilityprize/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Plot the distribution of the `standard_error` column of the training data
sns.displot(df['standard_error'])

In [None]:
# df = df[df['standard_error'] < 0.55]

In [None]:
#nlp = spacy.load('en_core_web_lg')
#with nlp.disable_pipes():
#    train_vectors = np.array([nlp(text).vector for text in df.excerpt])
    
#train_vectors.shape

In [None]:
# Basic functions to analyse a sentence
def syllable_count(word):
    """Estimate the number of syllables in a single word.

    Heuristic: after lower-casing and stripping leading/trailing ".:;?!",
    every vowel ("aeiouy") that starts a run of vowels counts as one syllable,
    a trailing "e" subtracts one, and any non-empty word has at least one.

    Args:
        word: The word to analyse (str).

    Returns:
        int: Estimated syllable count; 0 when nothing is left after stripping
        punctuation (e.g. "" or "...").
    """
    vowels = "aeiouy"
    word = word.lower().strip(".:;?!")
    # Empty or punctuation-only input: there is no first character to inspect,
    # so bail out instead of raising IndexError on word[0].
    if not word:
        return 0
    count = 1 if word[0] in vowels else 0
    for index in range(1, len(word)):
        # Count only the first vowel of each consecutive vowel group
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    # A trailing "e" is treated as silent
    if word.endswith("e"):
        count -= 1
    # Every real word has at least one syllable (e.g. "the", "hmm")
    if count == 0:
        count += 1
    return count

def sentence_count(sentence):
    """Count sentences in a text by counting the terminators '.', '!' and '?'.

    Every terminator character is counted individually (so "..." counts as 3).
    Text that contains words but no terminator is treated as a single
    sentence, so ratios such as words-per-sentence never divide by zero for
    non-empty text.

    Args:
        sentence: The text to analyse (str).

    Returns:
        int: Number of sentences; 0 only for empty or whitespace-only text.
    """
    count = sum(sentence.count(mark) for mark in ".!?")
    # Unterminated but non-empty text is still one sentence
    if count == 0 and sentence.strip():
        return 1
    return count

def word_count(sentence):
    """Count the whitespace-separated words in a text.

    Splitting on any run of whitespace (spaces, tabs, newlines) avoids the
    off-by-one of counting space characters and handles multi-line text and
    repeated spaces.

    Args:
        sentence: The text to analyse (str).

    Returns:
        int: Number of words; 0 for empty or whitespace-only text.
    """
    return len(sentence.split())

def character_count(sentence):
    """Return the length of `sentence`, counting every character
    (letters, digits, whitespace and punctuation alike)."""
    return len(sentence)

In [None]:
# Per-excerpt text statistics used by the readability formulas
df['character_count'] = df['excerpt'].apply(character_count)
# syllable_count() estimates ONE word (it inspects the first character and a
# trailing "e"), so sum it over the whitespace-separated tokens of the excerpt
# instead of passing the whole excerpt as if it were a single word.
df['syllable_count'] = df['excerpt'].apply(
    lambda text: sum(
        syllable_count(token)
        for token in text.split()
        if token.strip(".:;?!")  # skip punctuation-only tokens such as "..."
    )
)
df['word_count'] = df['excerpt'].apply(word_count)
df['sentence_count'] = df['excerpt'].apply(sentence_count)

In [None]:
def extract_scores(text):
    """Placeholder scorer: ignores `text` and returns the fixed tuple 1..7.

    The Readability-based implementation is disabled (commented-out line below).
    """
    # r = Readability(str(text))
    return tuple(range(1, 8))

# df['flesch'], df['gunning_fog'], df['coleman_liau'], df['dale_chall'], \
#                                 df['ari'], df['linsear_write'], df['spache'] \
#                                 = zip(*df['excerpt'].map(lambda x: extract_scores(x)))

# Calculate the features with vectorised column arithmetic (no row-wise apply)
words_per_sentence = df['word_count'] / df['sentence_count']
# Flesch reading ease: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
df['flesch'] = 206.835 - 1.015 * words_per_sentence - 84.6 * (df['syllable_count'] / df['word_count'])
# Automated Readability Index: 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
df['ari'] = 4.71 * (df['character_count'] / df['word_count']) + 0.5 * words_per_sentence - 21.43

In [None]:
# Feature columns fed to the models
# x_cols = ['flesch', 'gunning_fog', 'coleman_liau', 'dale_chall', 'ari', 'linsear_write', 'spache']
x_cols = ['flesch', 'ari']

# Standardise the features, overwriting the raw values stored in df
scaler = StandardScaler().fit(df[x_cols])
df[x_cols] = scaler.transform(df[x_cols])

# Feature matrix and response variable
X_train, y_train = df[x_cols], df['target']

# Fit the linear model and store its predictions on the training rows
lm = LinearRegression().fit(X_train, y_train)
df['predict'] = lm.predict(X_train)

# print(df.target, df.predict)

In [None]:
# Compare the stored training predictions with the true target
y_true, y_pred = df['target'], df['predict']
mse = mean_squared_error(y_true, y_pred)
r_squared = r2_score(y_true, y_pred)
print(mse, r_squared)

In [None]:
# Load the test CSV into the DataFrame `dftest`
TEST_CSV_PATH = '/kaggle/input/commonlitreadabilityprize/test.csv'
dftest = pd.read_csv(TEST_CSV_PATH)


In [None]:
# Calculate variables
# character_count is required by the ARI formula and must come from the test excerpts
dftest['character_count'] = dftest['excerpt'].apply(character_count)
# syllable_count() estimates ONE word, so sum it over the whitespace-separated
# tokens of the excerpt rather than passing the whole excerpt as a single word.
dftest['syllable_count'] = dftest['excerpt'].apply(
    lambda text: sum(
        syllable_count(token)
        for token in text.split()
        if token.strip(".:;?!")  # skip punctuation-only tokens such as "..."
    )
)
dftest['word_count'] = dftest['excerpt'].apply(word_count)
dftest['sentence_count'] = dftest['excerpt'].apply(sentence_count)

In [None]:
#dftest['flesch'], dftest['gunning_fog'], dftest['coleman_liau'], dftest['dale_chall'], \
#                                 dftest['ari'], dftest['linsear_write'], dftest['spache'] \
#                                 = zip(*dftest['excerpt'].map(lambda x: extract_scores(x)))
# Calculate features (from the TEST frame only; never from the training frame `df`)
# ARI needs a character count per test excerpt; compute it if it is not there yet.
if 'character_count' not in dftest.columns:
    dftest['character_count'] = dftest['excerpt'].apply(character_count)
test_words_per_sentence = dftest['word_count'] / dftest['sentence_count']
# Flesch reading ease: 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
dftest['flesch'] = 206.835 - 1.015 * test_words_per_sentence - 84.6 * (dftest['syllable_count'] / dftest['word_count'])
# Automated Readability Index: 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43
dftest['ari'] = 4.71 * (dftest['character_count'] / dftest['word_count']) + 0.5 * test_words_per_sentence - 21.43

In [None]:
# Reuse the scaler fitted on the training features so the test features go
# through exactly the same transformation the models were trained on.
# Fitting a fresh scaler on the test set would standardise with test-set
# statistics and feed the models differently scaled inputs.
dftest[x_cols] = scaler.transform(dftest[x_cols])

### Linear Regression

In [None]:
# X_test = dftest[x_cols]
# dftest['predict'] = lm.predict(X_test)

### XGBoost

In [None]:
from xgboost import XGBRegressor

# NOTE(review): binding the model to `xgb` shadows the `import xgboost as xgb`
# module alias from the first cell; after this cell `xgb` is the regressor.
xgb = XGBRegressor()
# Train on the same scaled features and target that the linear model was fitted on
xgb.fit(X_train, y_train)

In [None]:
# Select the model features from the test frame and store the XGBoost predictions
X_test = dftest.loc[:, x_cols]
test_predictions = xgb.predict(X_test)
dftest['predict'] = test_predictions

In [None]:
# Build the submission frame: the test ids plus the predictions under the name `target`
submission = pd.DataFrame({
    'id': dftest['id'],
    'target': dftest['predict'],
})
In [None]:
# Write the results to submission.csv in the current working directory;
# index=False omits the row index so only the `id` and `target` columns are written
submission.to_csv('submission.csv', index=False)