In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import nltk

# Load Train Data

In [None]:
# Read the training CSV into `df` and preview its first rows.
train_csv_path = "/kaggle/input/commonlitreadabilityprize/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Dimensions of the training frame as (n_rows, n_columns).
df.shape

# Generate simple Stats
## Sentence length

In [None]:
def sentence_len(text):
    """Compute sentence-length statistics (measured in tokens) for one text.

    The text is split into sentences with ``nltk.tokenize.sent_tokenize`` and
    every sentence is split into tokens with ``nltk.tokenize.word_tokenize``.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    pandas.Series
        Entries ``sent_len_mean``, ``sent_len_std`` and ``sent_len_max`` (in
        that order). All three are NaN when the tokenizer finds no sentence.
    """
    stat_names = ("sent_len_mean", "sent_len_std", "sent_len_max")
    sentences = nltk.tokenize.sent_tokenize(text)
    sent_len = [len(nltk.tokenize.word_tokenize(s)) for s in sentences]
    if not sent_len:
        # max() raises on an empty list and np.mean/np.std only warn, so
        # report "no data" explicitly instead of crashing on empty text.
        return pd.Series({name: np.nan for name in stat_names})
    # Keys match the column names the statistics are stored under.
    return pd.Series({
        "sent_len_mean": np.mean(sent_len),
        "sent_len_std": np.std(sent_len),
        "sent_len_max": max(sent_len),
    })

In [None]:
# sentence_len returns one Series per excerpt, so apply() produces a frame with
# three statistics per row; they are stored in the three new columns listed here.
df[["sent_len_mean", "sent_len_std", "sent_len_max"]] = df["excerpt"].apply(sentence_len)

## Vocabulary size
After removing stop words

In [None]:
# Tokens that do not count towards the vocabulary: English stop words plus
# common punctuation. Kept in a set so each membership test in vocab_size()
# is a hash lookup instead of a scan over the whole list.
remove_words = set(nltk.corpus.stopwords.words('english'))
remove_words.update([".", ",", "!", "?", "'", ":", ";", '"', "-"])

def vocab_size(text):
    """Count the distinct tokens of ``text`` that are not stop words or punctuation.

    Parameters
    ----------
    text : str
        The text to analyse; it is tokenized with ``nltk.tokenize.word_tokenize``.

    Returns
    -------
    int
        Number of unique remaining tokens. Tokens are compared to
        ``remove_words`` in lowercase so that capitalised stop words (for
        example a sentence-initial "The") are filtered as well; the surviving
        tokens themselves are counted case-sensitively.
    """
    words = nltk.tokenize.word_tokenize(text)
    words_clean = {word for word in words if word.lower() not in remove_words}
    return len(words_clean)

In [None]:
# One vocab_size value per excerpt, stored as a new column.
df["vocab_size"] = df.excerpt.apply(vocab_size)

## Commata count

In [None]:
def commata_count(text):
    """Return how many comma characters occur in ``text``."""
    # Splitting on "," yields one more piece than there are commas.
    pieces = text.split(",")
    return len(pieces) - 1

In [None]:
# One comma count per excerpt, stored as a new column.
df["commata_count"] = df.excerpt.apply(commata_count)

In [None]:
# Preview the first rows of the frame, including the columns added so far.
df.head()

## Text length

In [None]:
def text_len(text):
    """Return the number of tokens ``nltk.tokenize.word_tokenize`` produces for ``text``."""
    tokens = nltk.tokenize.word_tokenize(text)
    n_tokens = len(tokens)
    return n_tokens

In [None]:
# Token count of each excerpt, stored as a new column.
df["text_len"] = df.excerpt.apply(text_len)

# Correlation analysis

In [None]:
# Correlate only the numeric columns: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns (such as the excerpt text) silently and
# raises on them instead.
df.select_dtypes(include="number").corr()["target"]

In [None]:
# Full correlation matrix of the numeric columns. Non-numeric columns (such as
# the excerpt text) are excluded up front because DataFrame.corr() raises on
# them since pandas 2.0.
df.select_dtypes(include="number").corr()

The sent_len features as well as vocab_size seem to be good features. However, vocab_size and commata_count are to some extent dependent on text_len. Therefore we could normalize those two features:

In [None]:
def normalize_vocab_commata(row):
    """Express vocabulary size and comma count relative to the text length.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``vocab_size``, ``commata_count`` and ``text_len``.

    Returns
    -------
    pandas.Series
        Entries ``vocab_size_norm`` and ``commata_count_norm``, each the
        respective count divided by ``text_len``.
    """
    vocab = row["vocab_size"]
    n_tokens = row["text_len"]
    commas = row["commata_count"]
    normalized = {
        "vocab_size_norm": vocab / n_tokens,
        "commata_count_norm": commas / n_tokens,
    }
    return pd.Series(normalized)

In [None]:
# Row-wise apply: each row yields the two length-normalized values, which are
# stored in the two new columns listed here.
df[["vocab_size_norm", "commata_count_norm"]] = df.apply(normalize_vocab_commata, axis=1)

In [None]:
# Re-check the correlation with the target now that the normalized columns
# exist. Only numeric columns are used because DataFrame.corr() raises on
# non-numeric columns since pandas 2.0.
df.select_dtypes(include="number").corr()["target"]

The normalization led to a higher correlation for vocab_size, but did not change much for commata_count.

# Preparing train/test-data

In [None]:
# Names of the engineered columns used as model input, in column order.
features = [
    'sent_len_mean',
    'sent_len_std',
    'sent_len_max',
    'vocab_size',
    'commata_count',
    'text_len',
    'vocab_size_norm',
    'commata_count_norm',
]

In [None]:
# Feature matrix as a plain NumPy array (column order follows `features`).
x = df[features].values
# The regression target is kept as a pandas Series.
y = df["target"]

# Testing Models

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    The squared error is computed by ``mean_squared_error`` and its square
    root is taken with ``np.sqrt``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# Fixed seed: without random_state the shuffle differs on every kf.split()
# call, so results would not be reproducible and each model would be scored
# on different folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
def test_model(model_cal):
    rmse_train = []
    rmse_test = []
    for train_index, test_index in kf.split(x):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = model_cal()
        model.fit(X_train, y_train)
        rmse_train.append(rmse(y_train, model.predict(X_train)))
        rmse_test.append(rmse(y_test, model.predict(X_test)))
    return np.mean(rmse_test), rmse_test

def print_results(rmse):
    """Print the mean and the standard deviation of a collection of RMSE scores.

    Parameters
    ----------
    rmse : sequence of float
        The scores to summarise. Inside this function the parameter name
        shadows any outer ``rmse`` name.
    """
    mean_score = np.mean(rmse)
    spread = np.std(rmse)
    report = "Mean test rmse = {:5.4f}\nSTD = {}".format(mean_score, spread)
    print(report)

## Linear regression

In [None]:
# Cross-validate a default LinearRegression and report its held-out RMSE.
rmse_test_mean_LR, rmse_test_LR = test_model(LinearRegression)
print_results(rmse_test_LR)

## SVM

In [None]:
# Cross-validate a default SVR and report its held-out RMSE.
rmse_test_mean_SVM, rmse_test_SVM = test_model(SVR)
print_results(rmse_test_SVM)

## Random Forest

In [None]:
# Cross-validate a default RandomForestRegressor and report its held-out RMSE.
rmse_test_mean_RF, rmse_test_RF = test_model(RandomForestRegressor)
print_results(rmse_test_RF)

LR and RF seem to perform similarly. Of those, RF seems to achieve more consistent results (lower standard deviation of RMSE). SVM seems to be slightly better.

# Train one SVM-Model

In [None]:
# Fit a single SVR with default hyperparameters on the complete training data.
model = SVR()
model.fit(x, y)

# Prepare test data

In [None]:
# Read the test CSV into `test` and preview its first rows.
test = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
test.head()

In [None]:
# Compute the same feature columns for the test frame, using the same helper
# functions that were applied to the training frame.
test[["sent_len_mean", "sent_len_std", "sent_len_max"]] = test.excerpt.apply(sentence_len)
test["vocab_size"] = test.excerpt.apply(vocab_size)
test["commata_count"] = test.excerpt.apply(commata_count)
test["text_len"] = test.excerpt.apply(text_len)
# The normalized columns need vocab_size, commata_count and text_len, so they come last.
test[["vocab_size_norm", "commata_count_norm"]] = test.apply(normalize_vocab_commata, axis=1)

In [None]:
# Sanity check: report every model feature that has no column in the test frame.
missing_features = [col for col in features if col not in test.columns]
for col in missing_features:
    print("Forgot to compute feature {}".format(col))

In [None]:
# The model was fitted on a plain NumPy array (df[features].values), so predict
# on the same representation; passing a DataFrame here makes scikit-learn warn
# that the input has feature names the model was not fitted with.
test["target"] = model.predict(test[features].values)

In [None]:
# Display the id/target pairs that make up the submission.
test[["id", "target"]]

In [None]:
# Write the id/target pairs to submission.csv without the index column.
test[["id", "target"]].to_csv("submission.csv", index=False)