In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import sklearn
import random
import math
import sys

# Default number of rows held out for validation.
valid_count = 200

def train_valid_split(df, n_valid=None, seed=42):
    """Randomly split ``df`` into a training part and a validation part.

    The row positions are shuffled with a private, seeded generator, so the
    split is reproducible and the process-wide ``random`` state is left
    untouched. With the default seed the resulting order is the same one
    produced by ``random.seed(42)`` followed by ``random.shuffle``.

    Args:
        df: Object supporting ``len()`` and positional ``.iloc`` indexing
            (a pandas DataFrame in this notebook).
        n_valid: Number of rows to hold out for validation. Defaults to the
            module-level ``valid_count``.
        seed: Seed for the shuffle.

    Returns:
        ``(train, valid)`` where ``valid`` holds the last ``n_valid`` rows of
        the shuffled order and ``train`` holds the rest.

    Raises:
        ValueError: If ``n_valid`` is negative or larger than ``len(df)``.
    """
    if n_valid is None:
        n_valid = valid_count

    n_rows = len(df)
    if n_valid < 0 or n_valid > n_rows:
        raise ValueError(
            f'n_valid must be between 0 and len(df)={n_rows}, got {n_valid}'
        )

    order = list(range(n_rows))
    random.Random(seed).shuffle(order)

    # Slice by an explicit cut point: a negative-index slice would treat
    # n_valid == 0 as "-0" and put every row in the validation part.
    cut = n_rows - n_valid
    train = df.iloc[order[:cut]]
    valid = df.iloc[order[cut:]]
    return train, valid

def rmse_metric(predicted, actual):
    """Return the root-mean-squared error between two equally long sequences.

    Args:
        predicted: Predicted values (list, tuple or NumPy array of numbers).
        actual: Ground-truth values, same length as ``predicted``.

    Returns:
        The RMSE as a Python ``float``.

    Raises:
        ValueError: If the inputs are empty or their lengths differ. Pairing
            the values with ``zip`` would drop the unmatched tail while still
            dividing by ``len(actual)``, giving a wrong score.
    """
    predicted = np.asarray(predicted, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()

    if predicted.size != actual.size:
        raise ValueError(
            f'length mismatch: {predicted.size} predictions vs {actual.size} targets'
        )
    if actual.size == 0:
        raise ValueError('rmse_metric requires at least one value')

    residual = predicted - actual
    return math.sqrt(float(np.mean(np.square(residual))))

## Data

In [None]:
# Directory holding the competition files, plus the two CSVs read below.
dpath = '/kaggle/input/commonlitreadabilityprize'
train_path = dpath + '/train.csv'
test_path = dpath + '/test.csv'

In [None]:
# Load both CSVs, then carve a validation split out of the training file.
df_test = pd.read_csv(test_path)
df = pd.read_csv(train_path)
df_train, df_valid = train_valid_split(df)

# Row counts: (train, valid, test)
tuple(len(part) for part in (df_train, df_valid, df_test))

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Turn each excerpt into a TF-IDF vector.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# Features: the vectorizer is fitted on the training excerpts only and then
# applied unchanged to the validation and test excerpts.
X_train = tfidf.fit_transform(df_train['excerpt'])
X_valid, X_test = (
    tfidf.transform(frame['excerpt']) for frame in (df_valid, df_test)
)

# Regression targets for the two labelled splits.
y_train = df_train['target'].to_numpy()
y_valid = df_valid['target'].to_numpy()

X_train.shape, X_test.shape, X_valid.shape

In [None]:
# Show 100 (term, feature index) pairs from the fitted vocabulary.
# The full item list is materialised before it is sliced.
list(tfidf.vocabulary_.items())[:100]

## Model

In [None]:
import sklearn.linear_model as lm

In [None]:
def linear_regression(X_train, y_train, X_valid, y_valid):
    """Fit several linear regressors and keep the one with the lowest validation RMSE.

    Each candidate is fitted on ``(X_train, y_train)``, scored on
    ``(X_valid, y_valid)`` with ``rmse_metric``, and its score is printed.

    Args:
        X_train: Training feature matrix. Must expose ``.toarray()`` if a
            dense-only candidate is enabled (see ``dense_only`` below).
        y_train: Training targets.
        X_valid: Validation feature matrix.
        y_valid: Validation targets.

    Returns:
        ``(name, fitted_model, rmse)`` for the best candidate. On ties the
        candidate listed first wins.
    """
    # NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
    # in 1.2, so these constructors are expected to raise TypeError on newer
    # versions. It is kept here to preserve the models' behaviour; confirm the
    # installed version before upgrading.
    models = {
        'lr': lm.LinearRegression(),
        'ridge': lm.Ridge(alpha=0.1, normalize=True),
        # Fixed seed: SGD shuffles the training data each epoch, so without
        # one the score (and possibly the selected model) changes between runs.
        'sgd': lm.SGDRegressor(random_state=42),
        'elasticNet': lm.ElasticNet(alpha=0.1, l1_ratio=0.7, normalize=True),
        'lasso': lm.Lasso(alpha=0.3, normalize=True),
        # Optional candidate; it is fitted on a dense copy of X_train (see below).
        # 'baysianRidge': lm.BayesianRidge()
    }

    # Candidates that must be fitted on a dense array instead of X_train as given.
    dense_only = {'baysianRidge'}

    best_name = None
    best_model = None
    best_score = sys.float_info.max
    for model_name, estimator in models.items():
        fit_features = X_train.toarray() if model_name in dense_only else X_train
        estimator.fit(fit_features, y_train)

        # Validate
        score = rmse_metric(estimator.predict(X_valid), y_valid)
        print(f'model: {model_name}, validation error: {score}')

        # Strict comparison keeps the earlier candidate on ties.
        if best_model is None or score < best_score:
            best_name, best_model, best_score = model_name, estimator, score

    return best_name, best_model, best_score

In [None]:
# Compare the candidates, then report which one won on the validation split.
name, model, error = linear_regression(X_train, y_train, X_valid, y_valid)
print('-' * 10, f'min error model: {name}, error: {error}', sep='\n')

In [None]:
# Predict and submit score on test data
pred = model.predict(X_test)
pred

## Submission

In [None]:
# Attach the predictions to the test frame as a new `target` column.
# This modifies `df_test` in place; the submission cell reads the column back.
df_test['target'] = pred
df_test

In [None]:
# Keep only the id and predicted target columns, then write the CSV
# without the index column.
submission = df_test.loc[:, ['id', 'target']]
submission.to_csv("submission.csv", index=False)