In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA

In [None]:
# All three competition CSVs are read from the same input folder.
DATA_DIR = '/kaggle/input/tabular-playground-series-jan-2021'

train, test, submission = (
    pd.read_csv(f'{DATA_DIR}/{name}.csv')
    for name in ('train', 'test', 'sample_submission')
)
train

In [None]:
# Drop the 'id' column (presumably a row identifier rather than a feature).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after 'id' is already gone no
# longer raises KeyError.
train = train.drop(columns = ['id'], errors = 'ignore')
test = test.drop(columns = ['id'], errors = 'ignore')

In [None]:
# Check for missing values: info() prints each column's dtype and non-null
# count, so a count lower than the number of entries reveals missing data.
train.info()

In [None]:
# Split the training frame into the target column and the remaining
# predictor columns; keep an independent copy of the test frame.
y_train = train['target']
X_train = train.drop(columns = ['target'])
X_test = test.copy()

## Target Distribution

In [None]:
# Summary statistics of the target column, displayed as the cell output.
target_summary = train['target'].describe()
target_summary

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution plot of the target values.
sns.displot(data = y_train)
plt.show()

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same fitted transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (scaler.transform(features) for features in (X_train, X_test))

## Utility Functions

In [None]:
from sklearn.model_selection import cross_val_score

def get_score(model, X = X_train, y = y_train, cv = 5):
    """Return the cross-validated RMSE of `model`, averaged over the folds.

    Parameters
    ----------
    model : estimator handed to `cross_val_score`.
    X, y : features and target. They default to the module-level `X_train`
        and `y_train` objects that existed when this function was defined.
    cv : cross-validation setting forwarded to `cross_val_score` (default 5).

    Returns
    -------
    The mean over folds of sqrt(MSE).
    """
    # The scorer reports negated MSE, so flip the sign before the square root.
    neg_mse_per_fold = cross_val_score(
        model, X, y,
        scoring = 'neg_mean_squared_error',
        cv = cv,
        n_jobs = -1,
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

In [None]:
from sklearn.model_selection import learning_curve

def plot_model(model, X = X_train, y = y_train, cv = 5):
    """Draw the learning curve (RMSE against training-set size) of `model`.

    Parameters
    ----------
    model : estimator handed to `learning_curve`.
    X, y : features and target. They default to the module-level `X_train`
        and `y_train` objects that existed when this function was defined.
    cv : cross-validation setting forwarded to `learning_curve` (default 5).

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        scoring = 'neg_mean_squared_error',
        cv = cv,
    )

    def mean_rmse(neg_mse):
        # Scores are negated MSE; convert to RMSE, then average along axis 1.
        return np.sqrt(-neg_mse).mean(axis = 1)

    curves = (
        (train_scores, 'bo--', 'Training score'),
        (val_scores, 'go-', 'Cross-validation score'),
    )
    for neg_mse, line_format, curve_label in curves:
        plt.plot(sizes, mean_rmse(neg_mse), line_format, label = curve_label)

    # Use the estimator's class name (text before the first parenthesis) in the title.
    model_name = str(model).split('(')[0]
    plt.title('Learning curve for ' + model_name)
    plt.xlabel('Training Set Size')
    plt.ylabel('RMSE Score')
    plt.legend(loc = 'best')
    plt.grid()
    plt.show()

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings: print its cross-validated RMSE,
# then draw its learning curve.
lin_reg = LinearRegression()
lin_reg_rmse = get_score(lin_reg)
print(lin_reg_rmse)
plot_model(lin_reg)

## ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

# ElasticNet with default settings: print its cross-validated RMSE,
# then draw its learning curve.
elasticnet_reg = ElasticNet()
elasticnet_rmse = get_score(elasticnet_reg)
print(elasticnet_rmse)
plot_model(elasticnet_reg)

## LightGBM Regressor

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

# LightGBM settings shared by every fold, defined once outside the loop.
# The learning rate is given as a float (a numeric hyper-parameter should not
# be passed as the string '0.05').
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'RMSE',
    'learning_rate': 0.05,
    'n_jobs': -1,
}

split = KFold(n_splits = 5)
fold_scores = []  # validation RMSE of each fold, so the mean covers all folds
for train_index, test_index in split.split(X_train):
    # X_train is indexed with integer position arrays, as produced by KFold.
    X_train_new, X_test_new = X_train[train_index], X_train[test_index]
    # KFold yields positions, so select from the target by position (.iloc)
    # rather than by index label.
    y_train_new, y_test_new = y_train.iloc[train_index], y_train.iloc[test_index]

    lgbm_reg = LGBMRegressor(**params)
    lgbm_reg.fit(X_train_new, y_train_new)
    y_pred = lgbm_reg.predict(X_test_new)

    # Take the square root explicitly: the 'squared=False' argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    score = np.sqrt(mean_squared_error(y_test_new, y_pred))
    fold_scores.append(score)

# Average RMSE over all folds (not only the last one).
np.mean(fold_scores)

In [None]:
# LightGBM settings for the final model. The learning rate is given as a
# float (a numeric hyper-parameter should not be passed as the string '0.05').
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'RMSE',
    'learning_rate': 0.05,
    'n_jobs': -1,
}

# Fit on the full training set and predict the test set for the submission.
model = LGBMRegressor(**params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Submission

In [None]:
# Put the test-set predictions into the 'target' column of the sample
# submission frame and save it without the index.
submission = submission.assign(target = y_pred)
submission.to_csv('my_submission.csv', index = False)