# Importing Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression

# Loading Data

In [None]:
# Kaggle input directory that holds the three competition CSV files.
data_dir = '/kaggle/input/tabular-playground-series-jan-2021'
train_path, test_path, sam_sub_path = (
    os.path.join(data_dir, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Read train, test and sample-submission tables, in that order.
df, test_df, sam_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sam_sub_path)
)

In [None]:
# Preview only the first rows instead of echoing the whole training frame.
df.head()

# Brief EDA and Motivation

In [None]:
# Every column except the row identifier and the regression target is a feature.
features_names = df.columns.drop(['id', 'target']).values
features_names

## Distribution chart and outliers

In [None]:
# Histogram of the target with a KDE overlay.
# `sns.distplot` is deprecated; `histplot` with stat='density' keeps the
# y-axis on the density scale that distplot used when a KDE was drawn.
plt.figure(figsize=(10, 6))
sns.histplot(df['target'], kde=True, stat='density')
plt.title('Target distribution')
plt.show()

In [None]:
# Number of rows whose target is at most 5 (the outlier candidates).
int((df['target'] <= 5).sum())

In [None]:
# Remove outliers: rows whose target is less than or equal to 5.
# The result is rebound to `df` rather than dropped with `inplace=True`, so the
# originally loaded frame object is not mutated; index labels are left as they are.
df = df.drop(index=df.index[df['target'] <= 5])

# Re-plot the target distribution after the outliers are gone.
# `sns.distplot` is deprecated; `histplot` with stat='density' is its replacement.
plt.figure(figsize=(10, 6))
sns.histplot(df['target'], kde=True, stat='density')
plt.title('Target distribution without outliers')
plt.show()

## Correlation matrix and multicollinearity problem

In [None]:
def show_correlation_matrix(correlation_matrix):
    """Draw an annotated heatmap of ``correlation_matrix`` and show the figure.

    Parameters
    ----------
    correlation_matrix
        Square table of pairwise correlations; it is handed to ``sns.heatmap``
        unchanged.
    """
    heatmap_options = dict(
        cmap='YlGnBu',
        linewidths=.5,
        cbar_kws={"shrink": .5},
        square=True,
        annot=True,
    )
    # Open a fresh 11x9 figure; the heatmap is drawn on its (current) axes.
    plt.subplots(figsize=(11, 9))
    sns.heatmap(correlation_matrix, **heatmap_options)
    # Keep the row labels horizontal so they stay readable.
    plt.yticks(rotation=0)
    plt.show()

In [None]:
# Pairwise correlations between the feature columns only (id/target excluded).
correlation_matrix = df.loc[:, features_names].corr()
show_correlation_matrix(correlation_matrix)

As we can see below, the correlation matrix has an almost zero determinant, which means that there is a [multicollinearity problem](https://en.wikipedia.org/wiki/Multicollinearity). In this case, we can try [regularization methods](https://en.wikipedia.org/wiki/Regularization_(mathematics)) such as [Ridge](https://en.wikipedia.org/wiki/Tikhonov_regularization) and [Lasso](https://en.wikipedia.org/wiki/Lasso_(statistics)).

In [None]:
# Determinant of the feature correlation matrix.
np.linalg.det(correlation_matrix.to_numpy())

In [None]:
# Singular value decomposition of the correlation matrix; only the singular
# values `s` are displayed, laid out as a single row.
u, s, v = np.linalg.svd(correlation_matrix)
sigmas = pd.Series(s, name='singular values').to_frame()
sigmas.T

# Data Preprocessing

In [None]:
# Fit the scaler on the training features.
# NOTE(review): the scaler is fitted on all training rows, including the ones
# that end up in the validation split below.
scaler = StandardScaler()
scaler.fit(df[features_names])

target = df['target'].values
data = scaler.transform(df[features_names])
# Select the test columns by name so they are in exactly the order the scaler
# was fitted on, instead of relying on the positional layout of `test_df`
# after dropping 'id'.
test = scaler.transform(test_df[features_names])

# shuffle=False keeps the row order: the first 70% of rows become the training
# split and the last 30% the validation split. `random_state` only matters if
# shuffling is enabled.
X_train, X_val, y_train, y_val = train_test_split(
    data, target, test_size=0.3, random_state=17, shuffle=False
)

# Optimizing the alpha parameter

In [None]:
# 10-fold shuffled cross-validation shared by both models.
kf = KFold(n_splits=10, shuffle=True, random_state=17)

# Candidate regularization strengths, log-spaced between 1e-3 and 1e-1.
alphas = np.logspace(-3, -1, 30)


def _mean_cv_scores(estimator_cls):
    """Return the mean cross-validated negative RMSE for every value in ``alphas``.

    ``estimator_cls`` is a regressor class accepting an ``alpha`` argument
    (here ``Lasso`` or ``Ridge``). Scores are computed on ``X_train`` /
    ``y_train`` with the folds in ``kf``; higher (closer to zero) is better.
    """
    return np.array([
        cross_val_score(
            estimator_cls(alpha=alpha),
            X_train,
            y_train,
            cv=kf,
            scoring='neg_root_mean_squared_error',
        ).mean()
        for alpha in alphas
    ])


lasso_scores = _mean_cv_scores(Lasso)
ridge_scores = _mean_cv_scores(Ridge)

plt.plot(alphas, lasso_scores, label='Lasso')
plt.plot(alphas, ridge_scores, label='Ridge')

# The alphas are log-spaced, so a log x-axis spreads them evenly.
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Negative RMSE')
# Without a legend the two curves cannot be told apart.
plt.legend()
plt.show()

In [None]:
# Scores are negative RMSE, so the largest score marks the best alpha.
best_lasso_alpha = alphas[np.argmax(lasso_scores)]
best_lasso_alpha

In [None]:
# Scores are negative RMSE, so the largest score marks the best alpha.
best_ridge_alpha = alphas[np.argmax(ridge_scores)]
best_ridge_alpha

# Check RMSE on validation set

In [None]:
def print_errors(model):
    """Print the RMSE of ``model`` on the training and validation splits.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``. It is evaluated on the
        notebook-level ``X_train``/``y_train`` and ``X_val``/``y_val`` arrays.
    """
    # Arguments follow scikit-learn's (y_true, y_pred) order. The root is taken
    # explicitly because the `squared=False` keyword is deprecated and removed
    # in newer scikit-learn releases.
    train_loss = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
    val_loss = np.sqrt(mean_squared_error(y_val, model.predict(X_val)))
    print(f'Train RMSE = {train_loss}')
    print(f'Valid RMSE = {val_loss}')

In [None]:
# Fit both regularized models on the training split with their tuned alphas.
lasso_model = Lasso(alpha=best_lasso_alpha)
lasso_model.fit(X_train, y_train)

ridge_model = Ridge(alpha=best_ridge_alpha)
ridge_model.fit(X_train, y_train)

In [None]:
# Train/validation RMSE of the Ridge model.
print('Ridge:')
print_errors(model=ridge_model)

In [None]:
# Train/validation RMSE of the Lasso model.
print('Lasso:')
print_errors(model=lasso_model)

# Training final models

In [None]:
# Refit both models on all scaled training rows (train + validation splits).
lasso_model = Lasso(alpha=best_lasso_alpha)
lasso_model.fit(data, target)

ridge_model = Ridge(alpha=best_ridge_alpha)
ridge_model.fit(data, target)

# Make Prediction and Save Submission

In [None]:
# Build one submission frame per model: test ids paired with predictions.
test_ids = test_df['id']
lasso_submission = pd.DataFrame({'id': test_ids, 'target': lasso_model.predict(test)})
ridge_submission = pd.DataFrame({'id': test_ids, 'target': ridge_model.predict(test)})

# Print whether each submission's ids match the sample submission's ids.
for submission in (lasso_submission, ridge_submission):
    print((sam_sub_df['id'] == submission['id']).all())

# An empty output directory means the files land in the working directory.
output_dir = ''

for file_name, submission in (
    ('submission_lasso.csv', lasso_submission),
    ('submission_ridge.csv', ridge_submission),
):
    submission.to_csv(os.path.join(output_dir, file_name), index=False)