# Setup


In [None]:
import numpy as np
import pandas as pd

from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
        
# Directory the competition CSV files (train, test, sample submission) are read from.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# Read in the data files

In [None]:
# Load the training data, using the `id` column as the index, and preview it.
train = pd.read_csv(input_path / 'train.csv', index_col='id')
display(train.head())
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the summary statistics have to go through display() to be shown.
display(train.describe())

# Load the test features the final predictions are made on.
test = pd.read_csv(input_path / 'test.csv', index_col='id')
display(test.head())

# Load the sample submission; its `target` column is filled in with
# predictions at the end of the notebook.
submission = pd.read_csv(input_path / 'sample_submission.csv', index_col='id')
display(submission.head())

## Pull out the target, and make a validation split

In [None]:
# Separate the label column from the features (this mutates `train` in place).
target = train.pop('target')
# Hold out 40% of the rows for validation. The fixed seed makes the split, and
# therefore every score computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=2021
)

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter-plot predictions against true values, with the RMSE in the title.

    Args:
        name: Label for the model, used as the prefix of the plot title.
        y: True target values.
        yhat: Predicted values, in the same order as ``y``.
        num_to_plot: Number of leading points drawn in the scatter plot. The
            score is always computed on all points.
        lims: ``(low, high)`` limits applied to both axes; also the end points
            of the diagonal reference line.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # Take the square root explicitly rather than passing ``squared=False``:
    # that keyword was deprecated and later removed from scikit-learn, while
    # this form gives the same RMSE on every version.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    # Diagonal reference line: a perfect prediction falls exactly on it.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

# Modify the Random Forest Method

Added the `min_samples_split` parameter to the random forest, which improved the result.

In [None]:
# Baseline regressors to compare, keyed by the label shown on each plot.
candidates = {
    "Dummy Median": DummyRegressor(strategy='median'),
    "Linear": LinearRegression(fit_intercept=False),
    "Lasso": Lasso(fit_intercept=False),
    "Random Forest": RandomForestRegressor(
        n_estimators=10, n_jobs=-1, min_samples_split=0.01
    ),
}
model_names = list(candidates)
models = list(candidates.values())

# Fit each candidate on the training split, predict the held-out split and
# plot the predictions against the true values.
for name, model in candidates.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

# Try to use Light GBM

Because I have only just completed the Intro to Machine Learning course, I only know the Decision Tree and Random Forest methods, and I don't think they are enough to get the error below 0.7 :)).
After searching the discussions on the competition's page, I found the LightGBM (LGBM) method, and I'll try the parameters from these pages:
https://medium.com/@imamun/lgbm-and-feature-extraction-ae87fe83ea77
https://www.kaggle.com/shogosuzuki/0-69713-lightgbm-with-small-learning-rate

In [None]:
# Wrap both splits as LightGBM datasets. The validation set is tied to the
# training set through `reference`, as the Dataset documentation asks for
# validation data.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)
param = {
    'seed': 2021,
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'feature_pre_filter': False,
    'lambda_l1': 6.540486456085813,
    'lambda_l2': 0.01548480538099245,
    'num_leaves': 256,
    'feature_fraction': 0.52,
    'bagging_fraction': 0.6161835249194311,
    'bagging_freq': 7,
    'min_child_samples': 20,
    'learning_rate' : 0.001,
    'early_stopping_round' : 1000,
    'num_iterations' : 20000
}

# The round budget and early-stopping patience come from `param`
# ('num_iterations' and 'early_stopping_round'). They used to be repeated here
# as keyword arguments with different values (5000 / 100); the values in
# `param` take precedence, so the keywords only obscured the real settings,
# and recent LightGBM releases no longer accept an `early_stopping_rounds`
# keyword at all.
lgb_model = lgb.train(param,
                      lgb_train,
                      valid_sets=[lgb_valid])



# Make Submission File

In [None]:
# Predict the competition test set with the trained LightGBM model, store the
# predictions in the submission table and write it out as a CSV file.
test_predictions = lgb_model.predict(test)
submission['target'] = test_predictions
submission.to_csv('my_submission.csv')

In [None]:
# Preview the first rows of the submission table as a final check.
submission.head()