In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in names):
        print(full_path)
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
        
# Directory holding the competition files; train.csv, test.csv and
# sample_submission.csv are all read relative to this path in later cells.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# Read in the data files

In [None]:
# Training table (features plus the `target` column), indexed by the `id` column.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')

In [None]:
import seaborn as sns

# Pairwise correlations of every column in `train`, drawn as an annotated heatmap.
corr_matrix = train.corr()
plt.figure(figsize=(14, 6))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Test table to predict on, indexed by the `id` column.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')

In [None]:
# Submission template, indexed by `id`; its `target` column is filled in at the end.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')

## Pull out the target, and make a validation split

In [None]:
# Separate the label column from the features. NOTE: `pop` removes the column
# from `train` in place, so this cell can only be run once per kernel session.
target = train.pop('target')
# 60% of the rows are used for fitting, the remaining 40% for validation.
# The split is seeded so that every model's score is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=2021)

## Plot results

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values and show the RMSE in the title.

    Parameters
    ----------
    name : str
        Label for the model, shown in the plot title.
    y : array-like
        True target values.
    yhat : array-like
        Predicted values, aligned with ``y``.
    num_to_plot : int
        Only the first ``num_to_plot`` points are drawn; the score uses all points.
    lims : tuple
        (min, max) limits applied to both axes and to the identity line.
    figsize : tuple
        Figure size passed to matplotlib.
    """
    # Work on plain arrays so the slicing below is positional regardless of
    # whatever index a pandas input carries.
    y_true = np.asarray(y)
    y_hat = np.asarray(yhat)

    # RMSE is taken explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    score = np.sqrt(mean_squared_error(y_true, y_hat))

    plt.figure(figsize=figsize)
    plt.scatter(y_true[:num_to_plot], y_hat[:num_to_plot])
    # Identity line: a perfect model would put every point on it.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.xlabel('actual')
    plt.ylabel('predicted')
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

## Examples

In [None]:
# Baseline estimators kept as (label, estimator) pairs so a label can never
# drift out of step with its model.
baselines = [
    ("Dummy Median", DummyRegressor(strategy='median')),
    ("Linear", LinearRegression(fit_intercept=False)),
    ("Lasso", Lasso(fit_intercept=False)),
    ("Random Forest", RandomForestRegressor(n_estimators=50, n_jobs=-1)),
]
model_names = [label for label, estimator in baselines]
models = [estimator for label, estimator in baselines]

# Fit each baseline on the training split and plot its validation predictions.
for name, model in baselines:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    plot_results(name, y_test, y_pred)

## XGBoost
Read about parameters here: https://xgboost.readthedocs.io/en/latest/parameter.html

In [None]:
from xgboost import XGBRegressor
import xgboost as xgb

In [None]:
# Gradient-boosted trees with a small learning rate and many rounds; training
# stops early once the validation metric has not improved for 6 rounds.
# `metric_period` (not an XGBoost parameter) and `silent` (the retired spelling
# of `verbosity`) were dropped: they only triggered "might not be used" warnings.
regressor = xgb.XGBRegressor(colsample_bytree=0.5,
                 reg_alpha=0.01,      # L1 regularisation (native alias: alpha)
                 reg_lambda=0.003,    # L2 regularisation
                 learning_rate=0.01,
                 max_depth=15,
                 min_child_weight=257,
                 n_estimators=1000,
                 subsample=0.7,
                 random_state=2020,
                 verbosity=0)

# NOTE(review): newer xgboost releases expect `early_stopping_rounds` on the
# constructor instead of fit() - move it if the installed version rejects it here.
regressor.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=1)

y_pred = regressor.predict(X_test)
plot_results("XGBRegressor", y_test, y_pred)


## LightGBM
Read about parameters here: https://lightgbm.readthedocs.io/en/latest/Parameters.html

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the training and validation splits in LightGBM's dataset containers.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_valid = lgb.Dataset(X_test, y_test)

# Number of boosting rounds and early-stopping patience are configured here and
# only here. The train() call previously also passed num_boost_round=5000 and
# early_stopping_rounds=100, which conflicted with these entries (LightGBM
# prefers the values found in `param`, so the effective settings are unchanged).
param = {
    'seed': 2021,
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'feature_pre_filter': False,
    'lambda_l1': 6.540486456085813,
    'lambda_l2': 0.01548480538099245,
    'num_leaves': 256,
    'feature_fraction': 0.52,
    'bagging_fraction': 0.6161835249194311,
    'bagging_freq': 7,
    'min_child_samples': 20,
    'learning_rate' : 0.001,
    'early_stopping_round' : 1000,
    'num_iterations' : 20000
}

lgb_model = lgb.train(param,
                      lgb_train,
                      valid_sets=[lgb_valid])

# Predict with the LightGBM model itself; the plot previously reused `y_pred`
# left over from the XGBoost cell, so it showed the wrong model's results.
y_pred = lgb_model.predict(X_test)
plot_results('LightGBM', y_test, y_pred)

## PyTorch

In [None]:
import torch
from torch import nn

In [None]:
class linearRegression(nn.Module):
    """Two stacked linear layers with no activation in between.

    ``head`` maps ``inputSize`` features to ``inputSize // 2`` units and
    ``out`` maps those units to ``outputSize`` values.
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        hidden_units = inputSize // 2
        self.head = nn.Linear(inputSize, hidden_units)
        self.out = nn.Linear(hidden_units, outputSize)

    def forward(self, x):
        """Apply ``head`` and then ``out`` to ``x`` and return the result."""
        hidden = self.head(x)
        return self.out(hidden)

In [None]:
# Network sized to the number of feature columns, producing one value per row.
model = linearRegression(X_train.shape[-1], 1)
# The training and evaluation cells place their tensors on the GPU whenever
# one is available, so the model's parameters have to live there as well;
# otherwise the forward pass fails with a device mismatch. This is done here,
# before the optimizer is constructed over model.parameters().
if torch.cuda.is_available():
    model = model.cuda()

In [None]:
# Mean-squared-error objective, optimised with AdamW at a fixed step size.
criterion = torch.nn.MSELoss()
learningRate = 3e-4
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learningRate)

In [None]:
# Convert the training split to float tensors, then move both to the GPU
# when one is available.
inputs = torch.Tensor(np.array(X_train))
labels = torch.Tensor(np.array(y_train))
if torch.cuda.is_available():
    inputs, labels = inputs.cuda(), labels.cuda()

In [None]:
# Mini-batch training over contiguous slices of the training tensors.
N_EPOCHS = 10
BATCH_SIZE = 64

for epoch in range(N_EPOCHS):

    for start in range(0, len(labels), BATCH_SIZE):
        # Batch names are deliberately distinct from the notebook-level `target`
        # Series so that this loop does not overwrite it with a tensor.
        batch_x = inputs[start:start + BATCH_SIZE]
        batch_y = labels[start:start + BATCH_SIZE]

        optimizer.zero_grad()
        # Flatten the (batch, 1) model output to match the 1-D label slice.
        outputs = model(batch_x).view(len(batch_y))

        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch, not an epoch average.
    print('epoch {}, loss {}'.format(epoch, loss.item()))

In [None]:
with torch.no_grad():  # gradients are not needed when only evaluating
    use_gpu = torch.cuda.is_available()
    eval_inputs = torch.Tensor(np.array(X_test))
    if use_gpu:
        eval_inputs = eval_inputs.cuda()
    predicted = model(eval_inputs).view(len(y_test))
    if use_gpu:
        # Bring predictions back to the CPU, where the reference labels are built.
        predicted = predicted.cpu()
    # Prints the criterion (MSE) on the validation split.
    print(criterion(predicted, torch.Tensor(np.array(y_test))))

# Predict on the test set and write the submission

In [None]:
# Alternative predictors for the submission (swap in as needed):
# submission['target'] = regressor.predict(test)
# submission['target'] = lgb_model.predict(test)

# Inference only, so skip gradient tracking. The input is placed on whatever
# device the model's parameters live on, so this works on both CPU and GPU.
with torch.no_grad():
    model_device = next(model.parameters()).device
    test_inputs = torch.Tensor(np.array(test)).to(model_device)
    test_pred = model(test_inputs).view(-1).cpu().numpy()

# Assign a plain NumPy array rather than a torch tensor to the DataFrame column.
submission['target'] = test_pred
submission.to_csv('res.csv')