In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
        
# Directory the competition CSVs are read from (train.csv, test.csv and
# sample_submission.csv are all loaded relative to this path below).
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')

# Read in the data files

In [None]:
# Load the training table, using the `id` column as the row index,
# and preview its first rows.
train = pd.read_csv(
    filepath_or_buffer=input_path / 'train.csv',
    index_col='id',
)
display(train.head(n=5))

In [None]:
# Load the test table, using the `id` column as the row index,
# and preview its first rows.
test = pd.read_csv(
    filepath_or_buffer=input_path / 'test.csv',
    index_col='id',
)
display(test.head(n=5))

In [None]:
# Load the sample submission (indexed by `id`); its `target` column is
# overwritten with model predictions later in the notebook.
submission = pd.read_csv(
    filepath_or_buffer=input_path / 'sample_submission.csv',
    index_col='id',
)
display(submission.head(n=5))

# Checking the data types of the columns

In [None]:
# Show the dtype pandas inferred for each column of the training frame.
train.dtypes

# Inspecting the Dataset

In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
train.info()

In [None]:
# Descriptive statistics of the training frame before any encoding.
train.describe()

# Distributions of Columns

In [None]:
# Partition the training frame by dtype: object columns are treated as
# categorical, every other column as numeric.
num_df = train.select_dtypes(exclude="object")
cat_df = train.select_dtypes(include="object")
cat_cols, num_cols = cat_df.columns, num_df.columns

In [None]:
# Count plot for every categorical column of the training set on a two-column
# grid. The row count is derived from the number of columns so the grid always
# has room: a fixed 5x2 grid makes plt.subplot raise ValueError as soon as an
# 11th column is plotted.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(cat_cols) // n_plot_cols))  # ceiling division
fig = plt.figure(figsize=(20, 20))
for position, col in enumerate(cat_cols, start=1):
    ax = plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.countplot(x=col, data=cat_df, ax=ax)
    # Label each panel so the figure is readable on its own.
    ax.set_title(f"train: {col}")
plt.show()

In [None]:
# Partition the test frame by dtype: object columns are treated as
# categorical, every other column as numeric.
test_num_df = test.select_dtypes(exclude="object")
test_cat_df = test.select_dtypes(include="object")
test_cat_cols, test_num_cols = test_cat_df.columns, test_num_df.columns

In [None]:
# Count plot for every categorical column of the test set on a two-column
# grid. The row count is derived from the number of columns so the grid always
# has room: a fixed 5x2 grid makes plt.subplot raise ValueError as soon as an
# 11th column is plotted.
n_plot_cols = 2
n_plot_rows = max(1, -(-len(test_cat_cols) // n_plot_cols))  # ceiling division
fig = plt.figure(figsize=(20, 20))
for position, col in enumerate(test_cat_cols, start=1):
    ax = plt.subplot(n_plot_rows, n_plot_cols, position)
    sns.countplot(x=col, data=test_cat_df, ax=ax)
    # Label each panel so the figure is readable on its own.
    ax.set_title(f"test: {col}")
plt.show()

In [None]:
# Remove the same categorical columns from both frames so train and test keep
# matching feature sets.
# NOTE(review): the reason for choosing these five columns is not recorded
# here - presumably based on the count plots above; confirm.
to_drop = ['cat0', 'cat2', 'cat4', 'cat6', 'cat7']
train = train.drop(columns=to_drop)
test = test.drop(columns=to_drop)

## We need to encode the categoricals.

There are different strategies to accomplish this, and different approaches will have different performance when using different algorithms. For this starter notebook, we'll use simple label encoding, which maps each category to an integer code.

In [None]:
# Replace every object-typed column with integer codes from a LabelEncoder.
# The encoder is fitted on the union of the train and test values: fitting on
# train alone makes `transform` raise ValueError as soon as the test set
# contains a category that never appears in train. When test adds no new
# categories the resulting codes are identical to a train-only fit, because
# LabelEncoder orders its classes by sorted unique value.
for col in train.columns:
    if train[col].dtype == 'object':
        encoder = LabelEncoder()
        encoder.fit(list(train[col].values) + list(test[col].values))
        train[col] = encoder.transform(train[col].values)
        test[col] = encoder.transform(test[col].values)

display(train.head())

In [None]:
# Descriptive statistics again, now that the object columns have been replaced
# by integer codes in the encoding cell.
train.describe()

In [None]:
# Pairwise correlations between all columns of the training frame (the target
# is still part of it at this point), drawn as a heatmap.
corr_matrix = train.corr()
sns.heatmap(corr_matrix)

## Pull out the target, and make a validation split

In [None]:
# Separate the label from the features. `pop` removes the column from `train`
# in place, so the guard keeps this cell safe to re-run in a live kernel
# (a second unguarded `pop` would raise KeyError).
if 'target' in train.columns:
    target = train.pop('target')

# Hold out 10% of the rows for validation. The fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.90, random_state=42
)

In [None]:
# LightGBM regressor with an RMSE objective, 100 estimators and max_depth 10.
model = lgb.LGBMRegressor(
    objective='rmse',
    n_estimators=100,
    max_depth=10,
)

In [None]:
# 10-fold cross-validation on the full training frame. The scorer returns
# negated MSE per fold, so flip the sign of the fold average and take the
# square root to report an RMSE.
scores = cross_val_score(
    model,
    train,
    target,
    scoring='neg_mean_squared_error',
    cv=10,
)
cv_rmse = np.sqrt(-scores.mean())
print(cv_rmse)

In [None]:
# Fit on every training row, predict the test set, and write the predictions
# into the sample-submission frame before saving it as lgbm.csv.
model.fit(train, target)
test_predictions = model.predict(test)
submission['target'] = test_predictions
submission.to_csv('lgbm.csv')

In [None]:
# RMSE on the same rows the model was fitted on. This is an in-sample figure,
# so it is typically lower than the cross-validated RMSE and should not be
# read as an estimate of leaderboard performance.
# `mean_squared_error` is already imported in the first cell, so it is not
# re-imported here.
np.sqrt(mean_squared_error(target, model.predict(train)))

## Now you should save your Notebook (blue button in the upper right), and then when that's complete go to the notebook viewer and make a submission to the competition. :-)

## There's lots of room for improvement. What things can you try to get a better score?