# EDA

## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Import Data

In [None]:
# Load the competition training set and preview its first rows.
train = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/train.csv"
)
train.head()

In [None]:
# Read the test set from disk once. `test_data` keeps the untouched frame
# (its `id` column is needed for the submission file); `test` is an
# independent working copy that later cells modify.
test_data = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")
test = test_data.copy()
test.head()

## Check for Missing Values

In [None]:
# Print the number of missing values per column, train first and then test,
# separated by an 80-character rule.
print(train.isnull().sum(), "-" * 80, test.isnull().sum(), sep="\n")

In [None]:
# Display summary statistics of the training data.
train.describe()

In [None]:
# Remove the `id` column from both working frames, in place.
for frame in (train, test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
# Preview the training frame after the `id` column has been dropped.
train.head()

## Check Distribution of All Features

In [None]:
## Check distribution of target feature

sns.set_style("dark")
sns.set_color_codes(palette='deep')
fig, ax = plt.subplots(figsize=(9, 8))

# NOTE(review): `distplot` is deprecated in newer seaborn releases; consider
# migrating to `histplot`/`displot` once the installed version is confirmed.
sns.distplot(train['target'], color='b', ax=ax)

ax.xaxis.grid(False)
ax.set(ylabel='values', xlabel='target', title='Target Distribution')
plt.show()

In [None]:
# Names of the continuous feature columns: cont1 .. cont14.
features = list(map('cont{}'.format, range(1, 15)))
print(features)

In [None]:
## Check distribution of independent features
i=1
plt.figure()

fig, ax = plt.subplots(5, 3, figsize = (14, 24))

for feature in features:
    plt.subplot(5,3,i)
    sns.distplot(train[feature], color = 'blue', kde = True, bins = 120, label = 'train')
    sns.distplot(test[feature], color = 'green', kde = True, bins = 120, label = 'test')
    plt.xlabel(feature, fontsize = 9); plt.legend()
    i=i+1
plt.show()

## Check Correlation of All Features

In [None]:
# Display the pairwise correlation matrix of the training columns.
train.corr()

In [None]:
# Correlation matrix of the training columns, rendered as a heatmap.
corr = train.corr()

heat_fig, heat_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr, vmax=0.9, cmap='viridis', square=True, ax=heat_ax)

## Find and Replace Outliers

In [None]:
# Side-by-side box plots (train vs. test) for every feature column, one
# figure per column. `target` is excluded by name rather than by position, so
# the loop does not depend on it being the last column of `train`.
for col in train.columns.drop('target'):
    plt.boxplot([train[col], test[col]], labels=['train', 'test'])
    plt.title(col)
    # No legend call: the boxes are identified by their tick labels, and no
    # artist carries a legend label.
    plt.show()

In [None]:
# Box plot of the training target.
plt.boxplot(train['target'])

In [None]:
def replace_outliers(data):
    """Replace IQR outliers in every numeric column with the column median.

    A value is treated as an outlier when it is below ``Q1 - 1.5 * IQR`` or
    above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of
    the column and ``IQR = Q3 - Q1``. The median is computed from the column
    as it is before any replacement.

    Non-numeric columns are skipped, because quantiles are not defined for
    them.

    Args:
        data: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in data.select_dtypes(include="number").columns:
        values = data[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        is_outlier = (values < lower) | (values > upper)
        data.loc[is_outlier, col] = values.median()
    return data

In [None]:
# Apply the IQR-based outlier replacement to both frames.
# NOTE(review): `train` still contains the `target` column at this point, so
# the target values are processed as well — confirm this is intended.
train, test = replace_outliers(train), replace_outliers(test)

In [None]:
# Re-draw the train vs. test box plots for every feature column after the
# outlier replacement, one figure per column. `target` is excluded by name
# rather than by position.
for col in train.columns.drop('target'):
    plt.boxplot([train[col], test[col]], labels=['train', 'test'])
    plt.title(col)
    # No legend call: the boxes are identified by their tick labels, and no
    # artist carries a legend label.
    plt.show()

In [None]:
## Check distribution of target feature
# Same plot as before, drawn again now that outliers have been replaced.

sns.set_style("dark")
sns.set_color_codes(palette='deep')
fig, ax = plt.subplots(figsize=(9, 8))

# NOTE(review): `distplot` is deprecated in newer seaborn releases; consider
# migrating to `histplot`/`displot` once the installed version is confirmed.
sns.distplot(train['target'], color='b', ax=ax)

ax.xaxis.grid(False)
ax.set(ylabel='values', xlabel='target', title='Target Distribution')
plt.show()

# Building the Model

## Split our Data into Training Set and Test Set

In [None]:
# Separate the predictors (every column except `target`) from the label.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 18% of the labelled rows for evaluation. Note that `X_test` and
# `y_test` are this hold-out split, not the competition `test` frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.18,
    random_state=33,
)

In [None]:
# Display the training split of the predictors.
X_train

## Grid Search for LGB model

In [None]:
#import lightgbm as lgb
#from sklearn.model_selection import GridSearchCV
#parameters = {
#              'max_depth'    :    [4,6,8,10,12,14,18,20],
#              'learning_rate':    [0.005, 0.01, 0.035, 0.05, 0.1, 0.15, 0.2],
#              'n_estimators' :    [300, 800, 1000, 1800, 3000, 4100, 5000]
#             }
#LGB = lgb.LGBMRegressor()

#grid = GridSearchCV(estimator = LGB, param_grid = parameters, cv = 3, n_jobs = -1)
#grid.fit(X,y)
#print("Results from Grid Search")
#print("\n The best estimator across All search params:\n", grid.best_estimator_)
#print("\n The best score across All search params:\n", grid.best_score_)
#print("\n The best parameters across All search params:\n", grid.best_params_)

## Train Model on the Training Data

In [None]:
import lightgbm as lgb

# LightGBM regressor with fixed hyperparameters, fitted on the training split.
#
# Changes from the previous version of this cell:
# * `max_dept` was a typo for `max_depth`, so the misspelled key was passed
#   through as an unknown parameter. The value (-1) is unchanged.
# * NOTE(review): `min_child_samples` is the sklearn-style alias of
#   `min_data_in_leaf`. The cell previously passed both (20 and 100); only
#   `min_data_in_leaf=100` is kept — confirm this is the intended value.
LGB = lgb.LGBMRegressor(
    random_state=33,
    n_estimators=4800,
    min_data_per_group=5,
    boosting_type='gbdt',
    num_leaves=246,
    max_depth=-1,
    learning_rate=0.005,
    subsample_for_bin=200000,
    lambda_l1=1.074622455507616e-05,
    lambda_l2=2.0521330798729704e-06,
    n_jobs=-1,
    cat_smooth=1.0,
    importance_type='split',
    metric='rmse',
    min_gain_to_split=0.0,
    feature_fraction=0.5,
    bagging_freq=6,
    min_sum_hessian_in_leaf=0.001,
    min_data_in_leaf=100,
    bagging_fraction=0.82063411,
)

LGB.fit(X_train, y_train)

## Make Prediction

In [None]:
# Predict the target for the hold-out split created by train_test_split.
y_pred =LGB.predict(X_test)

## Evaluate the Trained Model (RMSE)

In [None]:
from sklearn.metrics import mean_squared_error

# Root mean squared error of the hold-out predictions.
mse_LGB = mean_squared_error(y_test, y_pred)
rmse_LGB = np.sqrt(mse_LGB)
rmse_LGB

## Predict the Results for Test Data

In [None]:
# Predict the target for the competition test frame and display the result.
pred_LGB = LGB.predict(test)
pred_LGB

## Save Results to a CSV File

In [None]:
# Build the submission frame: the ids come from the untouched `test_data`
# frame (the working `test` frame had its `id` column dropped), paired with
# the model predictions.
output = pd.DataFrame(
    {
        'id': test_data['id'],
        'target': pred_LGB,
    }
)
output.to_csv('Kaggle_Playground_Submission_25_01_Trial5.csv', index=False)