In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import OrdinalEncoder # ordinal encoding categorical variables
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error #MSE metric

import matplotlib.pyplot as plt
import seaborn as sns

# Single random seed reused by the K-fold splitter and the XGBoost model
SEED = 91

# Let pandas render up to 500 columns when a frame is displayed
pd.set_option('display.max_columns', 500)

# List every file available under the Kaggle input directory
import os
for input_dir, _, input_files in os.walk('/kaggle/input'):
    for input_file in input_files:
        print(os.path.join(input_dir, input_file))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Load Data

In [None]:
# Locations of the competition files, relative to the notebook's working directory
train_csv = "../input/30daysofml/train.csv"
test_csv = "../input/30daysofml/test.csv"

# The first column of each file becomes the row index
df_train = pd.read_csv(train_csv, index_col=0)
df_test = pd.read_csv(test_csv, index_col=0)

df_train.head()

In [None]:
# Summarise the training frame with pandas' DataFrame.info()
df_train.info()

In [None]:
# Summarise the test frame with pandas' DataFrame.info()
df_test.info()

## 2. EDA

In [None]:
# Column groups used by the EDA and preprocessing cells.
# Categorical columns (ordinal-encoded before modelling).
CAT_FEATURES = ['cat0','cat1','cat2','cat3','cat4','cat5',
                'cat6','cat7','cat8','cat9']
# Continuous columns.
NUM_FEATURES = ['cont0','cont1','cont2','cont3','cont4',
               'cont5','cont6','cont7','cont8','cont9',
               'cont10','cont11','cont12','cont13']
# Flat list of every feature name. Wrapping the concatenation in an extra pair
# of brackets would produce a one-element nested list instead of 24 names.
ALL_FEATURES = CAT_FEATURES + NUM_FEATURES

### Categorical Features

In [None]:
# Train Data: percentage of training rows at each level of every categorical feature
plt.figure(figsize=(20, 8))
plt.subplots_adjust(hspace=0.5, wspace=0.5)

for i, col in enumerate(CAT_FEATURES):
    # One panel per categorical column on a 2 x 5 grid
    ax = plt.subplot(2, 5, i + 1)
    levels = np.sort(df_train[col].unique())
    # The estimator ignores the "target" values themselves: each bar height is
    # the group's row count expressed as a percentage of the whole frame.
    sns.barplot(
        data=df_train,
        x=col,
        y="target",
        order=levels,
        estimator=lambda x: len(x) / len(df_train) * 100,
        ax=ax,
    )
    ax.set_title(col)

plt.show()


In [None]:
# Test Data: percentage of test rows at each level of every categorical feature
plt.figure(figsize=(20, 8))
plt.subplots_adjust(hspace=0.5, wspace=0.5)

for i, col in enumerate(CAT_FEATURES):
    # One panel per categorical column on a 2 x 5 grid
    ax = plt.subplot(2, 5, i + 1)
    levels = np.sort(df_test[col].unique())
    # "cont0" only serves as the y column handed to the estimator; the bar
    # height is the group's row count as a percentage of the test frame.
    sns.barplot(
        data=df_test,
        x=col,
        y="cont0",
        order=levels,
        estimator=lambda x: len(x) / len(df_test) * 100,
        ax=ax,
    )
    ax.set_title(col)

plt.show()


### Continuous features

In [None]:
# Summary statistics for the continuous feature columns of the training frame
df_train[NUM_FEATURES].describe()

In [None]:
# Pairwise plots for the continuous features from position 7 onward, plus the target
pairplot_columns = NUM_FEATURES[7:] + ['target']
sns.pairplot(df_train[pairplot_columns])

### Target

In [None]:
# Summary statistics for the target column
df_train['target'].describe()

## 3. Data Preprocessing

In [None]:
# Separate the regression target from the predictor columns
y = df_train['target']
X = df_train.drop(columns=['target'])

# Preprocess a copy so that df_test itself stays as loaded
X_test = df_test.copy()

X.head()

### Handling categorical data using Ordinal Encoder

In [None]:
# Learn the category -> integer code mapping from the training rows only
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X[CAT_FEATURES])

# Apply that same fitted mapping to the training and the test features
X[CAT_FEATURES] = ordinal_encoder.transform(X[CAT_FEATURES])
X_test[CAT_FEATURES] = ordinal_encoder.transform(X_test[CAT_FEATURES])

X.head()

## 4. Modelling using XGBRegressor

In [None]:
# Keyword arguments unpacked into XGBRegressor(**xgb_params) for every fold
xgb_params = dict(
    booster='gbtree',
    n_estimators=10000,     # upper bound; training is fitted with early stopping
    learning_rate=0.05,
    reg_lambda=10,
    reg_alpha=26,
    subsample=0.9,
    colsample_bytree=0.12,
    max_depth=3,
    random_state=SEED,      # same seed as the K-fold splitter
)

## 5. Prediction and Evaluation of RMSE

In [None]:
N_FOLD = 7

# Shuffled K-fold splitter, seeded so the fold assignment is reproducible
kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

# Accumulators filled in by the loop below
oof_preds = np.zeros(len(X))  # out-of-fold prediction for every training row
predictions = 0               # running mean of the test-set predictions
model_fi = 0                  # running mean of the feature importances
mean_rmse = 0                 # running mean of the per-fold validation RMSE

# Train one model per fold and score it on that fold's held-out rows
for num, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = XGBRegressor(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='rmse',
        early_stopping_rounds=50,
        verbose=False,
    )

    # Every fold contributes 1/N_FOLD of the averaged test predictions
    predictions += model.predict(X_test) / N_FOLD

    # ... and 1/N_FOLD of the averaged feature importances
    model_fi += model.feature_importances_ / N_FOLD

    # Store this fold's held-out predictions in their original row positions
    oof_preds[valid_idx] = model.predict(X_valid)

    # Validation RMSE of this fold's model
    fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_idx]))
    print(f"Fold {num} | RMSE: {fold_rmse}")

    mean_rmse += fold_rmse / N_FOLD

# Note: this is the mean of the per-fold RMSE values accumulated above
print(f"\nOverall RMSE: {mean_rmse}")

## 6. Submitting Predictions

In [None]:
# Pair each test-row index value with its fold-averaged prediction
submission_columns = {'Id': df_test.index, 'target': predictions}
output = pd.DataFrame(submission_columns)

# Write the submission file without the frame's own row index
output.to_csv('submission.csv', index=False)