In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Outline of This Notebook
1. ### EDA - Exploratory Data Analysis
    1. [Quick Peak](#Quick-Peak)
    1. [Quantitative and Qualitative Data](#Quantitative-and-Qualitative-Data)
    1. [Visualization on Binary Data](#Visualization-on-Binary-Data)
    1. [Missing Values](#Missing-Values)
    
1. ### Data Processing
    1. [Normalization of Dependant Variable - "y"](#Normalization-of-Dependant-Variable---"y")
    1. [Removal of One Unique Values](#Removal-of-One-Unique-Values)
    1. [Eliminating Biased Feature](#Eliminating-Biased-Feature)
    1. [Split The Data Back into Train and Test Dataset](#Split-The-Data-Back-into-Train-and-Test-Dataset)
    
1. ### Model Building
    1. [Train_test_split](#Train_test_split)
    1. [GridSearchCV Best Parameters for Below Models](#GridSearchCV-Best-Parameters-for-Below-Models)
    1. [Evaluation of the Optimized Estimators](#Evaluation-the-Optimized-Estimators)
    1. [Visualization of Estimator's CV score](#Visualization-of-Estimator's-CV-score)
    
    
## [Submission](#Submission)

### Quick Peak

In [None]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore") ## ignore warnings

In [None]:
## Load the train dataset from the zipped CSV
train_csv_path = "../input/mercedes-benz-greener-manufacturing/train.csv.zip"
df_train = pd.read_csv(train_csv_path)

## Quick peek at the train dataset: its dimensions, then the first rows
print("Shape of Train Dataset : {}".format(df_train.shape))
df_train.head()

In [None]:
## Load the test dataset from the zipped CSV
test_csv_path = "../input/mercedes-benz-greener-manufacturing/test.csv.zip"
df_test = pd.read_csv(test_csv_path)

## Quick peek at the test dataset: its dimensions, then the first rows
print("Shape of Test Dataset : {}".format(df_test.shape))
df_test.head()

In [None]:
## Remove the "ID" column from both frames.
## Note: re-running this cell without reloading the data fails, because "ID" is already gone.
id_col = "ID"
df_train = df_train.drop(columns=id_col)
df_test = df_test.drop(columns=id_col)

### Quantitative and Qualitative Data

In [None]:
## Quantitative data: every column of the train frame that holds numeric values.
## `np.object` is not used here: that alias was deprecated in NumPy 1.20 and removed
## in 1.24, where referencing it raises AttributeError.
quantitative_data = [
    feat for feat in df_train.columns
    if pd.api.types.is_numeric_dtype(df_train[feat])
]

print("A total of {} quantitative columns".format(len(quantitative_data)))
print(quantitative_data)

Two types of Quantitative data:

1. Continuous data: Numerical data that can take on any value. A great example would be the height of a person: Donald is 6 feet, or 182.88 cm, tall.

1. Discrete data: Numerical data that has specific values. A great example would be the number of dogs. The number of dogs are counted as 1 dog, 2 dogs, 3 dogs. There is no such thing as 0.5 dog.


In [None]:
## Continuous data: quantitative columns with more than 25 distinct values
quant_unique_counts = df_train[quantitative_data].nunique()
continuous_feats = quant_unique_counts[quant_unique_counts > 25].index.tolist()

continuous_feats

Our target variable `y` is continuous data.

In [None]:
## Discrete data: quantitative columns with at most 25 distinct values
quant_unique_counts = df_train[quantitative_data].nunique()
discrete_feats = quant_unique_counts[quant_unique_counts <= 25].index.tolist()

discrete_feats

****Binary data****: Numerical data or categorical data that has two unique values represents one of two conceptually opposed values. Eg. "Success" or "Failure" (in categorical data), "1" or "0" (in numerical data).

**One Unique Value Column**: Any column in the dataset that contains the same value in every row. A one-unique-value column does not help your model differentiate between observations; it can even negatively affect your model by creating bias in the data.



Determine whether the discrete features consist of binary data or one-unique-value columns.

In [None]:
## Count the distinct values of every discrete column once, then bucket by that count
discrete_unique_counts = df_train[discrete_feats].nunique()

## Binary data: exactly two distinct values
binary_feats = discrete_unique_counts[discrete_unique_counts == 2].index.tolist()

## One-unique-value columns: a single distinct value
one_uni_feats = discrete_unique_counts[discrete_unique_counts == 1].index.tolist()

print("Number of Binary column: {}".format(len(binary_feats)))
print("Number of One unique column: {}".format(len(one_uni_feats)))

### Visualization on Binary Data

In [None]:
## Share of rows equal to 0 and to 1 for every binary feature
zero_value = []
one_value = []

for col in binary_feats:
    # Compute the normalized counts once per column instead of once per value.
    shares = df_train[col].value_counts(normalize=True)
    # .get() falls back to 0.0 so a two-valued column that is not coded as 0/1
    # does not abort the whole plot with a KeyError.
    zero_value.append(shares.get(0, 0.0))
    one_value.append(shares.get(1, 0.0))

count = len(binary_feats)
idx = np.arange(count)
bar_width = 0.5

## One stacked horizontal bar per feature; the figure is very tall so that
## every feature name on the y axis stays readable.
plt.figure(figsize=(6,125))
p1 = plt.barh(idx,zero_value,bar_width,color="red")
p2 = plt.barh(idx,one_value,bar_width,left=zero_value, color="black")  # stacked to the right of the zero share
plt.yticks(idx,binary_feats)
plt.xlabel("Share of rows")
plt.title("Share of 0 / 1 values per binary feature")
plt.legend((p1[0], p2[0]), ('Zero count', 'One Count'))
plt.show()

In [None]:
## Qualitative data: every column of the train frame that does not hold numeric values.
## `np.object` is not used here: that alias was deprecated in NumPy 1.20 and removed
## in 1.24, where referencing it raises AttributeError.
qualitative_data = [
    feat for feat in df_train.columns
    if not pd.api.types.is_numeric_dtype(df_train[feat])
]
print("A total of {} qualitative columns".format(len(qualitative_data)))
print(qualitative_data)

There's a total of 8 qualitative and 368 quantitative features in this dataset

In [None]:
## Visualization of qualitative columns: distribution of the target "y" per category
for col in qualitative_data:
    fig, ax = plt.subplots(1,1,figsize=(16,5))
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
    sns.boxplot(x=df_train[col], y=df_train["y"], ax=ax)
    # Rotate the category labels so they do not overlap.
    ax.tick_params(axis="x", labelrotation=90)
    plt.show()
    # Release the figure; one is created per column in this loop.
    plt.close(fig)

### Missing Values 

In [None]:
## Per-column summary of the train frame: null count and number of distinct values
df_check = pd.DataFrame({
    "Missing Values": df_train.isnull().sum(),
    "Number of Unique Values": df_train.nunique(),
})

## Report whether any column has at least one missing value
has_missing = bool((df_check["Missing Values"] > 0).any())
print("Presence of missing values = {}".format(has_missing))

Extract the dependant variable "y" out and drop it from the quantitative features.

In [None]:
## Keep the dependent variable in its own Series ...
target_col = "y"
y = df_train[target_col]

## ... and remove it from the feature frame
df_train = df_train.drop(columns=target_col)

## Data Processing

In [None]:
# Tag every row with its origin so the combined frame can be separated again later
df_test["train"] = 0   # 0 marks rows that come from the test data
df_train["train"] = 1  # 1 marks rows that come from the train data
# Stack train on top of test; the original row labels of both frames are kept as-is
df_all = pd.concat((df_train, df_test), axis="index")

### Normalization of Dependant Variable - "y"

In [None]:
## Distribution of the raw target "y", followed by its skewness
fig, ax = plt.subplots(figsize=(12,9))
sns.distplot(y, ax=ax)
ax.set_title("Before Normalization")
plt.show()
print("Skewness: {}".format(y.skew()))

In [None]:
## Normalize dependent variable with log(1 + y) to reduce its skew
y_norm = np.log1p(y)

## Visualization
plt.figure(figsize=(12,9))
sns.distplot(y_norm)
plt.title("After Normalization")
plt.show()
## Report the skewness of the *transformed* target; printing y.skew() here would
## just repeat the "before" number.
print("Skewness: {}".format(y_norm.skew()))

In [None]:
# a = [f for f in df_all if df_all[f].nunique() == 1]
# print(a)
# a = []
# for f in df_all:
#     if df_all[f].nunique() == 1:
#         a.append(f)
# print(a)
# df_all["X1"].nunique()

### Removal of One Unique Values


In [None]:
## Dimensions of the combined dataset before the single-valued columns are removed
print("Shape of the whole dataset before the removal: {}".format(df_all.shape))

## Columns holding exactly one distinct value across train and test combined
all_unique_counts = df_all.nunique()
one_uni_feats = all_unique_counts[all_unique_counts == 1].index.tolist()
df_all = df_all.drop(columns=one_uni_feats)

## Dimensions of the combined dataset once those columns are gone
print("Shape of the whole dataset after the removal: {}".format(df_all.shape))

### Eliminating Biased Feature
Feature with biased data would negatively affect a model.

In [None]:
threshold = 0.9 ## A feature is considered biased when a single value makes up more than 90% of its rows

## Identify biased features.
## value_counts() sorts by frequency, so the first entry is the share of the most
## common value (NaN included, because dropna=False).
## The "train" marker column is skipped: it is bookkeeping used to split the data
## back into train and test later, not a feature.
bias_feat = [
    col for col in df_all.columns
    if col != "train"
    and df_all[col].value_counts(normalize=True,dropna=False).iloc[0] > threshold
]

## Remove them. drop() returns a new frame, so the result must be assigned back;
## otherwise df_all is left unchanged.
df_all = df_all.drop(columns=bias_feat)

## Keep the list of qualitative columns in sync with df_all so that later cells
## looping over it do not reference a column that was just removed.
qualitative_data = [col for col in qualitative_data if col not in bias_feat]

## Let's check on the shape of the whole dataset
print("Number of biased features removed: {}".format(len(bias_feat)))
print("Shape: {}".format(df_all.shape))

### Label Encode Qualitative Data


In [None]:
## Integer-encode every qualitative column of the combined frame
from sklearn.preprocessing import LabelEncoder  # encoder used for the qualitative columns
lbl_encoder = LabelEncoder()

for cat_col in qualitative_data:
    # fit_transform re-fits the shared encoder on this column before encoding it
    df_all[cat_col] = lbl_encoder.fit_transform(df_all[cat_col])

### Split The Data Back into Train and Test Dataset

In [None]:
# Rows flagged 1 in the "train" marker column go back to df_train; the marker is then dropped
train_rows = df_all["train"] == 1
df_train = df_all.loc[train_rows, :].drop(columns="train")

# Rows flagged 0 go back to df_test; the marker is dropped there as well
test_rows = df_all["train"] == 0
df_test = df_all.loc[test_rows, :].drop(columns="train")

## Model Building
### Train_test_split
The dataset is split into X_train, X_test, y_train and y_test.

In [None]:
## import necessary package
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Hold out 33% of the rows as a test split; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    y,
    test_size=0.33,
    random_state=42,
)


## Report the shape of each of the four resulting pieces
print("Shapes: ", *(part.shape for part in (X_train, X_test, y_train, y_test)))

### GridSearchCV Best Parameters for Below Models
Searching for the optimal predefined hyperparameters for each individual estimator.
1. XGBoost Regressor
1. ExtraTreesRegressor
1. GradientBoostingRegressor

In [None]:
## import models
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [None]:
## Accumulator for the tuned models: each grid-search cell below appends one
## [name, best estimator] pair to it
best_estimators = list()

#### XGBRegressor

In [None]:
## Hyperparameter grid to search (2 * 2 * 2 * 2 * 1 * 2 * 1 = 32 combinations)
params = dict(
    learning_rate=[.1, .5],
    colsample_bytree=[.3, .5],
    max_depth=[2, 4],
    alpha=[3, 5],
    subsample=[.5],
    n_estimators=[30, 70],
    random_state=[42],
)

## Exhaustive search over the grid, scored by R^2 with 7-fold CV on all available cores
XGBR = XGBRegressor()
XGBR_grid = GridSearchCV(
    estimator=XGBR,
    param_grid=params,
    scoring='r2',
    cv=7,
    n_jobs=-1,
)
XGBR_grid.fit(X_train, y_train)

## Report the winning combination and its mean CV score
print("Best parameters:  {}:".format(XGBR_grid.best_params_))
print("Best score: {}".format(XGBR_grid.best_score_))

## Keep the refitted best model for the comparison further down
best_estimators.append(["XGBoostR", XGBR_grid.best_estimator_])

#### ExtraTreesRegressor

In [None]:
## Parameters
## - max_depth uses the real `None` object (no depth limit). The string "None" is not
##   a valid value: every fit using it fails, and GridSearchCV scores it as NaN.
## - max_features=1.0 (use all features) replaces "auto", which meant the same thing
##   for regressors and is no longer accepted by recent scikit-learn releases.
params = {
    "max_depth": [None,10],
    "max_features": [1.0,.3, .4],
    "min_samples_leaf": [3,7],
    "min_samples_split": [2, 4],
    "n_estimators": [50, 100],
    "random_state" : [42]
}

## ExtraTreesRegressor: exhaustive search over the grid, scored by R^2 with 7-fold CV
ETR = ExtraTreesRegressor()
ETR_grid = GridSearchCV(ETR, params, scoring='r2', cv=7, n_jobs=-1)
ETR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(ETR_grid.best_params_))
print("Best score: {}".format(ETR_grid.best_score_))

## Append to list
best_estimators.append(["ExtraTreesR",ETR_grid.best_estimator_])

#### GradientBoostingRegressor

In [None]:
## Parameters
## - max_features=None (use all features) replaces "auto", which meant the same thing
##   for this estimator and is no longer accepted by recent scikit-learn releases.
## - NOTE(review): `tol` appears to matter only for early stopping (n_iter_no_change),
##   which is not set here - confirm whether searching over it has any effect.
params = {
    "max_depth": [2, 3],
    "max_features": [None,0.3],
    "min_samples_leaf": [1,3],
    "min_samples_split": [2, 5],
    "n_estimators": [50, 100],
    "random_state" : [42],
    "tol" : [0.0001,0.01]
}

## GradientBoostingRegressor: exhaustive search over the grid, scored by R^2 with 7-fold CV
GBR = GradientBoostingRegressor()
GBR_grid = GridSearchCV(GBR, params, scoring='r2', cv=7, n_jobs=-1)
GBR_grid.fit(X_train, y_train)

## Output
print("Best parameters:  {}:".format(GBR_grid.best_params_))
print("Best score: {}".format(GBR_grid.best_score_))

## Append to list
best_estimators.append(["GradientBoostR",GBR_grid.best_estimator_])

### Evaluation the Optimized Estimators

In [None]:
## import necessary libraries for evaluation
from sklearn.model_selection import cross_val_score,KFold

## One shuffled 10-fold splitter with a fixed seed, shared by all estimators so that
## every model is scored on the same folds.
kfold = KFold(n_splits=10,random_state=42,shuffle=True)

## Collect one record per estimator and build the frame once at the end, instead of
## growing (and re-sorting) a DataFrame row by row inside the loop.
cv_records = []
for name,estimator in best_estimators:
    cv = cross_val_score(estimator,X_train,y_train,cv=kfold,n_jobs=-1,scoring="r2")
    cv_records.append({
        "model": name,
        "std": "+/- {}".format(round(cv.std(),4)),  # spread of the fold scores, as display text
        "cv_mean": round(cv.mean(),3),
        "cv_median": round(np.median(cv),3),
    })

## Summary table: one row per estimator, best mean R^2 first
evaluate = (
    pd.DataFrame(cv_records, columns=["model","std","cv_mean","cv_median"])
    .sort_values("cv_mean",ascending=False)
)

evaluate

### Visualization of Estimator's CV score
Visualize each estimator's mean CV score in a bar chart.

In [None]:
## Visualization: one bar per estimator, bar height = mean CV score
fig,ax = plt.subplots(1,1,figsize=(12,9))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
bar = sns.barplot(x=evaluate["model"],y=evaluate["cv_mean"],ax=ax,palette=sns.color_palette("rocket"))
for rec in bar.patches:
    height = rec.get_height()
    # Write the score just above the bar, centred horizontally on it.
    ax.text(rec.get_x() + rec.get_width()/2, height *1.02, height,ha="center")
plt.show()

We should choose the XGBoostRegressor estimator. Now let's train the estimator on the whole training dataset and make predictions.

## Submission

In [None]:
## Optimal XGBRegressor Estimator, refitted on the full training data.
## Note: this is the same object that was stored in `best_estimators`, so the
## refit also changes the model held in that list.
XBGR = XGBR_grid.best_estimator_
XBGR.fit(df_train,y)

## Submission

submission = pd.read_csv("../input/mercedes-benz-greener-manufacturing/sample_submission.csv.zip")
## Overwrite the second column of the sample submission with the predictions
submission.iloc[:,1] = XBGR.predict(df_test)

## Write the file with an explicit .csv extension; without it the output is an
## extension-less file named "submission".
submission.to_csv('submission.csv', index=False)

## Print out the first five row of the submission
submission.head()