# My first notebook

## Any suggestions will be highly appreciated 

The work is divided into three Sections

1. Section 1 - Data Analysis (Simple Exploratory Data Analysis to gain insights)
2. Section 2 - Data Handling (Performing imputation, scaling, etc.)
3. Section 3 - Model fitting and predicting (Used XGBoost's XGBRegressor to fit the model)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a wide frame is displayed.
# Uses the public top-level pandas API rather than going through `pd.pandas`.
pd.set_option('display.max_columns', None)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the input directory and print each one.
input_files = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_files:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **SECTION 1 - Data Analysis**

In [None]:
# Locations of the training and test CSV files
train_csv_path = '/kaggle/input/hackerearth-machine-learning-exhibit-art/dataset/train.csv'
test_csv_path = '/kaggle/input/hackerearth-machine-learning-exhibit-art/dataset/test.csv'

dataset = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

# Preview the first rows of the training frame
dataset.head()

In [None]:
# Columns removed from both frames before modelling.
# (The date columns could have been turned into features, but are dropped here.)
cols_to_drop = ['Customer Id', 'Artist Name', 'Customer Location', 'Scheduled Date', 'Delivery Date']

# Rebind instead of dropping in place, and ignore columns that are already gone,
# so that re-running this cell does not raise a KeyError.
dataset = dataset.drop(columns=cols_to_drop, errors='ignore')
test = test.drop(columns=cols_to_drop, errors='ignore')

# Replace Cost by its absolute value so the target is non-negative.
dataset['Cost'] = dataset['Cost'].abs()

## Missing Data Exploration 

In [None]:
# Names of the training columns that contain at least one null value
null_flags = dataset.isnull().any()
missing_data_col = null_flags[null_flags].index.tolist()
missing_data_col

In [None]:
# Number of null values in each incomplete column
dataset.loc[:, missing_data_col].isna().sum()

For each column with missing data, compare the cost of rows where the value is missing with the cost of rows where it is available

In [None]:
# Label every cell of the incomplete columns as 'Missing' or 'Available' straight
# from the null mask. No sentinel value is written into the data, so a genuine
# "-1" entry cannot be mistaken for a missing one and numeric columns are not
# coerced to strings first.
df = dataset[missing_data_col].isnull().apply(
    lambda flags: flags.map({True: 'Missing', False: 'Available'})
)

df['Cost'] = dataset['Cost']

# One bar chart per column: median Cost of the 'Missing' rows vs the 'Available' rows.
for col in missing_data_col:
    df.groupby(col)['Cost'].median().plot.bar()
    plt.xlabel(col)
    plt.ylabel('Cost')
    plt.title(col)
    plt.show()

### The median cost of missing data values and the median cost of available data values for the missing data columns do not vary by much

# Working with the numerical columns to get some insights about outliers and what role does it play in the cost

In [None]:
# !pip install dtale
# import dtale
# # Getting all the numerical columns
# numerical_cols = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']

# df = dataset[numerical_cols].copy()
# details = dtale.show(df)
# details

#### Note - dtale doesn't work well in Kaggle notebooks, so the conclusions below were drawn from running dtale in a local Jupyter Notebook
## Following conclusions are to be drawn
#### 1. Height is moderately skewed (0.59)
#### 2. Width is highly skewed (1.55)
#### 3. Weight is highly skewed (21.56)
#### 4. Price of Sculpture is highly skewed (22.21)
#### 5. Base shipping price is moderately skewed (0.92)
#### 6. Cost is highly skewed (29.87)


# Looking for outliers in the data

In [None]:
# All non-object (numerical) columns of the training frame
numerical_cols = [feature for feature in dataset.columns if dataset[feature].dtypes != 'O']

df = dataset[numerical_cols].copy()

# One box plot per numerical column.
for col in df.columns:
    # Drop the nulls of this column instead of filling them with 0: zero-filling
    # adds artificial values that distort the quartiles and the outlier picture.
    plt.boxplot(df[col].dropna())
    plt.xlabel(col)
    plt.title(col)
    plt.show()

# High outliers have led to high skewness in these columns

# Check for categorical data and finding insights

In [None]:
# Object-dtype (categorical) columns of the training frame
categorical_cols = [name for name, dtype in dataset.dtypes.items() if dtype == 'O']

# Categorical columns plus the target, as an independent frame
df = dataset[categorical_cols].assign(Cost=dataset['Cost'])

# One bar chart per categorical column: median Cost of each category
for col in categorical_cols:
    median_cost = df.groupby(col)['Cost'].median()
    median_cost.plot.bar()
    plt.xlabel(col)
    plt.ylabel('COST')
    plt.title(col)
    plt.show()

#### NOTE - The median is used instead of the mean because the mean is influenced by outliers
### Some insights are
##### 1. Marble and stone products are costly followed by brass and bronze
##### 2. International column has almost no impact on cost
##### 3. Express shipment leads to a slight increment in the cost
##### 4. Installation included has almost no to very little impact on cost
##### 5. Transport has very slight impact on cost (airways being the costliest and waterways being the cheapest)
##### 6. Fragility has a high impact on cost (Not fragile = More cost)
##### 7. Customer information has slight impact on cost
##### 8. Remote location has almost no impact on cost

# Checking the correlation of numerical columns w.r.t. Cost

In [None]:
# Correlation of every numerical column with Cost.
# The frame still holds object (categorical) columns, which DataFrame.corr() rejects
# in pandas >= 2.0, so restrict the computation to the numeric columns explicitly.
dataset.select_dtypes(include='number').corr()['Cost']

# **SECTION 2 - Data Handling**

In [None]:
# Preview the first three rows of the training frame
dataset.iloc[:3]

In [None]:
# Features are every column except the target. Selecting by name (rather than
# "all but the last column") keeps the split correct even if 'Cost' is not last,
# and drop() returns a new frame instead of a positional slice of `dataset`.
X_train = dataset.drop(columns='Cost')
y_train = dataset['Cost']

###  The y values are highly skewed, so we apply a natural-log transform to reduce the skew. 
The predictions are converted back to the original scale afterwards by taking the antilog (exponential)

In [None]:
# Natural-log transform of the skewed target. Costs that are not strictly positive
# (or are missing) map to 0.0, exactly as before.
# The transform is computed from dataset['Cost'] rather than from y_train itself,
# so re-running this cell does not apply the logarithm a second time.
cost = dataset['Cost']
y_train = np.log(cost.where(cost > 0.0, 1.0))
y_train

## Handle missing data

In [None]:
# Find all the numerical columns with missing data
null_counts = dataset.isnull().sum()
num_missing_cols = [
    name for name, dtype in dataset.dtypes.items()
    if dtype != 'O' and null_counts[name] > 0
]
num_missing_cols

In [None]:
# Compare mean and median of each incomplete numerical column
df = dataset.copy()
for col in num_missing_cols:
    column = df[col]
    print(f"{col} has a mean of {round(column.mean(),2)} and a median of {column.median()}")

In [None]:
def fill_misisng_cols(col_name, df):
    """Fill missing values in the given columns of ``df`` with each column's median.

    Parameters
    ----------
    col_name : iterable of str
        Labels of the columns to impute.
    df : pandas.DataFrame
        Frame to impute. It is modified in place; the median of each column is
        computed from this same frame.

    Returns
    -------
    None
    """
    for col in col_name:
        med = df[col].median()
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form operates on an intermediate object and
        # leaves df unchanged when pandas copy-on-write is active.
        df[col] = df[col].fillna(med)
        
# Impute the numerical columns of the training features and of the test frame
for frame in (X_train, test):
    fill_misisng_cols(col_name=num_missing_cols, df=frame)

# Remaining null count per imputed column of the training features
X_train.loc[:, num_missing_cols].isnull().sum()

### Get all the categorical columns to fill missing values and also do other transformations if possible

In [None]:
# Object-dtype (categorical) columns of the training frame that contain nulls
null_counts = dataset.isnull().sum()
cat_missing_cols = [
    name for name, dtype in dataset.dtypes.items()
    if dtype == 'O' and null_counts[name] > 0
]
cat_missing_cols

In [None]:
# Filling the categorical columns with missing data with the mode of these columns

def cat_miss(df):
    """Fill missing categorical values of ``df`` in place.

    For every column listed in the module-level ``cat_missing_cols``, nulls in
    ``df`` are replaced by that column's most frequent value in the module-level
    ``dataset`` frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; it is modified in place.

    Returns
    -------
    None
    """
    for col in cat_missing_cols:
        modes = dataset[col].mode()
        if modes.empty:
            # The column has no non-null value in `dataset`, so there is no mode to fill with.
            continue
        # Assign the filled column back instead of calling fillna(inplace=True) on
        # df[col]: the chained in-place form operates on an intermediate object and
        # leaves df unchanged when pandas copy-on-write is active.
        df[col] = df[col].fillna(modes.iloc[0])

# Impute the categorical columns of the training features and of the test frame
for frame in (X_train, test):
    cat_miss(frame)

# Preview the imputed categorical columns
X_train.loc[:, cat_missing_cols].head()

In [None]:
# One-hot encoding using pd.get_dummies()
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode every category of the test frame, then align it to the training columns.
# Encoding the two frames independently can give them different columns when a
# category appears in only one of them. After the reindex, test has exactly the
# training columns in the training order: dummy columns absent from test are
# added as all-zero, and columns that training does not have are dropped.
test = pd.get_dummies(test).reindex(columns=X_train.columns, fill_value=0)

# SECTION 3 - Model Fitting and Prediction

In [None]:
# 'Height', 'Width', 'Weight', 'Price Of Sculpture' need to be scaled down as they have high outliers
unique_counts = X_train.nunique()

# Non-object columns with more than 10 distinct values
num_cols_all = [
    name for name, dtype in X_train.dtypes.items()
    if dtype != 'O' and unique_counts[name] > 10
]

# Every such column except the first and the last one is scaled
scale_cols = num_cols_all[1:-1]
scale_cols

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of the selected columns from the training features only,
# then apply the same scaling to both frames.
scale = MinMaxScaler()
scale.fit(X_train[scale_cols])

X_train[scale_cols] = scale.transform(X_train[scale_cols])
test[scale_cols] = scale.transform(test[scale_cols])

X_train.head(3)

## Finding the useful parameters 

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

#### At the moment we will take alpha as 0.001 and select the columns generated.
Based on the results obtained we may update alpha

In [None]:
# Fit a Lasso-based selector on the training data
feature_selection_model = SelectFromModel(Lasso(alpha=0.001))
feature_selection_model.fit(X_train, y_train)

# Boolean flag per column: True when the selector keeps the column
f_model_arr = list(feature_selection_model.get_support())

all_cols = list(X_train.columns)
cols_chosen = [name for name, keep in zip(all_cols, f_model_arr) if keep]
# cols_chosen # Chosen columns

In [None]:
# Keep only the selected columns and hand plain arrays to the model
X_train = X_train.loc[:, cols_chosen].values
test = test.loc[:, cols_chosen].values

## Using XGBoost to fit the model

In [None]:
from xgboost import XGBRegressor

# Baseline regressor with the library's default hyper-parameters
xg = XGBRegressor()
xg.fit(X=X_train, y=y_train)

In [None]:
# # Performing the Grid Search to get the best parameters
# from sklearn.model_selection import GridSearchCV

# param_grid = [
# {'n_estimators': [250,280,300,330, 360], 
#  'max_depth': [10, 20,30,40,50],
#  'learning_rate': [0.1,0.3,0.5],
# }]

# grid_cv = GridSearchCV(xg, param_grid=param_grid, cv=10, n_jobs=-1)
# grid_cv.fit(X_train, y_train)

# best_params = grid_cv.best_estimator_
# best_params

### The Best Parameters Were found out to be 
n_estimators=250
max_depth=10 
learning_rate=0.1

In [None]:
# Regressor configured with the hyper-parameters listed above
xgb_params = {
    'n_estimators': 250,
    'n_jobs': -1,
    'max_depth': 10,
    'base_score': 0.1,
    'learning_rate': 0.1,
}
xg1 = XGBRegressor(**xgb_params)
xg1.fit(X_train, y_train)

In [None]:
# Predict with the tuned model on the prepared test array.
# The model was fitted on the log-transformed target, so these values are on the log scale.
predicted = xg1.predict(test)
predicted

## The 'Cost' column in the train dataset was log normalised so we need to antilog the predicted values

In [None]:
# Invert the natural-log transform applied to the training target.
# np.exp is the direct inverse of np.log and replaces np.power(np.e, ...).
# The result is rounded to two decimals.
predicted = np.round(np.exp(predicted), 2)
predicted

In [None]:
# Reload the raw test file to recover the 'Customer Id' column that was dropped earlier
test = pd.read_csv('/kaggle/input/hackerearth-machine-learning-exhibit-art/dataset/test.csv')

# Pair each customer id with its predicted cost, side by side
id_col = test[['Customer Id']]
cost_col = pd.DataFrame({'Cost': predicted})
result = pd.concat([id_col, cost_col], axis='columns')
result.head()

In [None]:
# Write the submission file without the index column
submission_path = 'Submission.csv'
result.to_csv(submission_path, index=False)