# HackerEarth ML- Exhibit A(rt) : Sculpture shipping cost prediction:


Importing necessary modules:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Reading the training data:

In [None]:
# Load the training CSV into `df` and preview its first rows.
train_csv_path = '../input/hackerearth-machine-learning-exhibit-art/dataset/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Summary statistics of the training frame (pandas default column selection).
df.describe()

In [None]:
# Summary of the object-dtype (text / categorical) columns only.
df.describe(include='O')

## Checking and Handling NULL values :

In [None]:
# Number of missing entries in every column of the training frame.
print("Null values in each column : \n")
df.isnull().sum()

Filling NA values :

In [None]:
# Categorical variables: impute missing values with the most frequent category.
# The modes are kept in module-level names because the test-set preparation
# later in the notebook reuses them.
#
# The filled column is assigned back to `df` instead of calling
# `df.<col>.fillna(..., inplace=True)`: that chained form operates on an
# intermediate Series, is deprecated in recent pandas releases and does not
# update the DataFrame when Copy-on-Write is enabled.
transport_mode = df['Transport'].mode()[0]
df['Transport'] = df['Transport'].fillna(transport_mode)

material_mode = df['Material'].mode()[0]
df['Material'] = df['Material'].fillna(material_mode)

remote_loc_mode = df['Remote Location'].mode()[0]
df['Remote Location'] = df['Remote Location'].fillna(remote_loc_mode)

In [None]:
# Numerical variables: impute missing values with a rounded-up mean
# (median for Weight). The fill values are kept in module-level names because
# the test-set preparation later in the notebook reuses them.
#
# As with the categorical columns, the result of `fillna` is assigned back to
# `df`; chained `fillna(..., inplace=True)` on an attribute-accessed column is
# deprecated and does not modify the frame under pandas Copy-on-Write.
#
# NOTE(review): if 'Artist Reputation' lies in [0, 1], np.ceil turns its mean
# into 1.0, i.e. missing reputations are filled with the maximum - confirm
# that this is intended.
artist_rep_mean = np.ceil(df['Artist Reputation'].mean())
df['Artist Reputation'] = df['Artist Reputation'].fillna(artist_rep_mean)

mean_height = np.ceil(df['Height'].mean())
df['Height'] = df['Height'].fillna(mean_height)

mean_width = np.ceil(df['Width'].mean())
df['Width'] = df['Width'].fillna(mean_width)

median_weight = np.ceil(df['Weight'].median())
df['Weight'] = df['Weight'].fillna(median_weight)

In [None]:
# Preview the frame after the missing values have been filled.
df.head()

## Checking value counts : 

In [None]:
# Print the frequency table of every categorical column.
categorical_columns = [
    'Material',
    'International',
    'Express Shipment',
    'Installation Included',
    'Transport',
    'Fragile',
    'Customer Information',
    'Remote Location',
]

for column_name in categorical_columns:
    print("**column =>",column_name,"\n")
    print(df[column_name].value_counts())
    print("\n-----------\n")

In [None]:
# Draw one bar chart of category frequencies per categorical column,
# pairing each column with its own colour.
cols=['Material','International','Express Shipment','Installation Included','Transport','Fragile','Customer Information','Remote Location']
colors=["#FA7921","#DC0073","#89FC00","#44bba4","#e7bb41","#da4167","#53ff45","#2a9d8f"]

for column_name, bar_color in zip(cols, colors):
    print("**column =>",column_name,"\n")
    counts = df[column_name].value_counts()
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(list(counts.index), list(counts.values), color=bar_color, width=0.4)
    ax.set_xlabel(column_name)
    ax.set_ylabel("Counts")
    ax.set_title("Different types of "+column_name)
    plt.show()
    print("\n-----------\n")

## Visualizing the dataset:

In [None]:
# Pairwise scatter plots of the continuous features together with the target.
pairplot_columns = [
    'Artist Reputation',
    'Height',
    'Width',
    'Weight',
    'Price Of Sculpture',
    'Base Shipping Price',
    'Cost',
]
sns.pairplot(df.loc[:, pairplot_columns])
plt.show()

#### Correlation Matrix:

In [None]:
# Correlation matrix of the numeric columns, shown as an annotated heatmap.
# `numeric_only=True` is required because `df` still holds text columns at this
# point (e.g. 'Customer Location'); pandas >= 2.0 raises on them instead of
# silently skipping them.
corr = df.corr(numeric_only=True)
fig = plt.figure(figsize=(8,6))
sns.heatmap(corr,annot=True)
plt.show()

## Feature Engineering:

1. Difference between 'Delivery Date' and 'Scheduled Date'

In [None]:
# Number of days from the scheduled date to the delivery date
# (positive when the delivery happened after the scheduled date).
delivery_dates = pd.to_datetime(df['Delivery Date'])
scheduled_dates = pd.to_datetime(df['Scheduled Date'])
df["dates_diff"] = (delivery_dates - scheduled_dates) / pd.Timedelta(days=1)
df.head()

2. Extracting the 'State' of the customer from the 'Customer Location'

In [None]:
# Extract the state code from each 'Customer Location' string.
# Two layouts are handled:
#   * no comma    -> the state is the second whitespace-separated token
#   * with comma  -> the state is the first token after the first comma
# `str.split()` without arguments already discards surrounding whitespace, so
# no extra stripping is needed.
location_parts = (str(location).split(",") for location in df['Customer Location'].values)
customer_states = [
    parts[0].split()[1] if len(parts) == 1 else parts[1].split()[0]
    for parts in location_parts
]

df["customer_states"] = customer_states
df.head()

## Converting categorical features to numerical columns:

In [None]:
# One-hot encode the 'Material' and 'Transport' columns and append the
# resulting indicator columns (material_* first, then transport_*) to `df`.
material_dummies = pd.get_dummies(df['Material'], prefix="material")
transport_dummies = pd.get_dummies(df['Transport'], prefix="transport")
df = pd.concat([df, material_dummies, transport_dummies], axis=1)

In [None]:
# Label encoding: replace each listed column by integer codes.
# `factorize` returns (codes, uniques); the uniques of every column are kept
# so the same codes can be applied to the test data later.
label_encoded_columns = [
    "International",
    "Fragile",
    "Express Shipment",
    "Installation Included",
    "Customer Information",
    "customer_states",
    "Remote Location",
]

label_maps = {}
for column_name in label_encoded_columns:
    df[column_name], label_maps[column_name] = df[column_name].factorize()

# Expose the per-column uniques under the names used by the test-data cells.
international_map = label_maps["International"]
fragile_map = label_maps["Fragile"]
express_shipment_map = label_maps["Express Shipment"]
installation_map = label_maps["Installation Included"]
customer_information_map = label_maps["Customer Information"]
customer_states_map = label_maps["customer_states"]
remote_map = label_maps["Remote Location"]

df.head()

Dropping the unnecessary columns:

In [None]:
# Remove the identifier / name columns and the raw columns that have already
# been replaced by engineered features (dummies, dates_diff, customer_states).
columns_to_drop = [
    'Customer Id',
    'Artist Name',
    'Material',
    'Transport',
    'Delivery Date',
    'Scheduled Date',
    'Customer Location',
]
df = df.drop(columns=columns_to_drop)

Transforming the target column "Cost":

In [None]:
# Replace the target by the natural log of its absolute value
# (computed on the whole column at once).
df['Cost'] = np.log(df['Cost'].abs())
df.head()

In [None]:
# Summary statistics after encoding and the target transformation.
df.describe()

## Scaling columns :

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the continuous feature columns; `scaler` holds the fitted
# per-column ranges of the training data.
scaled_columns = ['Height','Width','Weight','Price Of Sculpture', 'Base Shipping Price']
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])
df.head()

In [None]:
# Persist the preprocessed training frame (index omitted); it is read back
# in the train/test splitting section.
df.to_csv("train_Data.csv",index=False)

## Preparing Test Data:

Reading the test dataset and checking null values.

In [None]:
# Load the competition test set and count its missing entries per column.
test_csv_path = '../input/hackerearth-machine-learning-exhibit-art/dataset/test.csv'
test_df = pd.read_csv(test_csv_path)
print("Null values in each column : \n")
test_df.isnull().sum()

Filling Null Values :

In [None]:
# Impute the test set with the fill values computed on the TRAINING data
# (transport_mode, material_mode, ... are defined in the training section).
# Each filled column is assigned back to `test_df`; chained
# `fillna(..., inplace=True)` on an attribute-accessed column is deprecated
# and does not update the frame under pandas Copy-on-Write.
test_fill_values = {
    # Categorical columns
    'Transport': transport_mode,
    'Material': material_mode,
    'Remote Location': remote_loc_mode,
    # Numerical columns
    'Artist Reputation': artist_rep_mean,
    'Height': mean_height,
    'Width': mean_width,
    'Weight': median_weight,
}
for column_name, fill_value in test_fill_values.items():
    test_df[column_name] = test_df[column_name].fillna(fill_value)

# Creating dummy variables (material_* first, then transport_*)
test_df = pd.concat([test_df,pd.get_dummies(test_df['Material'],prefix="material")],axis=1)
test_df = pd.concat([test_df,pd.get_dummies(test_df['Transport'],prefix="transport")],axis=1)

Feature engineering :

In [None]:
# Number of days from the scheduled date to the delivery date
test_delivery_dates = pd.to_datetime(test_df['Delivery Date'])
test_scheduled_dates = pd.to_datetime(test_df['Scheduled Date'])
test_df["dates_diff"] = (test_delivery_dates - test_scheduled_dates) / pd.Timedelta(days=1)

# Getting Customer State from Customer Location.
# Two layouts are handled:
#   * no comma    -> the state is the second whitespace-separated token
#   * with comma  -> the state is the first token after the first comma
# `str.split()` without arguments already discards surrounding whitespace.
location_parts = (str(location).split(",") for location in test_df['Customer Location'].values)
customer_states = [
    parts[0].split()[1] if len(parts) == 1 else parts[1].split()[0]
    for parts in location_parts
]
test_df["customer_states"] = customer_states

Label Encoding the columns containing the categorical values :

In [None]:
# Turn the uniques returned by `factorize` on the training data into
# {category: code} lookups; a category's position in the uniques is its code.
int_map = {category: code for code, category in enumerate(international_map)}

frag_map = {category: code for code, category in enumerate(fragile_map)}

ci_map = {category: code for code, category in enumerate(customer_information_map)}

express_map = {category: code for code, category in enumerate(express_shipment_map)}

ii_map = {category: code for code, category in enumerate(installation_map)}

remotel_map = {category: code for code, category in enumerate(remote_map)}

cs_map = {category: code for code, category in enumerate(customer_states_map)}

In [None]:
# Label Encoding: apply the {category: code} tables built from the training
# data to the corresponding test columns.
#
# The encoded column is assigned back to `test_df`. The previous chained
# `test_df.<col>.replace(..., inplace=True)` acted on an intermediate Series,
# which is deprecated and has no effect under pandas Copy-on-Write.
# With `map`, a category that never occurred in the training data becomes NaN
# instead of staying in the column as a raw string.
test_label_maps = {
    "International": int_map,
    "Fragile": frag_map,
    "Express Shipment": express_map,
    "Installation Included": ii_map,
    "Customer Information": ci_map,
    "customer_states": cs_map,
    "Remote Location": remotel_map,
}
for column_name, code_by_category in test_label_maps.items():
    test_df[column_name] = test_df[column_name].map(code_by_category)

Removing the unnecessary columns:

In [None]:
# Remove the name column and the raw columns already replaced by engineered
# features; 'Customer Id' is kept.
test_columns_to_drop = [
    'Artist Name',
    'Material',
    'Transport',
    'Scheduled Date',
    'Delivery Date',
    'Customer Location',
]
test_df = test_df.drop(columns=test_columns_to_drop)

Scaling the numerical columns :

In [None]:
# Scale the test features with the scaler that was FITTED ON THE TRAINING DATA
# (`scaler` from the "Scaling columns" section). Fitting a fresh MinMaxScaler
# on the test set would map the same raw value to a different scaled value
# than the one the models were trained on.
test_scaled_columns = ['Height','Width','Weight','Price Of Sculpture', 'Base Shipping Price']
test_df[test_scaled_columns] = scaler.transform(test_df[test_scaled_columns])
test_df.head()

Rearranging the column order of the test data to match the training data:

In [None]:
# Target column order for the test frame: 'Customer Id' first, followed by the
# feature columns (intended to mirror the training frame's feature order).
rearrange = [
    'Customer Id',
    'Artist Reputation',
    'Height',
    'Width',
    'Weight',
    'Price Of Sculpture',
    'Base Shipping Price',
    'International',
    'Express Shipment',
    'Installation Included',
    'Fragile',
    'Customer Information',
    'Remote Location',
    'dates_diff',
    'customer_states',
    'material_Aluminium',
    'material_Brass',
    'material_Bronze',
    'material_Clay',
    'material_Marble',
    'material_Stone',
    'material_Wood',
    'transport_Airways',
    'transport_Roadways',
    'transport_Waterways',
]
test_df = test_df.loc[:, rearrange]

In [None]:
# Persist the preprocessed test frame (index omitted); it is read back in the
# final prediction section.
test_df.to_csv('test_Data.csv',index=False)

## Splitting Training Dataset into Train and Test sets  :

In [None]:
from sklearn.model_selection import train_test_split

# Reload the preprocessed training data written earlier in the notebook.
df = pd.read_csv('train_Data.csv')

# 'Cost' already holds log(|cost|) from the target transformation, so it is
# used as-is: taking abs() of the log value again would flip the sign of every
# target whose log is negative. The split is seeded so that the reported
# metrics are reproducible across runs.
features = df.drop(columns=['Cost'])
target = df['Cost']
X, X_test, Y, Y_test = train_test_split(features, target, test_size=0.18, random_state=42)

print("X shape : ",X.shape)
print("Y shape : ",Y.shape)
print("X_test shape  : ",X_test.shape)
print("Y_test shape  : ",Y_test.shape)

## Model Training :

Importing the required modules.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

#### 1. Ridge Regression: 

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression on the training split and score it on both splits.
# (The former extra `ridge_pred = ridge_model.predict(X_test)` duplicated
# `ridge_pred_test` and was never used, so it has been removed.)
ridge_model = Ridge(alpha=1.3,max_iter=10)
ridge_model.fit(X, Y)

ridge_pred_train = ridge_model.predict(X)
ridge_pred_test  = ridge_model.predict(X_test)

# RMSE and R2 on the (log-scale) target, rounded to 4 decimals for the report.
ridge_rmse_train = round(np.sqrt(mean_squared_error(Y,ridge_pred_train)),4)
ridge_rmse_test  = round(np.sqrt(mean_squared_error(Y_test,ridge_pred_test)),4)

ridge_r2_train = round(r2_score(Y,ridge_pred_train),4)
ridge_r2_test  = round(r2_score(Y_test,ridge_pred_test),4)

print("Training : ")
print("    RMSE = ",ridge_rmse_train)
print("R2 Score = ",ridge_r2_train)

print("\nTesting : ")
print("    RMSE = ",ridge_rmse_test)
print("R2 Score = ",ridge_r2_test)

#### 2. Decision tree Regression :

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree on the training split and score it on both splits.
# `random_state` is fixed because the tree's split search is randomised;
# without a seed the reported metrics change from run to run.
DT_model = DecisionTreeRegressor(random_state=42)
DT_model.fit(X,Y)

DT_pred_train = DT_model.predict(X)
DT_pred_test = DT_model.predict(X_test)

# RMSE and R2 on the (log-scale) target, rounded to 4 decimals for the report.
DT_rmse_train = round(np.sqrt(mean_squared_error(Y,DT_pred_train)),4)
DT_rmse_test  = round(np.sqrt(mean_squared_error(Y_test,DT_pred_test)),4)

DT_r2_train = round(r2_score(Y,DT_pred_train),4)
DT_r2_test  = round(r2_score(Y_test,DT_pred_test),4)

print("Training : ")
print("    RMSE = ",DT_rmse_train)
print("R2 Score = ",DT_r2_train)

print("\nTesting : ")
print("    RMSE = ",DT_rmse_test)
print("R2 Score = ",DT_r2_test)

#### 3. Random Forest Regressor: 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 500-tree random forest on the training split and score it on both
# splits. `random_state` is fixed because the forest's bootstrap sampling is
# random; without a seed the reported metrics change from run to run.
RF_model = RandomForestRegressor(min_samples_split=2,n_estimators=500,random_state=42)
RF_model.fit(X, Y)

RF_pred_train = RF_model.predict(X)
RF_pred_test = RF_model.predict(X_test)

# RMSE and R2 on the (log-scale) target, rounded to 4 decimals for the report.
RF_rmse_train = round(np.sqrt(mean_squared_error(Y,RF_pred_train)),4)
RF_rmse_test  = round(np.sqrt(mean_squared_error(Y_test,RF_pred_test)),4)

RF_r2_train = round(r2_score(Y,RF_pred_train),4)
RF_r2_test  = round(r2_score(Y_test,RF_pred_test),4)

print("Training : ")
print("    RMSE = ",RF_rmse_train)
print("R2 Score = ",RF_r2_train)

print("\nTesting : ")
print("    RMSE = ",RF_rmse_test)
print("R2 Score = ",RF_r2_test)

#### 4. Gradient Boosting Regressor :

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient boosting regressor on the training split and score it on both
# splits. `random_state` is fixed because `max_features` makes every split
# consider a random subset of the features; without a seed the reported
# metrics change from run to run.
boost_model = GradientBoostingRegressor(
    max_features=20,
    min_samples_split=1000,
    min_samples_leaf=30,
    n_estimators=500,
    random_state=42,
)
boost_model.fit(X, Y)

boost_pred_train = boost_model.predict(X)
boost_pred_test = boost_model.predict(X_test)

# RMSE and R2 on the (log-scale) target, rounded to 4 decimals for the report.
boost_rmse_train = round(np.sqrt(mean_squared_error(Y,boost_pred_train)),4)
boost_rmse_test  = round(np.sqrt(mean_squared_error(Y_test,boost_pred_test)),4)

boost_r2_train = round(r2_score(Y,boost_pred_train),4)
boost_r2_test  = round(r2_score(Y_test,boost_pred_test),4)

print("Training : ")
print("    RMSE = ",boost_rmse_train)
print("R2 Score = ",boost_r2_train)

print("\nTesting : ")
print("    RMSE = ",boost_rmse_test)
print("R2 Score = ",boost_r2_test)

#### 5. XGBoost :

In [None]:
from xgboost import XGBRegressor

# Fit an XGBoost regressor on the training split and score it on both splits.
xgboost_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=150,
    gamma=0.5,
    max_depth=5,
)
xgboost_model.fit(X,Y)

# Training-split metrics (rounded to 4 decimals for the report).
xgboost_pred_train = xgboost_model.predict(X)
xgboost_rmse_train = round(np.sqrt(mean_squared_error(Y,xgboost_pred_train)),4)
xgboost_r2_train = round(r2_score(Y,xgboost_pred_train),4)

# Held-out-split metrics.
xgboost_pred_test = xgboost_model.predict(X_test)
xgboost_rmse_test  = round(np.sqrt(mean_squared_error(Y_test,xgboost_pred_test)),4)
xgboost_r2_test  = round(r2_score(Y_test,xgboost_pred_test),4)

print("Training : ")
print("    RMSE = ",xgboost_rmse_train)
print("R2 Score = ",xgboost_r2_train)

print("\nTesting : ")
print("    RMSE = ",xgboost_rmse_test)
print("R2 Score = ",xgboost_r2_test)

## Final Report :

In [None]:
# Collect the metrics of all models, one row per model, and show them ranked
# by R2 on the held-out split (best first).
report_rows = [
    ("Random Forest", RF_rmse_train, RF_r2_train, RF_rmse_test, RF_r2_test),
    ("Ridge Regression", ridge_rmse_train, ridge_r2_train, ridge_rmse_test, ridge_r2_test),
    ("Decision Tree", DT_rmse_train, DT_r2_train, DT_rmse_test, DT_r2_test),
    ("Gradient Boosting ", boost_rmse_train, boost_r2_train, boost_rmse_test, boost_r2_test),
    ("XGBoost", xgboost_rmse_train, xgboost_r2_train, xgboost_rmse_test, xgboost_r2_test),
]
report = pd.DataFrame(
    report_rows,
    columns=['Models', 'RMSE Train', 'R2 Train', 'RMSE Test', 'R2 Test'],
)

print("Models Report sorted by R2 Test:\n")
ranked_report = report.sort_values(by='R2 Test', ascending=False)
ranked_report.reset_index(drop=True)

## Prediction on Test Data:

In [None]:
# Reload the preprocessed test data written earlier in the notebook.
test_data = pd.read_csv('test_Data.csv')

# Predict with the XGBoost model; 'Customer Id' is an identifier, not a feature.
pred = xgboost_model.predict(test_data.drop(columns=["Customer Id"]))

# The models were trained on the log-transformed cost, so the predictions are
# mapped back to the original scale with exp. This is done on the whole array
# at once (in double precision) instead of element by element.
submission = pd.DataFrame({
    "Customer Id": test_data["Customer Id"],
    "Cost": np.exp(pred.astype(np.float64)),
})
submission.head()

*You have reached the end of this notebook.*