# HackerEarth Machine Learning Challenge: Exhibit A(rt)
Practicing what I have learned from my studies so far :)

In [None]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

# Paths to the competition's training and test CSV files
train_file = "../input/hackerearth-machine-learning-exhibit-art/dataset/train.csv"
test_file = "../input/hackerearth-machine-learning-exhibit-art/dataset/test.csv"

## Inspect Data

In [None]:
# Load the training CSV into a DataFrame
train_df = pd.read_csv(train_file)

In [None]:
# Print the (rows, columns) shape and preview the first rows
print(train_df.shape)
train_df.head()

In [None]:
# Work on a copy so train_df stays untouched, and split off the target
X = train_df.copy()
y = X.pop("Cost")
# Profile the untouched frame so the report still includes "Cost".  The
# report has to be the last expression of the cell; a notebook does not
# render the value of an expression in the middle of a cell.
train_df.profile_report()

### Observation
- Price of Sculpture is skewed
- Weight is skewed
- Cost is skewed

In [None]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    A copy of ``X`` is scored, so the caller's frame is left unchanged.
    Object and category columns are replaced by their integer factorization
    codes, and every integer-typed column is flagged as discrete.

    Returns a Series named "MI Scores", indexed by column name and sorted
    from highest to lowest score.
    """
    features = X.copy()
    categorical_cols = features.select_dtypes(["object", "category"]).columns
    for name in categorical_cols:
        codes, _uniques = features[name].factorize()
        features[name] = codes
    # After factorizing, every discrete feature has an integer dtype
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of ``scores`` in ascending order.

    ``scores`` is a Series indexed by feature name.  Bars whose feature name
    starts with "PROBE" get the color "C3"; all other bars get "C0".
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)
    # Highlight probe columns so they stand out from the real features
    bar_colors = np.array(
        ["C3" if label.startswith("PROBE") else "C0" for label in labels]
    )
    plt.barh(positions, ordered, color=bar_colors)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

## Split dataset before Imputing missing values

In [None]:
from sklearn.model_selection import train_test_split
# Split first (75% train, the rest held out) so that imputation statistics
# are computed per split and nothing leaks between them
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(), y.to_numpy(), train_size=0.75, random_state=0
)

## Convert back to Pandas DF for further analysis and processing

In [None]:
# The split was done on NumPy arrays, so rebuild labelled frames.  The feature
# array mixes strings and numbers and is therefore object-typed; without
# infer_objects() every column would stay `object`, and make_mi_scores would
# later factorize genuinely numeric columns as if they were categorical.
X_train_df = pd.DataFrame(X_train, columns=X.columns).infer_objects()
y_train_df = pd.DataFrame(y_train, columns=["Cost"])

In [None]:
# Same rebuild for the held-out split; infer_objects() restores numeric dtypes
# that were lost when the mixed-type frame was converted to an object array.
X_test_df = pd.DataFrame(X_test, columns=X.columns).infer_objects()
y_test_df = pd.DataFrame(y_test, columns=["Cost"])

## Inspect Missing values

In [None]:
# Count the missing values in each column of the training split
X_train_df.isna().sum()

In [None]:
# Count the missing values in each column of the held-out split
X_test_df.isna().sum()

In [None]:
# Count the training rows whose Cost is negative
(y_train_df < 0).sum()

## Impute missing values and drop unnecessary columns

### Drop

In [None]:
# Columns that are not used as model features
columns_to_drop = ["Customer Id", "Artist Name"]
X_train_df.drop(columns=columns_to_drop, inplace=True)
print(X_train_df.shape)
X_train_df.head()

In [None]:
# Remove the same columns from the held-out split
X_test_df.drop(columns=columns_to_drop, inplace=True)
print(X_test_df.shape)
X_test_df.head()

### Impute

In [None]:
# Median-impute and assign back: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which no longer
# updates the frame under Copy-on-Write.
X_train_df["Artist Reputation"] = X_train_df["Artist Reputation"].fillna(
    X_train_df["Artist Reputation"].median()
)
X_train_df["Artist Reputation"].isna().any()

In [None]:
# Median-impute and assign back: calling fillna(inplace=True) on a column
# selection is chained assignment, which pandas deprecates and which no longer
# updates the frame under Copy-on-Write.
X_test_df["Artist Reputation"] = X_test_df["Artist Reputation"].fillna(
    X_test_df["Artist Reputation"].median()
)
X_test_df["Artist Reputation"].isna().any()

In [None]:
# Median-impute Height; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_train_df["Height"] = X_train_df["Height"].fillna(X_train_df["Height"].median())
X_train_df["Height"].isna().any()

In [None]:
# Median-impute Height; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_test_df["Height"] = X_test_df["Height"].fillna(X_test_df["Height"].median())
X_test_df["Height"].isna().any()

In [None]:
# Median-impute Width; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_train_df["Width"] = X_train_df["Width"].fillna(X_train_df["Width"].median())
X_train_df["Width"].isna().any()

In [None]:
# Median-impute Width; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_test_df["Width"] = X_test_df["Width"].fillna(X_test_df["Width"].median())
X_test_df["Width"].isna().any()

In [None]:
# Median-impute Weight; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_train_df["Weight"] = X_train_df["Weight"].fillna(X_train_df["Weight"].median())
X_train_df["Weight"].isna().any()

In [None]:
# Median-impute Weight; assign back instead of the deprecated chained
# fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
X_test_df["Weight"] = X_test_df["Weight"].fillna(X_test_df["Weight"].median())
X_test_df["Weight"].isna().any()

In [None]:
# Fill missing Material with the most frequent value; assign back instead of
# the deprecated chained fillna(inplace=True).
X_train_df["Material"] = X_train_df["Material"].fillna(X_train_df["Material"].mode()[0])
X_train_df["Material"].isna().any()

In [None]:
# Fill missing Material with the most frequent value; assign back instead of
# the deprecated chained fillna(inplace=True).
X_test_df["Material"] = X_test_df["Material"].fillna(X_test_df["Material"].mode()[0])
X_test_df["Material"].isna().any()

In [None]:
# TODO: try another imputation method.
# For now use the most frequent value; assign back instead of the deprecated
# chained fillna(inplace=True).
X_train_df["Transport"] = X_train_df["Transport"].fillna(X_train_df["Transport"].mode()[0])
X_train_df["Transport"].isna().any()

In [None]:
# Fill missing Transport with the most frequent value; assign back instead of
# the deprecated chained fillna(inplace=True).
X_test_df["Transport"] = X_test_df["Transport"].fillna(X_test_df["Transport"].mode()[0])
X_test_df["Transport"].isna().any()

In [None]:
# TODO: try another imputation method.
# For now use the most frequent value; assign back instead of the deprecated
# chained fillna(inplace=True).
X_train_df["Remote Location"] = X_train_df["Remote Location"].fillna(
    X_train_df["Remote Location"].mode()[0]
)
X_train_df["Remote Location"].isna().any()

In [None]:
# Fill missing Remote Location with the most frequent value; assign back
# instead of the deprecated chained fillna(inplace=True).
X_test_df["Remote Location"] = X_test_df["Remote Location"].fillna(
    X_test_df["Remote Location"].mode()[0]
)
X_test_df["Remote Location"].isna().any()

## Parse Date Columns

In [None]:
# Both date columns are parsed with the month/day/two-digit-year format
for date_col in ("Scheduled Date", "Delivery Date"):
    X_train_df[date_col] = pd.to_datetime(X_train_df[date_col], format="%m/%d/%y")

In [None]:
# Both date columns are parsed with the month/day/two-digit-year format
for date_col in ("Scheduled Date", "Delivery Date"):
    X_test_df[date_col] = pd.to_datetime(X_test_df[date_col], format="%m/%d/%y")

## Change Data type of "Price Of Sculpture" and "Base Shipping Price" to numeric

In [None]:
# Make sure both price columns carry a numeric dtype
for price_col in ("Base Shipping Price", "Price Of Sculpture"):
    X_train_df[price_col] = pd.to_numeric(X_train_df[price_col])

In [None]:
# Make sure both price columns carry a numeric dtype
for price_col in ("Base Shipping Price", "Price Of Sculpture"):
    X_test_df[price_col] = pd.to_numeric(X_test_df[price_col])

## Extract Features

In [None]:
# ZipCode: the last whitespace-separated token of "Customer Location",
# converted to a number
X_1 = pd.DataFrame(
    {
        "ZipCode": X_train_df["Customer Location"].map(
            lambda location: pd.to_numeric(str(location).split()[-1])
        )
    }
)

In [None]:
# ZipCode: the last whitespace-separated token of "Customer Location",
# converted to a number
X_1t = pd.DataFrame(
    {
        "ZipCode": X_test_df["Customer Location"].map(
            lambda location: pd.to_numeric(str(location).split()[-1])
        )
    }
)

In [None]:
# Whole days between the delivery date and the scheduled date (negative when
# the delivery date precedes the scheduled one).  Read the day count from the
# timedelta accessor instead of parsing the string form of each Timedelta,
# which is fragile and raises on missing dates ("NaT").
X_2 = pd.DataFrame()
X_2["SchedDeliveryDiff"] = (X_train_df["Delivery Date"] - X_train_df["Scheduled Date"]).dt.days

In [None]:
# Whole days between the delivery date and the scheduled date (negative when
# the delivery date precedes the scheduled one).  Read the day count from the
# timedelta accessor instead of parsing the string form of each Timedelta,
# which is fragile and raises on missing dates ("NaT").
X_2t = pd.DataFrame()
X_2t["SchedDeliveryDiff"] = (X_test_df["Delivery Date"] - X_test_df["Scheduled Date"]).dt.days

In [None]:
# Calendar parts of the scheduled ("Ship*") and delivery dates
X_3 = pd.DataFrame(
    {
        "ShipDay": X_train_df["Scheduled Date"].dt.day,
        "ShipMonth": X_train_df["Scheduled Date"].dt.month,
        "ShipYear": X_train_df["Scheduled Date"].dt.year,
        "DeliveryDay": X_train_df["Delivery Date"].dt.day,
        "DeliveryMonth": X_train_df["Delivery Date"].dt.month,
        "DeliveryYear": X_train_df["Delivery Date"].dt.year,
    }
)

In [None]:
# Calendar parts of the scheduled ("Ship*") and delivery dates
X_3t = pd.DataFrame(
    {
        "ShipDay": X_test_df["Scheduled Date"].dt.day,
        "ShipMonth": X_test_df["Scheduled Date"].dt.month,
        "ShipYear": X_test_df["Scheduled Date"].dt.year,
        "DeliveryDay": X_test_df["Delivery Date"].dt.day,
        "DeliveryMonth": X_test_df["Delivery Date"].dt.month,
        "DeliveryYear": X_test_df["Delivery Date"].dt.year,
    }
)

In [None]:
# State: the second-to-last whitespace-separated token of "Customer Location"
X_4 = pd.DataFrame(
    {"State": X_train_df["Customer Location"].map(lambda location: str(location).split()[-2])}
)

In [None]:
# State: the second-to-last whitespace-separated token of "Customer Location"
X_4t = pd.DataFrame(
    {"State": X_test_df["Customer Location"].map(lambda location: str(location).split()[-2])}
)

In [None]:
# log1p-transform the sculpture price
X_5 = X_train_df["Price Of Sculpture"].apply(np.log1p).to_frame("LogPriceSculpture")

In [None]:
# log1p-transform the sculpture price
X_5t = X_test_df["Price Of Sculpture"].apply(np.log1p).to_frame("LogPriceSculpture")

In [None]:
# log1p-transform the weight
X_6 = X_train_df["Weight"].apply(np.log1p).to_frame("LogWeight")

In [None]:
# log1p-transform the weight
X_6t = X_test_df["Weight"].apply(np.log1p).to_frame("LogWeight")

In [None]:
# Mean base shipping price of each (Transport, Remote Location) group,
# broadcast back onto every row of that group
X_7 = (
    X_train_df.groupby(["Transport", "Remote Location"])["Base Shipping Price"]
    .transform("mean")
    .to_frame("AvgShipPriceGrpByTransportRemote")
)

In [None]:
# Mean base shipping price of each (Transport, Remote Location) group,
# broadcast back onto every row of that group
X_7t = (
    X_test_df.groupby(["Transport", "Remote Location"])["Base Shipping Price"]
    .transform("mean")
    .to_frame("AvgShipPriceGrpByTransportRemote")
)

## Combine original training data with engineered features

In [None]:
# Append every engineered feature frame to the training features (index join)
engineered_train_frames = [X_1, X_2, X_3, X_4, X_5, X_6, X_7]
X_train_df_engrd = X_train_df.join(engineered_train_frames)
X_train_df_engrd.profile_report()

In [None]:
# Append every engineered feature frame to the held-out features (index join)
engineered_test_frames = [X_1t, X_2t, X_3t, X_4t, X_5t, X_6t, X_7t]
X_test_df_engrd = X_test_df.join(engineered_test_frames)

## Drop Redundant Columns

In [None]:
# Source columns that the engineered features were derived from
columns_to_drop = ["Scheduled Date", "Delivery Date", "Price Of Sculpture", "Weight", "Customer Location"]
X_train_df_engrd.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Remove the same source columns from the held-out features
X_test_df_engrd.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Profile the engineered training features again
X_train_df_engrd.profile_report()

## Additional Feature (Average Shipping Price Per State)

In [None]:
# Mean base shipping price per State, broadcast onto every row of that state
X_8 = (
    X_train_df_engrd.groupby("State")["Base Shipping Price"]
    .transform("mean")
    .to_frame("AvgShipPricePerState")
)
X_train_df_engrd = X_train_df_engrd.join([X_8])

In [None]:
# Mean base shipping price per State, broadcast onto every row of that state
X_8t = (
    X_test_df_engrd.groupby("State")["Base Shipping Price"]
    .transform("mean")
    .to_frame("AvgShipPricePerState")
)
X_test_df_engrd = X_test_df_engrd.join([X_8t])

## Label Encode

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit on the fixed list of materials, then replace the column by its codes
enc = LabelEncoder().fit(["Aluminium", "Wood", "Brass", "Bronze", "Clay", "Marble", "Stone"])
material_codes_train = enc.transform(X_train_df_engrd["Material"])
X_train_df_engrd["Material"] = material_codes_train

In [None]:
# Same fixed material vocabulary for the held-out split
X_test_df_engrd["Material"] = enc.fit(
    ["Aluminium", "Wood", "Brass", "Bronze", "Clay", "Marble", "Stone"]
).transform(X_test_df_engrd["Material"])

In [None]:
# Encode the two customer classes as integer codes
X_train_df_engrd["Customer Information"] = enc.fit(
    ["Working Class", "Wealthy"]
).transform(X_train_df_engrd["Customer Information"])

In [None]:
# Encode the two customer classes as integer codes
X_test_df_engrd["Customer Information"] = enc.fit(
    ["Working Class", "Wealthy"]
).transform(X_test_df_engrd["Customer Information"])

In [None]:
# Switch the encoder to the Yes/No vocabulary and encode "International"
enc.fit(["No", "Yes"])
international_codes_train = enc.transform(X_train_df_engrd["International"])
X_train_df_engrd["International"] = international_codes_train

In [None]:
# Refit on the Yes/No vocabulary here so this cell does not silently depend
# on the encoder state left behind by whichever cell ran before it (the
# matching held-out cell refits as well).
enc.fit(["No", "Yes"])
for yes_no_col in ("Express Shipment", "Installation Included", "Fragile", "Remote Location"):
    X_train_df_engrd[yes_no_col] = enc.transform(X_train_df_engrd[yes_no_col])

In [None]:
# Encode every Yes/No column of the held-out split with one fitted encoder
enc.fit(["No", "Yes"])
for yes_no_col in (
    "International",
    "Express Shipment",
    "Installation Included",
    "Fragile",
    "Remote Location",
):
    X_test_df_engrd[yes_no_col] = enc.transform(X_test_df_engrd[yes_no_col])

## One Hot Encode Transport

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Fit the one-hot encoder on the training "Transport" column and swap the
# original column for its indicator columns
ohe = OneHotEncoder(sparse=False)
transport_onehot_train = ohe.fit_transform(X_train_df_engrd[["Transport"]])
X_9 = pd.DataFrame(transport_onehot_train, columns=ohe.get_feature_names(["Transport"]))
X_train_df_engrd.drop(columns=["Transport"], inplace=True)
X_train_df_engrd = X_train_df_engrd.join([X_9])

In [None]:
# Encode the held-out "Transport" column with the encoder that was fitted on
# the training split (transform, not fit_transform), so both splits end up
# with the same indicator columns.
X_9t = pd.DataFrame(ohe.transform(X_test_df_engrd[["Transport"]]), columns=ohe.get_feature_names(["Transport"]))
X_test_df_engrd.drop(["Transport"], axis=1, inplace=True)
# Join the held-out encoding X_9t.  Joining the training frame X_9 here would
# attach the transport flags of unrelated training rows to the held-out rows.
X_test_df_engrd = X_test_df_engrd.join([X_9t])

### Drop ZipCode and State
I decided to drop State because the average shipping price per state has already been derived from it.
The ZipCodes appeared to be completely unique, so I decided not to use them.

In [None]:
# State and ZipCode are not used as model inputs
X_train_df_engrd.drop(columns=["State", "ZipCode"], inplace=True)

In [None]:
# State and ZipCode are not used as model inputs
X_test_df_engrd.drop(columns=["State", "ZipCode"], inplace=True)

In [None]:
# Preview the training features after encoding and column drops
X_train_df_engrd.head()

### Determine and Plot Mutual Info score

In [None]:
# Score every training feature against Cost by mutual information and plot
# the full ranking
mi_scores = make_mi_scores(X_train_df_engrd, y_train_df["Cost"])
plot_mi_scores(mi_scores)

In [None]:
# Plot only the ten highest-scoring features (mi_scores is sorted descending)
plot_mi_scores(mi_scores.head(10))

## Normalize Cost

In [None]:
# Take the absolute value first (some costs are negative), then log1p
for target_df in (y_train_df, y_test_df):
    target_df["Cost"] = np.log1p(target_df["Cost"].abs())

## Try out some regression models (using all features)

### Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression
# Fit on the training split, then print the score on the held-out split
lnr = LinearRegression(normalize=True).fit(
    X_train_df_engrd.to_numpy(), y_train_df["Cost"].to_numpy()
)
lnr_score = lnr.score(X_test_df_engrd.to_numpy(), y_test_df["Cost"].to_numpy())
print(lnr_score)

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Fit on the training split, then print the score on the held-out split
grd = GradientBoostingRegressor(learning_rate=0.2, n_estimators=250, random_state=30).fit(
    X_train_df_engrd.to_numpy(), y_train_df["Cost"].to_numpy()
)
grd_score = grd.score(X_test_df_engrd.to_numpy(), y_test_df["Cost"].to_numpy())
print(grd_score)

### RandomForest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fit on the training split, then print the score on the held-out split
rfr = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_df_engrd.to_numpy(), y_train_df["Cost"].to_numpy()
)
rfr_score = rfr.score(X_test_df_engrd.to_numpy(), y_test_df["Cost"].to_numpy())
print(rfr_score)

### AdaBoost Regressor

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# Fit on the training split, then print the score on the held-out split
ada = AdaBoostRegressor(
    random_state=0, n_estimators=150, learning_rate=0.001, loss="square"
).fit(X_train_df_engrd.to_numpy(), y_train_df["Cost"].to_numpy())
ada_score = ada.score(X_test_df_engrd.to_numpy(), y_test_df["Cost"].to_numpy())
print(ada_score)

## Prepare Result for Submission

In [None]:
def inverseLog(cost):
    """Undo a ``log1p`` transform: return ``exp(cost) - 1`` element-wise."""
    restored = np.expm1(cost)
    return restored

In [None]:
from sklearn.metrics import mean_squared_error

# Model used for the submission: same settings as the gradient boosting trial
model = GradientBoostingRegressor(learning_rate=0.2, n_estimators=250, random_state=30)
model.fit(X_train_df_engrd.to_numpy(), y_train_df["Cost"].to_numpy())
# Predict on a NumPy array, matching the input type the model was fitted on
# (every other cell does the same; passing the DataFrame here was inconsistent).
preds = model.predict(X_test_df_engrd.to_numpy())

# Evaluate the model on the held-out split
score = mean_squared_error(y_test_df["Cost"].to_numpy(), preds)
print('MSE:', score)

In [None]:
# Load the competition test set and keep its ids for the submission file
test_df = pd.read_csv(test_file)
customer_id = test_df.loc[:, "Customer Id"]

In [None]:
def prepAndExtractFeatures(df):
    """Build the model's feature frame from a raw data frame.

    Applies the same steps as the cell-by-cell preprocessing of the held-out
    split: drop unused columns, impute missing values, parse the date and
    price columns, derive the engineered features, label-encode the
    categorical columns and one-hot encode ``Transport``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns referenced below.  It is copied
        first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The engineered feature frame.

    Notes
    -----
    Medians, modes, group means and the one-hot encoder are all computed or
    fitted on ``df`` itself.
    NOTE(review): if ``df`` lacks a ``Transport`` value that the training
    data had, the returned columns will not match the fitted model - confirm.
    """
    df_copy = df.copy()
    df_copy.drop(columns=["Customer Id", "Artist Name"], inplace=True)

    # Impute: median for the numeric columns, most frequent value for the
    # categorical ones.  The result is assigned back because calling
    # fillna(inplace=True) on a column selection is chained assignment, which
    # pandas deprecates and which does nothing under Copy-on-Write.
    for col in ("Artist Reputation", "Height", "Width", "Weight"):
        df_copy[col] = df_copy[col].fillna(df_copy[col].median())
    for col in ("Material", "Transport", "Remote Location"):
        df_copy[col] = df_copy[col].fillna(df_copy[col].mode()[0])

    for col in ("Scheduled Date", "Delivery Date"):
        df_copy[col] = pd.to_datetime(df_copy[col], format="%m/%d/%y")
    for col in ("Base Shipping Price", "Price Of Sculpture"):
        df_copy[col] = pd.to_numeric(df_copy[col])

    scheduled = df_copy["Scheduled Date"]
    delivered = df_copy["Delivery Date"]

    # Whole days between delivery and schedule, read from the timedelta
    # accessor rather than parsed out of each Timedelta's string form.
    df_copy["SchedDeliveryDiff"] = (delivered - scheduled).dt.days
    df_copy["ShipDay"] = scheduled.dt.day
    df_copy["ShipMonth"] = scheduled.dt.month
    df_copy["ShipYear"] = scheduled.dt.year
    df_copy["DeliveryDay"] = delivered.dt.day
    df_copy["DeliveryMonth"] = delivered.dt.month
    df_copy["DeliveryYear"] = delivered.dt.year

    # State is the second-to-last whitespace-separated token of the location
    df_copy["State"] = df_copy["Customer Location"].map(lambda x: str(x).split()[-2])
    df_copy["LogPriceSculpture"] = df_copy["Price Of Sculpture"].apply(np.log1p)
    df_copy["LogWeight"] = df_copy["Weight"].apply(np.log1p)
    df_copy["AvgShipPriceGrpByTransportRemote"] = df_copy.groupby(
        ["Transport", "Remote Location"]
    )["Base Shipping Price"].transform("mean")

    # The source columns are no longer needed once the features are derived
    df_copy.drop(
        columns=["Scheduled Date", "Delivery Date", "Price Of Sculpture", "Weight", "Customer Location"],
        inplace=True,
    )

    df_copy["AvgShipPricePerState"] = df_copy.groupby("State")["Base Shipping Price"].transform("mean")

    # Label-encode against fixed vocabularies
    enc = LabelEncoder()
    enc.fit(["Aluminium", "Wood", "Brass", "Bronze", "Clay", "Marble", "Stone"])
    df_copy["Material"] = enc.transform(df_copy["Material"])

    enc.fit(["Working Class", "Wealthy"])
    df_copy["Customer Information"] = enc.transform(df_copy["Customer Information"])

    enc.fit(["No", "Yes"])
    for col in ("International", "Express Shipment", "Installation Included", "Fragile", "Remote Location"):
        df_copy[col] = enc.transform(df_copy[col])

    # One-hot encode Transport.  The indicator frame reuses df_copy's index so
    # that the join below stays row-aligned even when df does not carry a
    # default RangeIndex.
    ohe = OneHotEncoder(sparse=False)
    X_8 = pd.DataFrame(
        ohe.fit_transform(df_copy[["Transport"]]),
        columns=ohe.get_feature_names(["Transport"]),
        index=df_copy.index,
    )
    df_copy.drop(columns=["Transport"], inplace=True)
    df_copy = df_copy.join([X_8])
    df_copy.drop(columns=["State"], inplace=True)

    return df_copy

def prepareResult(res):
    """Return ``log1p(res)``, i.e. ``log(1 + res)``, element-wise."""
    logged = np.log1p(res)
    return logged

In [None]:
# Engineer the competition test set and predict with the fitted model
X = prepAndExtractFeatures(test_df)
preds = model.predict(X.to_numpy())
# The model was trained on log1p(Cost); map predictions back and round to
# two decimals
result = np.round(inverseLog(preds), 2)
submission = pd.DataFrame({"Customer Id": customer_id, "Cost": result})
submission.head()

In [None]:
#submission.to_csv('./submission_ver3.csv', index=False)

## TODO: Try Other Feature Engineering methods/techniques