In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor

In [None]:
# Load the competition train/test CSVs from the mounted input directory.
df_train = pd.read_csv("../input/hackerearth-machine-learning-exhibit-art/dataset/train.csv")
df_test = pd.read_csv("../input/hackerearth-machine-learning-exhibit-art/dataset/test.csv")
# Untouched copies of the raw frames; df_test_copy later supplies the
# "Customer Id" column for the submission files.
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [None]:
def log1p(vec):
    """Return log(1 + |vec|) element-wise.

    The absolute value is taken first, so negative inputs are mapped to the
    same result as their positive counterparts.
    """
    magnitude = abs(vec)
    return np.log1p(magnitude)

def expm1(x):
    """Return exp(x) - 1 element-wise (thin wrapper around numpy.expm1)."""
    restored = np.expm1(x)
    return restored

In [None]:
# Replace the raw target with log1p(|Cost|); model predictions are mapped back
# to the cost scale with expm1 further down.
# Not idempotent: re-running this cell transforms the already-transformed column.
cost = df_train['Cost']
df_train["Cost"] = log1p(cost)

In [None]:
# Display the training frame (Cost now holds the log-transformed target).
df_train

In [None]:
# Number of missing values in each training column.
df_train.isnull().sum()

In [None]:
# Display the test frame.
df_test

In [None]:
# Missing values per training column (same check as the earlier cell;
# df_train has not been modified in between).
df_train.isnull().sum()

In [None]:
# Number of missing values in each test column.
df_test.isnull().sum()

In [None]:
# Column dtypes of the training frame.
df_train.dtypes

In [None]:
# Pairwise correlation of the numeric columns only. The frame still contains
# text columns at this point, and DataFrame.corr() no longer drops them
# silently in pandas >= 2.0, so select the numeric/bool columns explicitly.
df_train.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Build a pandas_profiling report over the training frame.
report_train = pandas_profiling.ProfileReport(df_train)

In [None]:
# Show the profiling report as this cell's output.
report_train

In [None]:
# How often each distinct "Artist Reputation" value occurs in the training data.
df_train["Artist Reputation"].value_counts()

In [None]:
# Side-by-side scatter of how often each "Artist Reputation" score occurs in
# the training data (left) and the test data (right).
plot = plt.figure(figsize=(15,5))
ax1 = plot.add_subplot(121)
ax2 = plot.add_subplot(122)
for axis, frame, title, colormap in (
    (ax1, df_train, 'Train Data', 'viridis'),
    (ax2, df_test, 'Test Data', 'viridis_r'),
):
    counts = frame["Artist Reputation"].value_counts()
    # One vectorized scatter call per panel. `c` is required for `cmap` to
    # take effect; without it matplotlib ignores the colormap. This also
    # avoids Series.iteritems(), which was removed in pandas 2.0.
    axis.scatter(counts.index, counts.values, c=counts.values, cmap=colormap)
    # Titles/labels are set once per panel instead of once per plotted point.
    axis.set_title(title)
    axis.set_xlabel('Artist Reputation score')
    axis.set_ylabel('Count')

In [None]:
# Numeric columns whose gaps are filled by fill_mean.
mean_to_be_filled = [
    "Artist Reputation",
    "Height",
    "Width",
    "Weight",
]
# Categorical columns whose gaps are filled by fill_not_available.
not_available_to_be_filled = [
    "Transport",
    "Material",
    "Remote Location",
]

In [None]:
def fill_mean(df):
    """Fill the missing entries of a numeric Series, in place, with its mean.

    The mean is computed as the average of the distinct values weighted by
    how often each occurs (missing entries are not counted).

    Parameters
    ----------
    df : pandas.Series
        Numeric column to impute. Despite the parameter name this is a single
        Series, not a DataFrame. It is modified in place.

    Returns
    -------
    None
    """
    counts = df.value_counts()
    if counts.empty:
        # Every entry is missing: there is no mean to impute with, and
        # np.average would raise on all-zero weights. Leave the column as is.
        return
    # Read values and counts straight from the value_counts result instead of
    # looping with Series.iteritems(), which was removed in pandas 2.0.
    weighted_mean = np.average(counts.index.to_numpy(), weights=counts.to_numpy())
    df.fillna(weighted_mean, inplace=True)

In [None]:
# Mean-impute every numeric column listed in mean_to_be_filled, for both frames.
for frame in (df_train, df_test):
    for column_name in mean_to_be_filled:
        column = frame[column_name]
        fill_mean(column)  # fills this Series object in place
        # Write the filled Series back explicitly. Mutating `frame[col]` in
        # place is chained assignment: under pandas Copy-on-Write the parent
        # frame would silently keep its missing values.
        frame[column_name] = column

In [None]:
def fill_not_available(df):
    """Fill the missing entries of a Series, in place, with its most frequent value.

    Parameters
    ----------
    df : pandas.Series
        Column to impute (a single Series despite the parameter name).
        It is modified in place. When several values tie for most frequent,
        the first one returned by ``Series.mode()`` is used.

    Returns
    -------
    None
    """
    modes = df.mode()
    if modes.empty:
        # All entries are missing, so there is no mode; indexing the empty
        # result would raise. Leave the column untouched.
        return
    df.fillna(modes.iloc[0], inplace=True)

In [None]:
# Mode-impute every categorical column listed in not_available_to_be_filled,
# for both frames.
for frame in (df_train, df_test):
    for column_name in not_available_to_be_filled:
        column = frame[column_name]
        fill_not_available(column)  # fills this Series object in place
        # Write the filled Series back explicitly. Mutating `frame[col]` in
        # place is chained assignment: under pandas Copy-on-Write the parent
        # frame would silently keep its missing values.
        frame[column_name] = column

In [None]:
# Row-index sets describing which of Weight / Width / Height are missing.
# NOTE(review): this runs after the mean-imputation cells, so these masks are
# presumably all False by now — confirm whether this exploration is still needed.
weight_missing = df_train["Weight"].isna()
width_missing = df_train["Width"].isna()
height_missing = df_train["Height"].isna()
dimensions_known = ~height_missing & ~width_missing

list_weight = df_train.loc[weight_missing].index
list_width = df_train.loc[width_missing].index
list_height = df_train.loc[height_missing].index
# Weight missing while both dimensions are present.
list_weight_to_be_updated = df_train.loc[weight_missing & dimensions_known].index
# All three measurements present.
list_weight_height_width_available = df_train.loc[~weight_missing & dimensions_known].index

In [None]:
# Rows where both Height and Width are missing.
rows_missing_height = set(list_height)
rows_missing_width = set(list_width)
list_height_width_na = list(rows_missing_height.intersection(rows_missing_width))

In [None]:
# Preview rows whose Weight is missing while Height and Width are known.
# list_weight_to_be_updated holds index *labels*, so select with .loc
# (.iloc is positional and only agrees while the index is the default 0..n-1).
df_train.loc[list_weight_to_be_updated].head()

In [None]:
# Per-material mean of the size columns for rows with missing Weight.
# - Select rows by label with .loc (the list holds index labels, not positions).
# - Select several groupby columns with a list; the bare-tuple form
#   ["Height","Weight","Width"] was removed in pandas 2.0.
df_train.loc[list_weight_to_be_updated].groupby("Material")[["Height","Weight","Width"]].mean()

In [None]:
# Per-material mean of the size columns for rows where all three are present.
# - Select rows by label with .loc (the list holds index labels, not positions).
# - Select several groupby columns with a list; the bare-tuple form
#   ["Height","Weight","Width"] was removed in pandas 2.0.
df_train.loc[list_weight_height_width_available].groupby("Material")[["Height","Weight","Width"]].mean()

In [None]:
# Preview rows where Weight, Height and Width are all present.
# The list holds index *labels*, so select with .loc rather than positional .iloc.
df_train.loc[list_weight_height_width_available].head()

In [None]:
# Display the training frame after imputation.
df_train

In [None]:
# Display the test frame after imputation.
df_test

In [None]:
def clean_weight(df):
    """Return the values of ``df`` as a list of floats rounded to 2 decimals.

    ``df`` is any iterable of values accepted by ``float()`` (here: a Series).
    """
    return [round(float(raw_value), 2) for raw_value in df]
# Normalise the Weight column of both frames to floats with 2 decimals.
for frame in (df_train, df_test):
    frame["Weight"] = clean_weight(frame["Weight"])

In [None]:
# Display the training frame after the Weight clean-up.
df_train

In [None]:
# Display the test frame after the Weight clean-up.
df_test

In [None]:
def change_date_time(df,i):
    """Return column ``i`` of ``df`` parsed with ``pd.to_datetime``.

    ``df`` itself is not modified; the caller assigns the result back.
    """
    raw_column = df[i]
    parsed = pd.to_datetime(raw_column)
    return parsed

In [None]:
# Columns that are parsed into datetime values.
cols_to_change_date = ["Scheduled Date","Delivery Date"]

In [None]:
# Parse the date columns of both frames into datetime values.
for frame in (df_train, df_test):
    for date_column in cols_to_change_date:
        frame[date_column] = change_date_time(frame, date_column)
    

In [None]:
def clean_date(df):
    """Return "Scheduled Date" minus "Delivery Date" as a list of day-count strings.

    Each difference is rendered with ``str()`` and only its first
    whitespace-separated token (the signed day count) is kept.
    """
    elapsed = df["Scheduled Date"] - df["Delivery Date"]
    return [str(delta).split()[0] for delta in elapsed]

In [None]:
# Add the scheduled-minus-delivery day difference to both frames as an integer column.
for frame in (df_train, df_test):
    frame["date_diff"] = clean_date(frame)
    frame["date_diff"] = frame["date_diff"].astype("int")

In [None]:
# Display the training frame with the new date_diff column.
df_train

In [None]:
def clean_customer_location(df):
    """Return, for each string in ``df``, its second-to-last whitespace-separated token."""
    return [address.split()[-2] for address in df]

In [None]:
# Reduce "Customer Location" in both frames to the token extracted by
# clean_customer_location.
for frame in (df_train, df_test):
    frame["Customer Location"] = clean_customer_location(frame["Customer Location"])

In [None]:
# Categorical columns expanded into one indicator column per category.
cols_to_be_one_hot_encoded = [
    "Material",
    "Transport",
    "Customer Location",
]
# Two-valued columns mapped to 0/1 by ordinal_encoding.
cols_to_be_ordinally_encoded = [
    "International",
    "Express Shipment",
    "Installation Included",
    "Fragile",
    "Customer Information",
    "Remote Location",
]

In [None]:
def one_hot_encoding(df,column):
    """Return ``df`` with ``column`` one-hot encoded by category_encoders.

    A new encoder is fitted on ``df`` on every call; the generated columns are
    named after the category values (``use_cat_names=True``).
    """
    encoder = ce.OneHotEncoder(cols=column, return_df=True, use_cat_names=True)
    return encoder.fit_transform(df)

In [None]:
# One-hot encode each listed column in both frames.
# NOTE(review): train and test are encoded by independently fitted encoders,
# so their resulting column sets can differ when a category appears in only
# one of them; the later reindex of df_test aligns the columns.
for categorical_column in cols_to_be_one_hot_encoded:
    df_train = one_hot_encoding(df_train, categorical_column)
    df_test = one_hot_encoding(df_test, categorical_column)

In [None]:
def ordinal_encoding(df,column):
    """Return ``df`` with ``column`` mapped to 0/1 through a fixed lookup.

    'No' and 'Working Class' become 0; 'Yes' and 'Wealthy' become 1.
    """
    value_codes = {'Yes':1,'No':0,'Working Class':0,'Wealthy':1}
    encoder = ce.OrdinalEncoder(
        cols=column,
        return_df=True,
        mapping=[{'col': column, 'mapping': value_codes}],
    )
    return encoder.fit_transform(df)

In [None]:
# Map each listed two-valued column to 0/1 in both frames.
for binary_column in cols_to_be_ordinally_encoded:
    df_train = ordinal_encoding(df_train, binary_column)
    df_test = ordinal_encoding(df_test, binary_column)

In [None]:
# Training column order without the target; used to align the test frame.
new_order = df_train.columns.tolist()
del new_order[new_order.index("Cost")]

In [None]:
# Give the test frame exactly the training feature columns, in the same order.
# Columns that exist only in train are one-hot indicators for categories that
# never occur in the test data, so they must be 0 — without fill_value they
# would be created as all-NaN columns and flow into scaling and prediction.
df_test = df_test.reindex(columns=new_order, fill_value=0)

In [None]:
# Columns removed from both frames before modelling (read by drop_columns).
columns_to_drop = ["Customer Id","Artist Name","Remote Location","Scheduled Date","Delivery Date"]

In [None]:
def drop_columns(df):
    """Remove, in place, every column named in the module-level ``columns_to_drop``."""
    df.drop(columns=columns_to_drop, inplace=True)
# Drop the unused columns from both frames.
for frame in (df_train, df_test):
    drop_columns(frame)

In [None]:
# Display the fully encoded training frame.
df_train

In [None]:
# Print every column name with its dtype to check that all features are numeric.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for column_name, dtype in df_train.dtypes.items():
    print(column_name, dtype)

In [None]:
# Split the training frame into features and (log-scale) target; the test
# frame is already features only.
Y_train = df_train["Cost"]
X_train = df_train.drop(columns="Cost")
X_test = df_test.copy()

In [None]:
# Scaler used to standardize the feature matrices.
st = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only ...
df_train_standardized = st.fit_transform(X_train)
# ... and apply those same statistics to the test features. Re-fitting on the
# test set would scale train and test differently, so the model would see
# test features on a different scale than the one it was trained on.
df_test_standardized = st.transform(X_test)

In [None]:
# Wrap the scaled arrays back into DataFrames carrying the feature names.
df_train_final = pd.DataFrame(
    data=df_train_standardized,
    columns=X_train.columns,
)
df_test_final = pd.DataFrame(
    data=df_test_standardized,
    columns=df_test.columns,
)

In [None]:
# Baseline random forest (100 trees, fixed seed) trained on the standardized
# features against the log-scale target.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(df_train_final, Y_train)

In [None]:
# Log-scale predictions for the test set; abs() keeps them non-negative
# (the training target, log1p(|Cost|), is never negative).
Y_prediction_rf = abs(rf.predict(df_test_final))

In [None]:
# Show the raw (log-scale) predictions.
Y_prediction_rf

In [None]:
# Round each prediction to 2 decimals.
# NOTE(review): the values are still on the log scale here, so this rounding
# happens before the conversion back to cost — confirm that is intended.
Y_pred = [round(float(prediction), 2) for prediction in Y_prediction_rf]

In [None]:
# Convert the log-scale predictions back to the cost scale BEFORE building the
# submission. Previously the conversion ran after "Cost" had been assigned,
# so the submission contained log1p values instead of costs.
Y_pred = expm1(Y_pred)
df_submission = pd.DataFrame()
df_submission["Customer Id"] = df_test_copy["Customer Id"]
df_submission["Cost"] = Y_pred

In [None]:
# Write the baseline random-forest submission. index=False keeps the file to
# the two real columns; the default would also write the meaningless
# 0..n-1 row index as an extra unnamed first column.
df_submission.to_csv("submission_rf_basic.csv", index=False)

In [None]:
# Search space for the random-forest hyper-parameter search.
param_grid = [
    {
        'n_estimators': [50, 100, 250, 500],
        'max_depth': [10, 50, 100],
        'bootstrap': [True, False],
    },
]

In [None]:
# Randomized hyper-parameter search over param_grid with 10-fold CV.
# random_state fixes which parameter combinations are sampled, so a
# Restart & Run All reproduces the same search (same seed as the forest).
random_search_forest = RandomizedSearchCV(rf, param_grid, cv=10, random_state=42)

In [None]:
# Run the cross-validated search on the standardized training data.
random_search_forest.fit(df_train_final, Y_train)

In [None]:
# Estimator with the best cross-validation score found by the search.
tuned_rf_best_random = random_search_forest.best_estimator_

In [None]:
# Show the selected estimator and its hyper-parameters.
tuned_rf_best_random

In [None]:
# Fit the selected estimator on the full training set.
# NOTE(review): with RandomizedSearchCV's default refit behaviour,
# best_estimator_ is presumably already fitted on this same data — confirm
# before removing this cell.
tuned_rf_best_random.fit(df_train_final, Y_train)

In [None]:
# Log-scale test predictions from the tuned forest, forced non-negative.
tuned_rf_random_pred = abs(tuned_rf_best_random.predict(df_test_final))

In [None]:
# Map the log-scale predictions back to the cost scale.
# Not idempotent: re-running this cell applies expm1 a second time.
tuned_rf_random_pred = expm1(tuned_rf_random_pred)

In [None]:
# Submission frame for the tuned random forest: customer ids plus predicted cost.
df_submission = pd.DataFrame(
    {
        "Customer Id": df_test_copy["Customer Id"],
        "Cost": tuned_rf_random_pred,
    }
)

In [None]:
# Write the tuned random-forest submission. index=False keeps the file to the
# two real columns; the default would also write the meaningless 0..n-1 row
# index as an extra unnamed first column.
df_submission.to_csv("submission_rf_hyperparameters.csv", index=False)

In [None]:
# LightGBM with default settings, trained on the unscaled features
# (X_train / X_test, not the standardized frames) against the log-scale target.
lgbm = LGBMRegressor()
lgbm.fit(X_train, Y_train)
y_test_pred_lgbm = lgbm.predict(X_test)
# Map the log-scale predictions back to the cost scale.
y_test_pred_lgbm2 = expm1(y_test_pred_lgbm)

In [None]:
# Submission frame for LightGBM: customer ids plus predicted cost.
df_submission_lgbm = pd.DataFrame(
    {
        "Customer Id": df_test_copy["Customer Id"],
        "Cost": y_test_pred_lgbm2,
    }
)

In [None]:
# Write the LightGBM submission. index=False keeps the file to the two real
# columns; the default would also write the meaningless 0..n-1 row index as an
# extra unnamed first column.
df_submission_lgbm.to_csv("submission_lgbm.csv", index=False)