In [59]:
import pandas as pd
import category_encoders as ce
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import seaborn as sns
import matplotlib as mlb
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn import  metrics


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from tpot import TPOTRegressor

import xgboost as xgb
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from IPython.display import display
import os
#plt.style.use("seaborn")
%matplotlib inline  
import warnings
warnings.filterwarnings("ignore")

In [60]:
data = pd.read_csv(r'cac_dataset/customer_acquisition_costs.csv')

In [61]:
def preprocess_train(data):

    train_data = data.drop(["avg_cars_at home(approx).1"], axis=1)
    train_data["avg. yearly_income"] = train_data['avg. yearly_income'].str.replace(r'\$', '', regex=True) #cleaned yearly income
    def media_cleaner(value):
        if 'Daily' in value:
            value = "Daily Media"
        elif 'Sunday' in value:
            value = "Sunday Media"
        return value
    train_data["media_type"] = train_data["media_type"].apply(media_cleaner) #cleaned mediatype

    # Map the data value to strength score
    education_mapping = {
        "Partial High School": 1,
        "High School Degree": 2,
        "Partial College": 3,
        "Bachelors Degree": 4,
        "Graduate Degree": 5
    }

    houseowner_mapping = {"Y": 1, "N": 0}

    occupation_mapping = {'Manual' :1,
                        'Skilled Manual':2,
                        'Clerical':3,
                        'Professional':4,
                        'Management':5}

    member_card_mapping = {
        "Normal": 1,
        "Bronze": 2,
        "Silver": 3,
        "Golden": 4
    }
    income_mapping = {
        "10K - 30K": 1,
        "30K - 50K": 2,
        "50K - 70K": 3,
        "70K - 90K": 4,
        "90K - 110K": 5,
        "110K - 130K": 6,
        "130K - 150K": 7,
        "150K +": 8
    }
    car_mapping = {
        "0 car": 0.0,
        "1 car": 1.0,
        "2 car": 2.0,
        "3 car": 3.0,
        "4 car": 4.0
    }

    weights = {
        "education_encoded": 0.1,
        "houseowner_encoded": 0.15,
        "member_card_encoded": 0.2,
        "income_encoded": 0.3, 
        "occupation_encoded": 0.1,
        "cars_at_home" : 0.15
    }

    train_data["profile_strength"] = (
        (train_data["education"].map(education_mapping)) * weights["education_encoded"] +
        (train_data["houseowner"].map(houseowner_mapping)) * weights["houseowner_encoded"] +
        (train_data["member_card"].map(member_card_mapping)) * weights["member_card_encoded"] +
        (train_data["avg. yearly_income"].map(income_mapping)) * weights["income_encoded"] +
        (train_data["occupation"].map(occupation_mapping)) * weights["occupation_encoded"] +
        (train_data["avg_cars_at home(approx)"]) * weights["cars_at_home"] 
    ) 

    ordinal_cat = ["education","occupation","member_card","avg. yearly_income","store_type"]
    dic = {}
    for i in range(len(ordinal_cat)):
        dic[ordinal_cat[i]] = train_data[ordinal_cat[i]].unique()


    codes = [0,1,2,3,4,5,6,7]
    ed_map = pd.DataFrame({"education" : ["Partial High School","High School Degree","Partial College","Bachelors Degree","Graduate Degree"],
                        "encoding_values" : codes[:5]})
    mem_map = pd.DataFrame({"member_card":["Normal","Bronze","Silver","Golden"],
                            "encoding_values" : codes[:4]})
    occ_map = pd.DataFrame({"occupation" : ['Manual', 'Skilled Manual', 'Clerical', 'Professional', 'Management'],
                        "encoding_values" : codes[:5]})
    inc_map = pd.DataFrame({"avg. yearly_income" :['10K - 30K', '30K - 50K', '50K - 70K', '70K - 90K','90K - 110K', '110K - 130K', '130K - 150K', '150K +'],
                            "encoding_values" : codes})
    store_map = pd.DataFrame({"store_type":['Small Grocery', 'Mid-Size Grocery', 'Supermarket', 'Gourmet Supermarket', 'Deluxe Supermarket'],
                            "encoding_values" : codes[:5]})

    ordinal_order = [ed_map[ordinal_cat[0]],occ_map[ordinal_cat[1]],mem_map[ordinal_cat[2]],inc_map[ordinal_cat[3]],store_map[ordinal_cat[4]]]

    for i in range(len(ordinal_order)):
        train_data[ordinal_cat[i]] = pd.Categorical(train_data[ordinal_cat[i]], categories=ordinal_order[i], ordered=True).codes


    nominal_cat = train_data.select_dtypes('object').columns



    # Apply binary encoding
    binary_encoder = ce.BinaryEncoder(cols=nominal_cat)
    train_encoded = binary_encoder.fit_transform(train_data)

    def get_stats(data,col):
        data_column = data[col]
        op_mean = np.mean(data_column)
        op_median = np.median(data_column)
        op_std = np.std(data_column)
        op_var = np.var(data_column)
        op_min = np.min(data_column)
        op_max = np.max(data_column)
        op_1q = data_column.quantile(0.25)
        op_3q = data_column.quantile(0.75)
        op_iqr = op_3q - op_1q
        op_upper = min((op_3q + 1.5*(op_iqr)),data_column.max())
        op_lower = max((op_1q - 1.5*(op_iqr)),data_column.min())
        op_outliers = ((data_column < op_lower) | (data_column > op_upper)).sum()
        dic_stats = {"Mean": [op_mean],
                    "Median":[op_median],
                    "Standard-Deviation": [op_std],
                    "Variance": [op_var],
                    "Minimum Value": [op_min],
                    "Maximum Value": [op_max],
                    "Q1": [op_1q],
                    "Q3": [op_3q],
                    "IQR": [op_iqr],
                    "Upper Fence":[op_upper],
                    "Lower Fence": [op_lower],
                    "Outliers count":[op_outliers]}

        return dic_stats

    dic_sales = get_stats(train_encoded, "store_sales(in millions)") 
    train_encoded["store_sales(in millions)"] = np.where(train_encoded["store_sales(in millions)"] > dic_sales["Upper Fence"][0], dic_sales["Upper Fence"][0], (np.where(train_encoded["store_sales(in millions)"] < dic_sales["Lower Fence"][0], dic_sales["Lower Fence"][0], train_encoded["store_sales(in millions)"])))

    dic_cost = get_stats(train_encoded, "store_cost(in millions)") 
    train_encoded["store_cost(in millions)"] = np.where(train_encoded["store_cost(in millions)"] > dic_cost["Upper Fence"][0], dic_cost["Upper Fence"][0], (np.where(train_encoded["store_cost(in millions)"] < dic_cost["Lower Fence"][0], dic_cost["Lower Fence"][0], train_encoded["store_cost(in millions)"])))

    scaler = MinMaxScaler()
    preprocessed_data = pd.DataFrame(scaler.fit_transform(train_encoded), index=train_encoded.index, columns=train_encoded.columns)

    return preprocessed_data, binary_encoder, scaler

In [4]:
def preprocess_test(data, binary_encoder, scaler):

    train_data = data.drop(["avg_cars_at home(approx).1"], axis=1)
    train_data["avg. yearly_income"] = train_data['avg. yearly_income'].str.replace(r'\$', '', regex=True) #cleaned yearly income
    def media_cleaner(value):
        if 'Daily' in value:
            value = "Daily Media"
        elif 'Sunday' in value:
            value = "Sunday Media"
        return value
    train_data["media_type"] = train_data["media_type"].apply(media_cleaner) #cleaned mediatype

    # Map the data value to strength score
    education_mapping = {
        "Partial High School": 1,
        "High School Degree": 2,
        "Partial College": 3,
        "Bachelors Degree": 4,
        "Graduate Degree": 5
    }

    houseowner_mapping = {"Y": 1, "N": 0}

    occupation_mapping = {'Manual' :1,
                        'Skilled Manual':2,
                        'Clerical':3,
                        'Professional':4,
                        'Management':5}

    member_card_mapping = {
        "Normal": 1,
        "Bronze": 2,
        "Silver": 3,
        "Golden": 4
    }
    income_mapping = {
        "10K - 30K": 1,
        "30K - 50K": 2,
        "50K - 70K": 3,
        "70K - 90K": 4,
        "90K - 110K": 5,
        "110K - 130K": 6,
        "130K - 150K": 7,
        "150K +": 8
    }
    car_mapping = {
        "0 car": 0.0,
        "1 car": 1.0,
        "2 car": 2.0,
        "3 car": 3.0,
        "4 car": 4.0
    }

    weights = {
        "education_encoded": 0.1,
        "houseowner_encoded": 0.15,
        "member_card_encoded": 0.2,
        "income_encoded": 0.3, 
        "occupation_encoded": 0.1,
        "cars_at_home" : 0.15
    }

    train_data["profile_strength"] = (
        (train_data["education"].map(education_mapping)) * weights["education_encoded"] +
        (train_data["houseowner"].map(houseowner_mapping)) * weights["houseowner_encoded"] +
        (train_data["member_card"].map(member_card_mapping)) * weights["member_card_encoded"] +
        (train_data["avg. yearly_income"].map(income_mapping)) * weights["income_encoded"] +
        (train_data["occupation"].map(occupation_mapping)) * weights["occupation_encoded"] +
        (train_data["avg_cars_at home(approx)"]) * weights["cars_at_home"] 
    ) 

    ordinal_cat = ["education","occupation","member_card","avg. yearly_income","store_type"]
    dic = {}
    for i in range(len(ordinal_cat)):
        dic[ordinal_cat[i]] = train_data[ordinal_cat[i]].unique()


    codes = [0,1,2,3,4,5,6,7]
    ed_map = pd.DataFrame({"education" : ["Partial High School","High School Degree","Partial College","Bachelors Degree","Graduate Degree"],
                        "encoding_values" : codes[:5]})
    mem_map = pd.DataFrame({"member_card":["Normal","Bronze","Silver","Golden"],
                            "encoding_values" : codes[:4]})
    occ_map = pd.DataFrame({"occupation" : ['Manual', 'Skilled Manual', 'Clerical', 'Professional', 'Management'],
                        "encoding_values" : codes[:5]})
    inc_map = pd.DataFrame({"avg. yearly_income" :['10K - 30K', '30K - 50K', '50K - 70K', '70K - 90K','90K - 110K', '110K - 130K', '130K - 150K', '150K +'],
                            "encoding_values" : codes})
    store_map = pd.DataFrame({"store_type":['Small Grocery', 'Mid-Size Grocery', 'Supermarket', 'Gourmet Supermarket', 'Deluxe Supermarket'],
                            "encoding_values" : codes[:5]})

    ordinal_order = [ed_map[ordinal_cat[0]],occ_map[ordinal_cat[1]],mem_map[ordinal_cat[2]],inc_map[ordinal_cat[3]],store_map[ordinal_cat[4]]]

    for i in range(len(ordinal_order)):
        train_data[ordinal_cat[i]] = pd.Categorical(train_data[ordinal_cat[i]], categories=ordinal_order[i], ordered=True).codes


    nominal_cat = train_data.select_dtypes('object').columns



    # Apply binary encoding
    train_encoded = binary_encoder.transform(train_data)

    def get_stats(data,col):
        data_column = data[col]
        op_mean = np.mean(data_column)
        op_median = np.median(data_column)
        op_std = np.std(data_column)
        op_var = np.var(data_column)
        op_min = np.min(data_column)
        op_max = np.max(data_column)
        op_1q = data_column.quantile(0.25)
        op_3q = data_column.quantile(0.75)
        op_iqr = op_3q - op_1q
        op_upper = min((op_3q + 1.5*(op_iqr)),data_column.max())
        op_lower = max((op_1q - 1.5*(op_iqr)),data_column.min())
        op_outliers = ((data_column < op_lower) | (data_column > op_upper)).sum()
        dic_stats = {"Mean": [op_mean],
                    "Median":[op_median],
                    "Standard-Deviation": [op_std],
                    "Variance": [op_var],
                    "Minimum Value": [op_min],
                    "Maximum Value": [op_max],
                    "Q1": [op_1q],
                    "Q3": [op_3q],
                    "IQR": [op_iqr],
                    "Upper Fence":[op_upper],
                    "Lower Fence": [op_lower],
                    "Outliers count":[op_outliers]}

        return dic_stats

    dic_sales = get_stats(train_encoded, "store_sales(in millions)") 
    train_encoded["store_sales(in millions)"] = np.where(train_encoded["store_sales(in millions)"] > dic_sales["Upper Fence"][0], dic_sales["Upper Fence"][0], (np.where(train_encoded["store_sales(in millions)"] < dic_sales["Lower Fence"][0], dic_sales["Lower Fence"][0], train_encoded["store_sales(in millions)"])))

    dic_cost = get_stats(train_encoded, "store_cost(in millions)") 
    train_encoded["store_cost(in millions)"] = np.where(train_encoded["store_cost(in millions)"] > dic_cost["Upper Fence"][0], dic_cost["Upper Fence"][0], (np.where(train_encoded["store_cost(in millions)"] < dic_cost["Lower Fence"][0], dic_cost["Lower Fence"][0], train_encoded["store_cost(in millions)"])))

    preprocessed_data = pd.DataFrame(scaler.transform(train_encoded), index=train_encoded.index, columns=train_encoded.columns)

    return preprocessed_data

In [5]:
lr = LinearRegression()
svr = SVR()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor()
mlpr = MLPRegressor()
lgbmr = LGBMRegressor()
gbr = GradientBoostingRegressor()
cbr = CatBoostRegressor()
model_list = [lr,svr,dtr,rfr,mlpr,gbr,cbr,lgbmr]
model_names = ["LinearRegression","SupportVectorRegressor","DecisionTreeRegressor","RandomForestRegressor",
                "MLPRegressor", "LGBMRegressor","GradientBoostingRegressor","CatBoostRegressor"]

In [6]:
def modelling(data, model):
    train, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)
    X_train = train.drop("cost", axis=1)
    X_test = test.drop("cost", axis=1)
    y_train = train["cost"]
    y_test = test["cost"]
    X_train, binary_encoder, scaler = preprocess_train(X_train)
    X_test = preprocess_test(X_test, binary_encoder=binary_encoder, scaler=scaler)
    model.fit(X_train, y_train)
    predicted_values = model.predict(X_test)
    mse = mean_squared_error(y_test, predicted_values)
    mae = mean_absolute_error(y_test, predicted_values)
    r2_value = r2_score(y_test, predicted_values)
    adjusted_r2 = 1 - (1-r2_value)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)

    print("model used : ",model)
    print("Mean Absolute Error : ", mae)
    print("Mean Squared Error : ", mse)
    print("R2 - score : ", r2_value)
    print("Adjusted R2 - score : ",adjusted_r2)
    return r2_value, adjusted_r2, mse, mae, model, X_train, X_test

In [7]:
r2_list = []
adjr2_list = []
mse_list = []
mae_list = []
trained_models = []
for model in model_list:
    model_r2,model_adjr2, mse, mae, trained_model, X_train_post, X_test_post = modelling(data, model)
    r2_list.append(model_r2)
    adjr2_list.append(model_adjr2)
    mse_list.append(mse)
    mae_list.append(mae)
    trained_models.append(trained_model)

model used :  LinearRegression()
Mean Absolute Error :  24.216674010041782
Mean Squared Error :  810.1593381992396
R2 - score :  0.10333158308051293
Adjusted R2 - score :  0.0978071918681207
model used :  SVR()
Mean Absolute Error :  17.62762777763817
Mean Squared Error :  601.3803396625232
R2 - score :  0.3344040712654416
Adjusted R2 - score :  0.33030332205835167
model used :  DecisionTreeRegressor()
Mean Absolute Error :  0.07825004137037414
Mean Squared Error :  2.6874100198576865
R2 - score :  0.9970256274605493
Adjusted R2 - score :  0.9970073022946249
model used :  RandomForestRegressor()
Mean Absolute Error :  0.1288801340395395
Mean Squared Error :  1.699324987667551
R2 - score :  0.9981192205351722
Adjusted R2 - score :  0.9981076330170308
model used :  MLPRegressor()
Mean Absolute Error :  0.6831033463912701
Mean Squared Error :  3.5785350633696558
R2 - score :  0.9960393477938615
Adjusted R2 - score :  0.9960149461401062
model used :  GradientBoostingRegressor()
Mean Absolu

In [8]:
scores_data = pd.DataFrame({"Model" : model_names,
                            "MAE" : mae_list,
                            "MSE" : mse_list,
                            "R2-score" : r2_list,
                            "Adjusted R2":adjr2_list})
scores_data

Unnamed: 0,Model,MAE,MSE,R2-score,Adjusted R2
0,LinearRegression,24.216674,810.159338,0.103332,0.097807
1,SupportVectorRegressor,17.627628,601.38034,0.334404,0.330303
2,DecisionTreeRegressor,0.07825,2.68741,0.997026,0.997007
3,RandomForestRegressor,0.12888,1.699325,0.998119,0.998108
4,MLPRegressor,0.683103,3.578535,0.996039,0.996015
5,LGBMRegressor,17.512286,460.472259,0.490358,0.487218
6,GradientBoostingRegressor,0.701029,3.426037,0.996208,0.996185
7,CatBoostRegressor,3.234687,21.212309,0.976523,0.976378


In [9]:
scores_data.to_csv('cac_dataset/scores_dataset.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [43]:
def preprocessor(X, split):
    X = X.drop(["avg_cars_at home(approx).1"], axis=1)
    X["avg. yearly_income"] = X['avg. yearly_income'].str.replace(r'\$', '', regex=True)

    def media_cleaner(value):
        if 'Daily' in value:
            value = "Daily Media"
        elif 'Sunday' in value:
            value = "Sunday Media"
        return value
    
    X["media_type"] = X["media_type"].apply(media_cleaner) 


    education_mapping = {"Partial High School": 1,"High School Degree": 2,"Partial College": 3,"Bachelors Degree": 4,"Graduate Degree": 5}
    houseowner_mapping = {"Y": 1, "N": 0}
    occupation_mapping = {'Manual' :1,'Skilled Manual':2,'Clerical':3,'Professional':4,'Management':5}
    member_card_mapping = {"Normal": 1,"Bronze": 2,"Silver": 3,"Golden": 4}
    income_mapping = {"10K - 30K": 1,"30K - 50K": 2,"50K - 70K": 3,"70K - 90K": 4,"90K - 110K": 5,"110K - 130K": 6,"130K - 150K": 7,"150K +": 8}
    car_mapping = {"0 car": 0.0,"1 car": 1.0,"2 car": 2.0,"3 car": 3.0,"4 car": 4.0}
    weights = {"education_encoded": 0.1,"houseowner_encoded": 0.15,"member_card_encoded": 0.2,"income_encoded": 0.3, "occupation_encoded": 0.1,"cars_at_home" : 0.15}
    X["profile_strength"] = (
        (X["education"].map(education_mapping)) * weights["education_encoded"] +
        (X["houseowner"].map(houseowner_mapping)) * weights["houseowner_encoded"] +
        (X["member_card"].map(member_card_mapping)) * weights["member_card_encoded"] +
        (X["avg. yearly_income"].map(income_mapping)) * weights["income_encoded"] +
        (X["occupation"].map(occupation_mapping)) * weights["occupation_encoded"] +
        (X["avg_cars_at home(approx)"]) * weights["cars_at_home"] 
    ) 
    ordinal_cat = ["education","occupation","member_card","avg. yearly_income","store_type"]


    codes = [0,1,2,3,4,5,6,7]
    ed_map = {"education" : ["Partial High School","High School Degree","Partial College","Bachelors Degree","Graduate Degree"],"encoding_values" : codes[:5]}
    mem_map = {"member_card":["Normal","Bronze","Silver","Golden"],"encoding_values" : codes[:4]}
    occ_map = {"occupation" : ['Manual', 'Skilled Manual', 'Clerical', 'Professional', 'Management'],"encoding_values" : codes[:5]}
    inc_map = {"avg. yearly_income" :['10K - 30K', '30K - 50K', '50K - 70K', '70K - 90K','90K - 110K', '110K - 130K', '130K - 150K', '150K +'],"encoding_values" : codes}
    store_map = {"store_type":['Small Grocery', 'Mid-Size Grocery', 'Supermarket', 'Gourmet Supermarket', 'Deluxe Supermarket'],"encoding_values" : codes[:5]}
    ordinal_order = [ed_map[ordinal_cat[0]],occ_map[ordinal_cat[1]],mem_map[ordinal_cat[2]],inc_map[ordinal_cat[3]],store_map[ordinal_cat[4]]]
    for i in range(len(ordinal_order)):
        X[ordinal_cat[i]] = pd.Categorical(X[ordinal_cat[i]], categories=ordinal_order[i], ordered=True).codes

    def get_stats(data,col):
        data_column = data[col]
        op_mean = np.mean(data_column)
        op_median = np.median(data_column)
        op_std = np.std(data_column)
        op_var = np.var(data_column)
        op_min = np.min(data_column)
        op_max = np.max(data_column)
        op_1q = data_column.quantile(0.25)
        op_3q = data_column.quantile(0.75)
        op_iqr = op_3q - op_1q
        op_upper = min((op_3q + 1.5*(op_iqr)),data_column.max())
        op_lower = max((op_1q - 1.5*(op_iqr)),data_column.min())
        op_outliers = ((data_column < op_lower) | (data_column > op_upper)).sum()
        dic_stats = {"Mean": [op_mean],
                    "Median":[op_median],
                    "Standard-Deviation": [op_std],
                    "Variance": [op_var],
                    "Minimum Value": [op_min],
                    "Maximum Value": [op_max],
                    "Q1": [op_1q],
                    "Q3": [op_3q],
                    "IQR": [op_iqr],
                    "Upper Fence":[op_upper],
                    "Lower Fence": [op_lower],
                    "Outliers count":[op_outliers]}

        return dic_stats
    
    dic_sales = get_stats(X, "store_sales(in millions)") 
    X["store_sales(in millions)"] = np.where(X["store_sales(in millions)"] > dic_sales["Upper Fence"][0], dic_sales["Upper Fence"][0], (np.where(X["store_sales(in millions)"] < dic_sales["Lower Fence"][0], dic_sales["Lower Fence"][0], X["store_sales(in millions)"])))
    dic_cost = get_stats(X, "store_cost(in millions)") 
    X["store_cost(in millions)"] = np.where(X["store_cost(in millions)"] > dic_cost["Upper Fence"][0], dic_cost["Upper Fence"][0], (np.where(X["store_cost(in millions)"] < dic_cost["Lower Fence"][0], dic_cost["Lower Fence"][0],X["store_cost(in millions)"])))


    train, test = train_test_split(X,test_size=split,random_state=42,shuffle=True)
    X_train = train.drop("cost", axis=1)
    X_test = test.drop("cost", axis=1)
    y_train = train["cost"]
    y_test = test["cost"]
     

    nominal_cat = X_train.select_dtypes('object').columns
    binary_encoder = ce.BinaryEncoder(cols=nominal_cat)
    X_train = binary_encoder.fit_transform(X_train)
    X_test = binary_encoder.transform(X_test)

    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test
    

In [44]:
data = pd.read_csv(r'cac_dataset/customer_acquisition_costs.csv')

In [45]:
split_size = 0.15
X_train, X_test, y_train, y_test = preprocessor(data, split_size)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((51363, 74), (9065, 74), (51363,), (9065,))

In [19]:
grid = { 
    'n_estimators': [100,200,300,400,500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth' : [None,4,5,6,7,8],
    'min_samples_split' : [2,6,10],
    'min_samples_leaf':[1,3,5,7],
    'bootstrap':[True, False],
    'oob_score' : [True, False]
    }

In [20]:
random_search = RandomizedSearchCV(RandomForestRegressor(), grid, n_iter=50, cv=5, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [21]:
cv_results = pd.DataFrame(random_search.cv_results_).sort_values(by = 'rank_test_score')
cv_results.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_oob_score,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
42,171.768685,0.775888,0.37035,0.039138,True,300,10,3,,,True,"{'oob_score': True, 'n_estimators': 300, 'min_...",0.997936,0.997656,0.997855,0.997291,0.997917,0.997731,0.000241,1
12,22.224756,0.513094,0.616083,0.026865,True,200,6,3,sqrt,,True,"{'oob_score': True, 'n_estimators': 200, 'min_...",0.946312,0.945731,0.942262,0.944043,0.946122,0.944894,0.001541,2
43,49.182122,2.549344,1.626756,0.39677,True,500,2,7,log2,,True,"{'oob_score': True, 'n_estimators': 500, 'min_...",0.852224,0.853348,0.850254,0.854424,0.852363,0.852523,0.001383,3
28,7.240027,0.258755,0.263374,0.021277,False,100,10,7,log2,,True,"{'oob_score': False, 'n_estimators': 100, 'min...",0.85694,0.854227,0.848913,0.850151,0.845123,0.851071,0.00413,4
13,258.017296,4.235564,0.567689,0.083211,False,400,6,1,,8.0,False,"{'oob_score': False, 'n_estimators': 400, 'min...",0.839475,0.843293,0.83078,0.798911,0.838105,0.830113,0.01612,5


In [22]:
cv_results.to_csv('cac_dataset/rcv_scores_dataset.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [46]:
model_rf = RandomForestRegressor(n_estimators=300, min_samples_split=10,min_samples_leaf=3,bootstrap=True,oob_score=True)
model_rf.fit(X_train, y_train)
predicted_values_new = model_rf.predict(X_test)

In [47]:
mse = mean_squared_error(y_test,predicted_values_new)
r2_value = r2_score(y_test, predicted_values_new)
adjusted_r2 = 1 - (1-r2_value)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("MSE : ",mse)
print("r2_score for first model : ",r2_value)
print("adjusted r2 : ",adjusted_r2)


MSE :  1.713679676666459
r2_score for first model :  0.9980964122480205
adjusted r2 :  0.9980807431163579


In [48]:
model_rf.feature_importances_

array([5.90060955e-06, 1.57028709e-05, 9.25639252e-06, 1.24579363e-05,
       1.67362742e-05, 6.37807971e-06, 1.52896763e-05, 7.33396377e-06,
       1.43433719e-05, 1.08004912e-05, 5.38200354e-06, 1.02950770e-05,
       1.52839669e-05, 1.04229332e-04, 1.16092716e-04, 5.33653076e-05,
       5.36530732e-02, 7.90271891e-02, 6.91414605e-02, 7.41039567e-02,
       8.66137607e-02, 5.27211938e-02, 7.84029810e-03, 7.46931056e-03,
       2.30025842e-05, 1.73222840e-05, 7.13172915e-05, 6.20571151e-05,
       4.30099929e-04, 4.06928507e-04, 1.30065187e-04, 1.67179144e-04,
       4.44611732e-05, 4.64945803e-05, 1.84749779e-04, 2.75559370e-04,
       1.39450202e-04, 1.12233712e-05, 1.38879942e-05, 4.39731218e-05,
       1.16691255e-05, 1.77473528e-05, 1.15520294e-05, 9.72887533e-06,
       1.56460150e-04, 9.11535012e-05, 1.25601779e-04, 8.28150174e-06,
       1.97423416e-05, 1.09599415e-04, 2.77494312e-02, 4.68232200e-03,
       1.15520398e-02, 1.37809420e-02, 1.98020380e-02, 8.71985683e-03,
      

In [49]:
importances = model_rf.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances}).sort_values(by='Importance', ascending=False)

In [50]:
feature_list = []
mse_list = []
r2_list = []
adjr2_list = []
for i in range(5,50):
    top_features = feature_importance_df['Feature'][:i].values
    X_train_selected = X_train[top_features]
    X_test_selected = X_test[top_features]
    model_checked = RandomForestRegressor(n_estimators=300, min_samples_split=10,min_samples_leaf=3,bootstrap=True,oob_score=True)
    model_checked.fit(X_train_selected, y_train)
    predicted_values = model_checked.predict(X_test_selected)
    mse = mean_squared_error(y_test,predicted_values)
    mse_list.append(mse)
    r2_value = r2_score(y_test, predicted_values)
    r2_list.append(r2_value)
    adjusted_r2 = 1 - (1-r2_value)*(len(y_test)-1)/(len(y_test)-X_test_selected.shape[1]-1)
    adjr2_list.append(adjusted_r2)
    feature_list.append(i)
    print("run no..............  ",i)
    print("adjusted r2 score ::::::::::  ",adjusted_r2)

   

run no..............   5
adjusted r2 score ::::::::::   0.644537585674265
run no..............   6
adjusted r2 score ::::::::::   0.6444211309859971
run no..............   7
adjusted r2 score ::::::::::   0.8500759294486382
run no..............   8
adjusted r2 score ::::::::::   0.9077152321335862
run no..............   9
adjusted r2 score ::::::::::   0.9566758317809717
run no..............   10
adjusted r2 score ::::::::::   0.9914466710114592
run no..............   11
adjusted r2 score ::::::::::   0.9914466768075686
run no..............   12
adjusted r2 score ::::::::::   0.9914486704731236
run no..............   13
adjusted r2 score ::::::::::   0.9914476528569195
run no..............   14
adjusted r2 score ::::::::::   0.996118949296434
run no..............   15
adjusted r2 score ::::::::::   0.9961150116262864
run no..............   16
adjusted r2 score ::::::::::   0.9964727432159579
run no..............   17
adjusted r2 score ::::::::::   0.9964746382864222
run no.............

In [58]:
feature_select_df = pd.DataFrame({"Top K features":feature_list,
                                  "MSE" : mse_list,
                                  "R2-score" : r2_list,
                                  "Adjusted R2" : adjr2_list}) 
feature_select_df.head()

Unnamed: 0,Top K features,MSE,R2-score,Adjusted R2
0,5,319.823811,0.644734,0.644538
1,6,319.893274,0.644657,0.644421
2,7,134.862927,0.850192,0.850076
3,8,83.004815,0.907797,0.907715
4,9,38.963282,0.956719,0.956676


In [57]:
feature_select_df.sort_values(by = 'Adjusted R2', ascending=False).head()

Unnamed: 0,Top K features,MSE,R2-score,Adjusted R2
31,36,0.653139,0.999274,0.999272
32,37,0.834858,0.999073,0.999069
34,39,1.009328,0.998879,0.998874
35,40,1.057151,0.998826,0.99882
30,35,1.066179,0.998816,0.998811


In [53]:
top_features = feature_importance_df['Feature'][:36].values
X_train_sel = X_train[top_features]
X_train_sel.columns


Index(['promotion_name_4', 'promotion_name_1', 'grocery_sqft',
       'promotion_name_3', 'promotion_name_2', 'store_sqft', 'media_type_3',
       'media_type_1', 'promotion_name_0', 'promotion_name_5', 'frozen_sqft',
       'meat_sqft', 'store_type', 'media_type_0', 'store_state_3',
       'media_type_2', 'store_city_3', 'store_city_2', 'store_state_2',
       'store_state_1', 'store_city_1', 'video_store', 'store_city_4',
       'florist', 'sales_country_0', 'sales_country_1', 'coffee_bar',
       'store_city_0', 'prepared_food', 'salad_bar', 'store_state_0',
       'profile_strength', 'total_children', 'education', 'avg. yearly_income',
       'avg_cars_at home(approx)'],
      dtype='object')

In [54]:
feature_select_df.to_csv('cac_dataset/feature_select_dataset.csv', sep='\t', encoding='utf-8', index=False, header=True)

In [55]:
import pickle

In [56]:
with open('models/model_pkl', 'wb') as files:
    pickle.dump(model_rf, files)

In [None]:
train, test = train_test_split(data, test_size=0.15, shuffle=True, random_state=42)
X_train = train.drop("cost", axis=1)
X_test = test.drop("cost", axis=1)
y_train = train["cost"]
y_test = test["cost"]
X_train, binary_encoder, scaler = preprocess_train(X_train)

In [63]:
with open('models/binary_encoder', 'wb') as files:
    pickle.dump(binary_encoder, files)
with open('models/scaler', 'wb') as files:
    pickle.dump(scaler, files)