In [None]:
import numpy as np
import pandas as pd

from statistics import mean

import matplotlib as pyplot
import seaborn as sns

import matplotlib.pyplot as plt; plt.rcdefaults()
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score

from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

In [None]:
def getCountVar(d: pd.DataFrame, column: str) -> np.ndarray:
    """Return, for every row of ``d``, how many rows share its value in ``column``.

    Used to derive a frequency feature from an identifier-like column.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column whose values are counted.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``d``, in row order. Rows whose value in
        ``column`` is missing get NaN (the result is then float); otherwise
        the counts are integers.
    """
    # Count each distinct non-null value once, then broadcast the counts back
    # onto the rows. Series.map leaves NaN for keys that have no count, whereas
    # a label lookup (counts[d[column]]) raises KeyError on a missing key.
    occurrences = d[column].value_counts()
    return d[column].map(occurrences).to_numpy()

def getPurchasesAvg(d: pd.DataFrame, column: str) -> np.ndarray:
    """Return, for every row of ``d``, the mean ``Purchase`` of its ``column`` group.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame containing ``column`` and a ``Purchase`` column.
    column : str
        Name of the column that defines the groups.

    Returns
    -------
    numpy.ndarray
        One entry per row of ``d``, in row order: the average ``Purchase``
        over all rows that share the row's value in ``column``. Rows whose
        value in ``column`` is missing get NaN.
    """
    # Per-group mean of the Purchase column, indexed by the distinct keys.
    group_means = d.groupby(column)['Purchase'].mean()
    # Broadcast the means back onto the rows. Series.map leaves NaN for keys
    # without a group, whereas a label lookup (group_means[d[column]]) raises
    # KeyError on a missing key.
    return d[column].map(group_means).to_numpy()


In [None]:
# Load the dataset from the CSV file next to the notebook.
d = pd.read_csv('./data.csv')

# Column dtypes and non-null counts; the author's observation here is that
# product categories 2 and 3 contain null values.
d.info()

# Preview the first 5 rows. This has to be the last expression of the cell:
# a notebook only renders the value of the final expression, so a d.head()
# call placed before d.info() produces no output at all.
d.head()


In [None]:
#___________________Fix Raw Data & remove null values_________________

# Product_Category_3 is mostly empty (the author reports roughly 70% null
# values from the d.info() output), so the whole column is dropped.
d = d.drop(columns=['Product_Category_3'])

# Product_Category_2 has fewer gaps (reported as roughly 30% null values);
# the missing entries are filled with the column mean.
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on d['Product_Category_2'] is chained assignment:
# it operates on an intermediate object, emits a FutureWarning on pandas 2.x,
# and leaves d unchanged once Copy-on-Write is enabled.
d['Product_Category_2'] = d['Product_Category_2'].fillna(d['Product_Category_2'].mean())

# Check that the update took effect (no null values should remain).
d.info()

In [None]:
# One-hot encode the categorical columns listed here.
columns_to_encode = [
    'Gender',
    'Age',
    'Occupation',
    'City_Category',
    'Stay_In_Current_City_Years',
]

# dummy_na=True adds a NaN indicator column for each encoded column;
# drop_first=True drops the first level of each column.
d = pd.get_dummies(d, columns=columns_to_encode, dummy_na=True, drop_first=True)

# Product_ID is not numerical, so each distinct ID is replaced by an ordinal code.
# The encoder expects 2-D input, hence the single-column frame d[['Product_ID']].
enc = OrdinalEncoder()
d['Product_ID'] = enc.fit_transform(d[['Product_ID']])


In [None]:
# We try creating some new columns.
# These new variables give a big improvement.
# For each identifier column add two per-row features:
#   <key>_Count - number of rows sharing the row's identifier
#   <key>_Avg   - mean Purchase over the rows sharing the row's identifier
# NOTE(review): the _Avg columns are computed from the Purchase column over
# all rows, so they carry target information into any later validation split
# unless they are recomputed from the training rows only.
for key in ('Product_ID', 'User_ID'):
    d[key + '_Count'] = getCountVar(d, key)
    d[key + '_Avg'] = getPurchasesAvg(d, key)

# Optional: drop the raw identifiers once the derived features exist.
#d = d.drop('Product_ID', axis=1)
#d = d.drop('User_ID', axis=1)

In [None]:
y = d['Purchase']
X = d.drop('Purchase', axis=1)

# Swap in linear_model.LinearRegression() here to compare against a linear model.
# random_state pins the tree's internal feature shuffling so reruns give the
# same score.
m = DecisionTreeRegressor(max_depth=10, min_samples_leaf=200, random_state=0)

# Features built from the target (mean Purchase per key), mapped to the key
# column they are grouped by. Computed once over all rows, they contain the
# targets of the validation rows, which makes the cross-validated error look
# better than it is. They are therefore recomputed inside every fold from the
# training rows only.
target_mean_features = {
    'Product_ID_Avg': 'Product_ID',
    'User_ID_Avg': 'User_ID',
}
target_mean_features = {
    feature: key
    for feature, key in target_mean_features.items()
    if feature in X.columns and key in X.columns
}

n_folds = 5
fold_scores = []
# np.array_split yields contiguous, near-equal blocks of row positions, i.e.
# the same partition as the unshuffled 5-fold split used by cv=5.
for test_positions in np.array_split(np.arange(len(X)), n_folds):
    is_test = np.zeros(len(X), dtype=bool)
    is_test[test_positions] = True

    X_train, y_train = X[~is_test].copy(), y[~is_test]
    X_test, y_test = X[is_test].copy(), y[is_test]

    for feature, key in target_mean_features.items():
        fold_means = y_train.groupby(X_train[key]).mean()
        X_train[feature] = X_train[key].map(fold_means)
        # Keys that never occur in the training rows have no group mean;
        # fall back to the overall mean of the training targets.
        X_test[feature] = X_test[key].map(fold_means).fillna(y_train.mean())

    # Refitting resets the estimator, so every fold starts from scratch.
    # Note that m ends up fitted on the last fold's training rows.
    m.fit(X_train, y_train)
    squared_errors = (y_test - m.predict(X_test)) ** 2
    fold_scores.append(-squared_errors.mean())

# Same convention as scoring='neg_mean_squared_error': one negated MSE per fold.
scores = np.array(fold_scores)

# Average RMSE across the folds.
print(mean(np.sqrt(-1 * scores)))
