In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Read both CSV files; df_train is also needed later for its row count.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Stack the train rows on top of the test rows so both receive the same
# preprocessing, drop the identifier columns and replace missing values by 0.
df = (
    pd.concat([df_train, df_test])
    .drop(columns=['User_ID', 'Product_ID'])
    .fillna(0)
)
print(df.isna().sum())

# These two columns are discarded only after the null check has been printed.
df = df.drop(columns=['Stay_In_Current_City_Years', 'Marital_Status'])
print(df.shape)

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64
(783667, 8)


In [2]:
# Number of rows that came from train.csv; they sit at the top of `df`
# because the train frame was concatenated first.
train_length = df_train.shape[0]
print(train_length)

# Copy the target column into a NumPy array and keep only the training rows.
real_y = np.array(df['Purchase'])[:train_length]
print(real_y.shape)

# The target must not stay among the feature columns.
df = df.drop(columns=['Purchase'])
print(df.shape)

550068
(550068,)
(783667, 7)


In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Columns that are one-hot encoded; any column not listed here is passed
# through unchanged by the transformer.
categorical_columns = [
    'Age',
    'Gender',
    'Occupation',
    'City_Category',
    'Product_Category_1',
    'Product_Category_2',
    'Product_Category_3',
]
column_trans = make_column_transformer(
    (OneHotEncoder(), categorical_columns),
    remainder='passthrough',
)
real_x = column_trans.fit_transform(df)

print(real_x.shape)

# Convert the transformer output to a dense ndarray before it is sliced
# and handed to the models.
real_x = real_x.toarray()

print(real_x)

(783667, 87)
[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise the target as a single-column 2-D array. The fitted scaler is
# kept in `sc` so predictions can be mapped back to the original scale later.
sc = StandardScaler()
real_y = sc.fit_transform(real_y.reshape(-1, 1))
print(real_y)

# Split the encoded matrix apart again: the first train_length rows are the
# training rows, everything after them belongs to the test file.
test_x, real_x = real_x[train_length:, :], real_x[:train_length, :]

[[-0.1779729 ]
 [ 1.1817558 ]
 [-1.56119326]
 ...
 [-1.81701338]
 [-1.77162273]
 [-1.7467375 ]]


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation, with a fixed seed.
train_x, cv_x, train_y, cv_y = train_test_split(
    real_x,
    real_y,
    test_size=0.2,
    random_state=0,
)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# Flatten both target arrays to 1-D vectors with one entry per row.
train_y = np.reshape(train_y, len(train_y))
cv_y = np.reshape(cv_y, len(cv_y))

In [7]:
# Disabled experiment: the RandomForestRegressor fit below is wrapped in a
# string literal, so nothing is trained here; the cell only evaluates to
# (and displays) that string.
'''model = RandomForestRegressor(n_estimators=3000, max_depth=6, oob_score=True, n_jobs=6, random_state=0, min_samples_split=10, min_samples_leaf=10)
model.fit(train_x, train_y)'''

'model = RandomForestRegressor(n_estimators=3000, max_depth=6, oob_score=True, n_jobs=6, random_state=0, min_samples_split=10, min_samples_leaf=10)\nmodel.fit(train_x, train_y)'

In [8]:
# Disabled experiment: the ExtraTreesRegressor fit below is wrapped in a
# string literal, so nothing is trained here; the cell only evaluates to
# (and displays) that string.
'''model = ExtraTreesRegressor(n_estimators=3000, 
                              max_depth=8,
                              min_samples_split=10, 
                              min_samples_leaf=10, 
                              oob_score=True, 
                              n_jobs=6, 
                              random_state=123, 
                              verbose=1, 
                              bootstrap=True)
model.fit(train_x, train_y)'''

'model = ExtraTreesRegressor(n_estimators=3000, \n                              max_depth=8,\n                              min_samples_split=10, \n                              min_samples_leaf=10, \n                              oob_score=True, \n                              n_jobs=6, \n                              random_state=123, \n                              verbose=1, \n                              bootstrap=True)\nmodel.fit(train_x, train_y)'

In [9]:
# Baseline model: XGBRegressor with no arguments, fitted on the training split.
# The fit call stays the last expression so the cell still displays the model.
model = XGBRegressor()
model.fit(X=train_x, y=train_y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [10]:
# Candidate values for each hyper-parameter explored by the randomized search.
# They are kept as plain lists and listed in the same order as the grid keys.
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.1, 0.15, 0.2]
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']

In [23]:
# Search space for the randomized search, keyed by estimator parameter name.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
)

In [24]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over hyperparameter_grid: 50 sampled candidates, each
# evaluated with 5-fold cross-validation and scored by negative mean
# absolute error. The seed is fixed so the sampled candidates are repeatable.
random_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
    n_jobs=4,
    verbose=5,
    random_state=0,
)

In [None]:
# Run the hyper-parameter search on the full training matrix.
# `real_y` is still the (n, 1) column produced for the scaler, whereas the
# baseline fit in this notebook receives a flattened target; pass a 1-D
# vector here as well so every estimator sees the same target shape.
random_cv.fit(real_x, real_y.ravel())

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Display the estimator that the randomized search selected as best.
random_cv.best_estimator_

In [None]:
# Predict the hold-out features with `model`.
# NOTE(review): `model` is the estimator created with XGBRegressor() and fitted
# on train_x/train_y; the result of random_cv is not used here. Presumably the
# tuned random_cv.best_estimator_ was intended - confirm.
pred_y = model.predict(cv_x)

In [None]:
# Map the standardised predictions and hold-out targets back to the original
# target scale.
# `sc` was fitted on a single-column 2-D array, while both vectors here are
# 1-D. Current scikit-learn scalers validate their input as 2-D and raise
# "Expected 2D array, got 1D array instead" otherwise, so reshape to a column
# for the scaler and flatten afterwards to keep 1-D vectors downstream.
pred_y = sc.inverse_transform(np.reshape(pred_y, (-1, 1))).ravel()
act_y = sc.inverse_transform(np.reshape(cv_y, (-1, 1))).ravel()

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between act_y and pred_y.
mse = mean_squared_error(act_y, pred_y)
error = np.sqrt(mse)

In [None]:
# Report the root-mean-squared error computed from act_y and pred_y.
print(error)

In [None]:
# Display the inverse-transformed predictions for a visual check.
pred_y

In [None]:
# Display the inverse-transformed hold-out targets for comparison with pred_y.
act_y

In [None]:
# Predict the test rows, then map the predictions back to the original
# target scale.
# `sc` was fitted on a single-column 2-D array and the model returns a 1-D
# vector. Current scikit-learn scalers raise "Expected 2D array, got 1D array
# instead" on 1-D input, so reshape to a column for the scaler and flatten
# afterwards so the result can be assigned directly as a DataFrame column.
predictions = model.predict(test_x)
predictions = sc.inverse_transform(np.reshape(predictions, (-1, 1))).ravel()

In [None]:
# Start from the sample submission file and set its Purchase column to the
# model predictions; the frame is displayed as the cell result.
submit = pd.read_csv("sample_submission_V9Inaty.csv").assign(Purchase=predictions)
submit

In [None]:
# Write the submission file. `submit` was read with the default integer index,
# so writing that index would add an extra unnamed first column that the
# sample submission file does not have; index=False keeps the columns as read.
submit.to_csv("Submission_2.csv", index=False)