In [5]:
# Loading Data
import pandas as pd
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [2]:
train.head()

Unnamed: 0,id,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost
0,0,8.61,3.0,2.0,2.0,2.0,10.3,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0,62.09
1,1,5.0,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0,121.8
2,2,14.08,4.0,0.0,0.0,3.0,21.3,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0,83.51
3,3,4.02,3.0,5.0,0.0,0.0,14.8,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0,66.78
4,4,2.13,3.0,5.0,0.0,3.0,17.0,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0,111.51


In [6]:
x_test_un_split = train.iloc[:, 1:-1]
x_test_un_split.head()

Unnamed: 0,store_sales(in millions),unit_sales(in millions),total_children,num_children_at_home,avg_cars_at home(approx).1,gross_weight,recyclable_package,low_fat,units_per_case,store_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist
0,8.61,3.0,2.0,2.0,2.0,10.3,1.0,0.0,32.0,36509.0,0.0,0.0,0.0,0.0,0.0
1,5.0,2.0,4.0,0.0,3.0,6.66,1.0,0.0,1.0,28206.0,1.0,0.0,0.0,0.0,0.0
2,14.08,4.0,0.0,0.0,3.0,21.3,1.0,0.0,26.0,21215.0,1.0,0.0,0.0,0.0,0.0
3,4.02,3.0,5.0,0.0,0.0,14.8,0.0,1.0,36.0,21215.0,1.0,0.0,0.0,0.0,0.0
4,2.13,3.0,5.0,0.0,3.0,17.0,1.0,1.0,20.0,27694.0,1.0,1.0,1.0,1.0,1.0


In [7]:
y_train_un_split = train["cost"]

In [5]:
y_train_un_split.head()

0     62.09
1    121.80
2     83.51
3     66.78
4    111.51
Name: cost, dtype: float64

# Handling Missing Values

In [6]:
# Check for missing values 
train.isnull().sum()

id                            0
store_sales(in millions)      0
unit_sales(in millions)       0
total_children                0
num_children_at_home          0
avg_cars_at home(approx).1    0
gross_weight                  0
recyclable_package            0
low_fat                       0
units_per_case                0
store_sqft                    0
coffee_bar                    0
video_store                   0
salad_bar                     0
prepared_food                 0
florist                       0
cost                          0
dtype: int64

No missing values.

# Train test split

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_test_un_split, y_train_un_split, test_size=0.33, random_state=42)

# Modelling

## Simple KNN Model

In [13]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

def train_knn(x_train, y_train, x_test, y_test, k):
  knn_regressor = KNeighborsRegressor(n_neighbors=k)
  scores = cross_val_score(knn_regressor, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
  print("Mean Squared Error of cross validation: ", scores.mean())
  knn_regressor.fit(x_train, y_train)
  y_pred = knn_regressor.predict(x_test)
  print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
  

In [11]:
train_knn(x_train, y_train, x_valid, y_valid, 13)

Mean Squared Error of cross validation:  -899.0341088710829
Mean Squared Error:  897.1487814548736


## XGBoost Model

In [14]:
from xgboost import XGBRegressor

def train_xgb(x_train, y_train, x_test, y_test):
  xgb_regressor = XGBRegressor(tree_method='gpu_hist')
  scores = cross_val_score(xgb_regressor, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
  print("Mean Squared Error of cross validation: ", scores.mean())
  xgb_regressor.fit(x_train, y_train)
  y_pred = xgb_regressor.predict(x_test)
  print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))

In [21]:
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

In [14]:
train_xgb(x_train, y_train, x_valid, y_valid)

Mean Squared Error of cross validation:  -806.6147011510297
Mean Squared Error:  798.5898291308797


## CatBoost Model

In [15]:
from catboost import CatBoostRegressor

In [16]:
def train_cat(x_train, y_train, x_test, y_test):
  cat_regressor = CatBoostRegressor(task_type='GPU')
  scores = cross_val_score(cat_regressor, x_train, y_train, cv=5, scoring='neg_mean_squared_error')
  print("Mean Squared Error of cross validation: ", scores.mean())
  cat_regressor.fit(x_train, y_train)
  y_pred = cat_regressor.predict(x_test)
  print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))

In [17]:
train_cat(x_train, y_train, x_valid, y_valid)

Learning rate set to 0.080494
0:	learn: 29.8287959	total: 32.4ms	remaining: 32.3s
1:	learn: 29.7397918	total: 77.3ms	remaining: 38.6s
2:	learn: 29.6557236	total: 113ms	remaining: 37.5s
3:	learn: 29.5790508	total: 140ms	remaining: 34.7s
4:	learn: 29.5170389	total: 167ms	remaining: 33.3s
5:	learn: 29.4608696	total: 197ms	remaining: 32.6s
6:	learn: 29.4153560	total: 222ms	remaining: 31.4s
7:	learn: 29.3763262	total: 249ms	remaining: 30.9s
8:	learn: 29.3405989	total: 276ms	remaining: 30.4s
9:	learn: 29.3087982	total: 304ms	remaining: 30s
10:	learn: 29.2769106	total: 335ms	remaining: 30.1s
11:	learn: 29.2538135	total: 361ms	remaining: 29.7s
12:	learn: 29.2307407	total: 389ms	remaining: 29.6s
13:	learn: 29.2114046	total: 415ms	remaining: 29.2s
14:	learn: 29.1956184	total: 444ms	remaining: 29.1s
15:	learn: 29.1751006	total: 471ms	remaining: 29s
16:	learn: 29.1603658	total: 499ms	remaining: 28.8s
17:	learn: 29.1417790	total: 523ms	remaining: 28.5s
18:	learn: 29.1252625	total: 557ms	remaining: 

Mean Squared Error of cross validation:  -806.948423545891

Mean Squared Error:  801.2314472178098

In [18]:
# Using cat boost model to predict
cat_regressor = CatBoostRegressor(task_type='GPU')
cat_regressor.fit(x_test_un_split, y_train_un_split)
y_pred = cat_regressor.predict(test.iloc[:, 1:])

Learning rate set to 0.087346
0:	learn: 29.8144924	total: 36.1ms	remaining: 36.1s
1:	learn: 29.7127419	total: 71.2ms	remaining: 35.5s
2:	learn: 29.6169798	total: 106ms	remaining: 35.2s
3:	learn: 29.5369877	total: 142ms	remaining: 35.4s
4:	learn: 29.4701655	total: 178ms	remaining: 35.4s
5:	learn: 29.4166633	total: 222ms	remaining: 36.8s
6:	learn: 29.3726435	total: 261ms	remaining: 37s
7:	learn: 29.3335624	total: 310ms	remaining: 38.5s
8:	learn: 29.2979591	total: 355ms	remaining: 39.1s
9:	learn: 29.2611411	total: 397ms	remaining: 39.3s
10:	learn: 29.2325637	total: 450ms	remaining: 40.4s
11:	learn: 29.2063469	total: 535ms	remaining: 44.1s
12:	learn: 29.1820801	total: 713ms	remaining: 54.1s
13:	learn: 29.1625410	total: 785ms	remaining: 55.3s
14:	learn: 29.1448354	total: 839ms	remaining: 55.1s
15:	learn: 29.1254527	total: 894ms	remaining: 55s
16:	learn: 29.1115501	total: 938ms	remaining: 54.3s
17:	learn: 29.0973006	total: 992ms	remaining: 54.1s
18:	learn: 29.0867844	total: 1.04s	remaining: 

In [19]:
# Write the predictions to a csv file with two columns: id and cost
submission = pd.DataFrame({'id':test['id'], 'cost':y_pred})
submission.to_csv('results/submission_01.csv', index=False)

# Feature Selection with PCA

In [18]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 0.95, svd_solver = 'full')
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test_un_split)
x_valid_pca = pca.transform(x_valid)

In [19]:
# KNN Model
train_knn(x_train_pca, y_train, x_valid_pca, y_valid, 13)

Mean Squared Error of cross validation:  -911.7936949572565
Mean Squared Error:  910.0872677749293


In [22]:
# XGBoost Model
train_xgb(x_train_pca, y_train, x_valid_pca, y_valid)

Mean Squared Error of cross validation:  -846.9606274969035
Mean Squared Error:  845.4218069925455


In [23]:
# CatBoost Model
train_cat(x_train_pca, y_train, x_valid_pca, y_valid)

Learning rate set to 0.080494
0:	learn: 29.8383217	total: 27.6ms	remaining: 27.6s
1:	learn: 29.7506130	total: 52ms	remaining: 25.9s
2:	learn: 29.6807856	total: 76.8ms	remaining: 25.5s
3:	learn: 29.6215500	total: 101ms	remaining: 25.3s
4:	learn: 29.5638139	total: 124ms	remaining: 24.7s
5:	learn: 29.5206129	total: 150ms	remaining: 24.8s
6:	learn: 29.4759278	total: 171ms	remaining: 24.3s
7:	learn: 29.4408350	total: 192ms	remaining: 23.8s
8:	learn: 29.4114003	total: 218ms	remaining: 24s
9:	learn: 29.3814511	total: 240ms	remaining: 23.7s
10:	learn: 29.3557372	total: 262ms	remaining: 23.5s
11:	learn: 29.3360161	total: 284ms	remaining: 23.4s
12:	learn: 29.3162395	total: 306ms	remaining: 23.2s
13:	learn: 29.2995781	total: 329ms	remaining: 23.1s
14:	learn: 29.2868026	total: 354ms	remaining: 23.3s
15:	learn: 29.2723278	total: 381ms	remaining: 23.4s
16:	learn: 29.2600839	total: 404ms	remaining: 23.4s
17:	learn: 29.2469682	total: 426ms	remaining: 23.2s
18:	learn: 29.2389725	total: 450ms	remaining:

Mean Squared Error of cross validation:  -846.677319247969

Mean Squared Error:  845.3226852187273

In [24]:
pca = PCA(n_components = 0.95, svd_solver = 'full')
x_train_pca = pca.fit_transform(x_test_un_split)
x_test_pca = pca.transform(test.iloc[:, 1:])

In [26]:
# Using cat boost model to predict
cat_regressor = CatBoostRegressor(task_type='GPU')
cat_regressor.fit(x_train_pca, y_train_un_split)
y_pred = cat_regressor.predict(x_test_pca)

Learning rate set to 0.087346
0:	learn: 29.8370796	total: 62.5ms	remaining: 1m 2s
1:	learn: 29.7379374	total: 96ms	remaining: 47.9s
2:	learn: 29.6550944	total: 129ms	remaining: 42.9s
3:	learn: 29.5914952	total: 163ms	remaining: 40.5s
4:	learn: 29.5366690	total: 196ms	remaining: 39.1s
5:	learn: 29.4910561	total: 229ms	remaining: 38s
6:	learn: 29.4441859	total: 262ms	remaining: 37.2s
7:	learn: 29.4100074	total: 294ms	remaining: 36.5s
8:	learn: 29.3816882	total: 326ms	remaining: 35.9s
9:	learn: 29.3510226	total: 360ms	remaining: 35.6s
10:	learn: 29.3251070	total: 394ms	remaining: 35.4s
11:	learn: 29.3057390	total: 426ms	remaining: 35.1s
12:	learn: 29.2864204	total: 457ms	remaining: 34.7s
13:	learn: 29.2694981	total: 488ms	remaining: 34.4s
14:	learn: 29.2580923	total: 519ms	remaining: 34.1s
15:	learn: 29.2472605	total: 549ms	remaining: 33.8s
16:	learn: 29.2333794	total: 582ms	remaining: 33.7s
17:	learn: 29.2197683	total: 616ms	remaining: 33.6s
18:	learn: 29.2086866	total: 648ms	remaining: 

In [28]:
submission = pd.DataFrame({'id':test['id'], 'cost':y_pred})
submission.to_csv('results/submission_02.csv', index=False)

Original model without PCA is better.