In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint

In [2]:
training_data = pd.read_csv('training_v5.csv')
training_data

Unnamed: 0,shop_id,date_block_num,year,month,item_id,item_category_id,item_cnt_mnthly_mean,item_cnt_prev_3_mnth_mean,avg_item_price_3mnth_ago,item_cnt_mnth
0,0,0,2013,1,32,40,8.000000,0.000000,0.000000,6.0
1,0,0,2013,1,33,37,3.000000,0.000000,0.000000,3.0
2,0,0,2013,1,35,40,7.500000,0.000000,0.000000,1.0
3,0,0,2013,1,43,40,1.000000,0.000000,0.000000,1.0
4,0,0,2013,1,51,57,2.500000,0.000000,0.000000,2.0
...,...,...,...,...,...,...,...,...,...,...
1609119,59,33,2015,10,22087,83,6.424242,3.333333,119.000000,6.0
1609120,59,33,2015,10,22088,83,6.187500,5.000000,119.000000,2.0
1609121,59,33,2015,10,22091,83,2.294118,1.000000,59.666667,1.0
1609122,59,33,2015,10,22100,42,1.000000,0.333333,209.666667,1.0


In [3]:
X = training_data.iloc[:,0:9].values
y = training_data.iloc[:,9].values
y = y.clip(0.,)
y = np.log1p(y)
feature_list = [c for c in training_data.columns if c not in 'item_cnt_month']
len(feature_list)

9

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [18]:
rand_forest = RandomForestRegressor(bootstrap=True,n_estimators=200, n_jobs=-1, random_state=42)
rand_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [19]:
importances = list(rand_forest.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, 
                       importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: item_cnt_prev_3_mnth_mean Importance: 0.55
Variable: item_category_id     Importance: 0.11
Variable: item_cnt_mnth        Importance: 0.08
Variable: shop_id              Importance: 0.06
Variable: avg_item_price_3mnth_ago Importance: 0.06
Variable: item_cnt_mnthly_mean Importance: 0.05
Variable: date_block_num       Importance: 0.04
Variable: item_id              Importance: 0.03
Variable: year                 Importance: 0.01


In [20]:
y_pred = rand_forest.predict(X_test)

In [21]:
RMSE = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("RMSE:", RMSE)

RMSE: 0.29285993877409416


# Submission

In [13]:
test_data = pd.read_csv('test_processed_V2.csv')
test_data

Unnamed: 0,ID,shop_id,date_block_num,year,month,item_id,item_category_id,item_cnt_mnthly_mean,item_cnt_prev_3_mnth_mean,avg_item_price_3mnth_ago
0,0,5,34,2015,11,5037,19,1.444444,1.333333,499.500000
1,1,5,34,2015,11,5320,55,0.000000,0.000000,0.000000
2,2,5,34,2015,11,5233,19,2.000000,1.666667,799.000000
3,3,5,34,2015,11,5232,23,1.000000,0.333333,199.666667
4,4,5,34,2015,11,5268,20,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,34,2015,11,18454,55,1.800000,0.333333,33.000000
214196,214196,45,34,2015,11,16188,64,0.000000,0.000000,0.000000
214197,214197,45,34,2015,11,15757,55,1.166667,0.000000,0.000000
214198,214198,45,34,2015,11,19648,40,0.000000,0.000000,0.000000


In [14]:
input_features = test_data.iloc[:,1:10].values

In [15]:
test_data['item_cnt_month'] = rand_forest.predict(input_features)

In [16]:
test_data[['ID','item_cnt_month']].to_csv('my_submission_v2.csv', index=False)

In [17]:
test_data

Unnamed: 0,ID,shop_id,date_block_num,year,month,item_id,item_category_id,item_cnt_mnthly_mean,item_cnt_prev_3_mnth_mean,avg_item_price_3mnth_ago,item_cnt_month
0,0,5,34,2015,11,5037,19,1.444444,1.333333,499.500000,0.837145
1,1,5,34,2015,11,5320,55,0.000000,0.000000,0.000000,0.214876
2,2,5,34,2015,11,5233,19,2.000000,1.666667,799.000000,0.774761
3,3,5,34,2015,11,5232,23,1.000000,0.333333,199.666667,0.693147
4,4,5,34,2015,11,5268,20,0.000000,0.000000,0.000000,0.256464
...,...,...,...,...,...,...,...,...,...,...,...
214195,214195,45,34,2015,11,18454,55,1.800000,0.333333,33.000000,0.766131
214196,214196,45,34,2015,11,16188,64,0.000000,0.000000,0.000000,0.055452
214197,214197,45,34,2015,11,15757,55,1.166667,0.000000,0.000000,0.713420
214198,214198,45,34,2015,11,19648,40,0.000000,0.000000,0.000000,0.090109
