In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error

import time

from sklearn.ensemble import RandomForestRegressor


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

## LabelEncoder + handle unknowns

This is an extended version of the `sklearn.preprocessing.LabelEncoder` class that maps labels not seen during fitting to an extra `Unknown` class.

In [None]:
class LabelEncoderExt(object):
    """LabelEncoder wrapper that tolerates values it was not fitted on.

    An extra 'Unknown' class is registered during ``fit``; ``transform``
    maps every value that is not one of the fitted classes to that class
    instead of raising. Intended for string labels.
    """

    def __init__(self):
        """Create the wrapped (not yet fitted) LabelEncoder."""
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        """
        Fit the encoder on all unique values plus the extra 'Unknown' class.

        :param data_list: iterable of labels (e.g. a list or a pandas Series of strings)
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        Encode data_list, assigning the 'Unknown' class id to unseen values.

        :param data_list: iterable of labels to encode
        :return: whatever the wrapped encoder's ``transform`` returns for the
                 cleaned labels (one id per input element)
        """
        # Build the lookup once: set membership keeps this a single O(n) pass,
        # instead of rebuilding the whole list once per unseen value.
        known_classes = set(self.label_encoder.classes_)
        # Membership is tested on the original elements (not on a numpy-coerced
        # copy), so unseen values of any hashable type are replaced.
        new_data_list = [x if x in known_classes else 'Unknown' for x in data_list]

        return self.label_encoder.transform(new_data_list)

## Train dataset

This is the train set - input for our model to be trained

In [None]:
df = pd.read_csv(r'/kaggle/input/black-friday-sales-prediction/train.csv')
df.head()

In [None]:
df2 = df.copy()

## Data Pre-processing

Processing and cleaning the data before using it for training the model.

1. Data Imputation
2. Data Cleaning
3. One-Hot encoding - using pd.get_dummies()


## Handling Missing Value

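IterativeImputer is an experimental feature of scikit-learn.
It models each column that has missing values as a function of the other columns and uses those estimates to populate the missing cells.
This is a smarter way to fill missing values than writing one constant into every empty cell, because each estimate takes the rest of the row into account.

Note: in this notebook the imputer is only instantiated; `DataCleaning` below fills the missing `Product_Category_2` and `Product_Category_3` values with 0.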

In [None]:
ii = IterativeImputer(random_state=0)

In [None]:
def DataCleaning(df):
    """Clean a raw Black Friday frame and one-hot encode its categorical columns.

    Steps:
      * missing Product_Category_2 / Product_Category_3 values become 0 and the
        columns are cast to int;
      * Gender is encoded as 1 for 'M' and 0 for anything else;
      * Age, Occupation, City_Category, Stay_In_Current_City_Years and the three
        Product_Category_x columns are replaced by indicator columns.

    :param df: raw DataFrame containing the columns named above
    :return: a new cleaned DataFrame; the input frame is left untouched
    """
    # Work on a copy so re-running the cell (or reusing the raw frame) is safe.
    df = df.copy()

    # Plain assignment instead of `df[col].fillna(..., inplace=True)`: the chained
    # in-place form warns on pandas 2.x and has no effect under Copy-on-Write.
    for col in ('Product_Category_2', 'Product_Category_3'):
        df[col] = df[col].fillna(0).astype(int)

    df['Gender'] = np.where(df['Gender'] == 'M', 1, 0)

    # dtype=int keeps the indicators numeric 0/1 on every pandas version; with the
    # bool default of pandas >= 2, adding indicator columns would be a logical OR.
    df = pd.get_dummies(df, columns=['Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years',
                                     'Product_Category_1', 'Product_Category_2', 'Product_Category_3'],
                        dtype=int)

    return df

In [None]:
df = DataCleaning(df)

In [None]:
df.columns

## Handling Product_Category_x fields

### Some of my Observations

* If you select a specific Product_ID and look up its other occurrences in the dataset, you will notice that every row has the same values in Product_Category_1, 2, and 3.
* Product_Category_1 is filled first. Product_Category_2 is used only if Product_Category_1 is filled and more room is needed; the same holds for Product_Category_3. In other words, Product_Category_3 is never filled while Product_Category_2 or Product_Category_1 is empty.
* The values are masked and represented numerically, since we do not need to know the actual categories.

### Understanding through an analogy

Consider a user-details dataset whose contact-details section has three mobile-number fields.
- Many users have only one mobile number, so the other two fields stay null.
- If required, a single user ID can have more than one mobile number. That is where the other two fields come into the picture.
- Mobile_Number_3 is only used if the Mobile_Number_2 field is already populated.


In [None]:
# Add all-zero indicator columns for the (slot, category) pairs listed below, so
# that every slot exposes categories 1..20.
# NOTE(review): presumably these are the pairs get_dummies did not produce for the
# train data — confirm against df.columns.
for absent_col in ('Product_Category_2_1', 'Product_Category_2_19', 'Product_Category_2_20',
                   'Product_Category_3_1', 'Product_Category_3_2', 'Product_Category_3_7',
                   'Product_Category_3_19', 'Product_Category_3_20'):
    df[absent_col] = 0

# The *_0 indicators only flag the 0 placeholder written for missing values in
# DataCleaning, so they are discarded.
df = df.drop(columns=['Product_Category_2_0', 'Product_Category_3_0'])

In [None]:
# Category ids '1'..'20' as strings, used as column-name suffixes.
myL = tuple(str(category_id) for category_id in range(1, 21))

In [None]:
# Names of all 60 per-slot indicator columns: slot 1, 2, 3 x category 1..20,
# ordered slot by slot.
myL2 = ['Product_Category_{}_{}'.format(slot, category_id)
        for slot in (1, 2, 3)
        for category_id in range(1, 21)]

In [None]:
df.columns

In [None]:
df4=pd.DataFrame()

In [None]:
# For every category id, add up its indicator from the three Product_Category slots
# and store the result in df4 as a single 'Product_Category_<id>' column.
for category_id in myL:
    slot_1, slot_2, slot_3 = (df['Product_Category_%d_%s' % (slot, category_id)] for slot in (1, 2, 3))
    df4['Product_Category_' + category_id] = slot_1 + slot_2 + slot_3

In [None]:
df4

In [None]:
df = df.drop(myL2, axis=1)
df

In [None]:
df = df.merge(df4, left_index=True, right_index=True)
df

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Fit the unknown-tolerant encoder on the train Product_IDs (fit returns the encoder
# itself) and replace the column with the integer codes. `label` is kept so the
# same mapping can be applied to other data later.
label = LabelEncoderExt().fit(df['Product_ID'])
df['Product_ID'] = label.transform(df['Product_ID'])

In [None]:
df.head()

## Heatmap

... to show correlation factor

In [None]:
# Pairwise correlation of all columns, drawn as a heatmap on a large canvas.
correlations = df.corr()
plt.figure(figsize=(20, 15))
sns.heatmap(correlations, cmap='cubehelix')

## Train-Test split

In [None]:
# Target vector.
y = df['Purchase'].values

# Feature columns, assembled group by group: identifiers and binary fields,
# then the one-hot groups, then the 20 combined product-category indicators.
age_bands = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
stay_years = ['0', '1', '2', '3', '4+']

features = (
    ['User_ID', 'Product_ID', 'Gender', 'Marital_Status']
    + ['Age_' + band for band in age_bands]
    + ['Occupation_%d' % code for code in range(21)]
    + ['City_Category_' + city for city in ('A', 'B', 'C')]
    + ['Stay_In_Current_City_Years_' + years for years in stay_years]
    + ['Product_Category_%d' % category_id for category_id in range(1, 21)]
)

# Feature matrix as a plain numpy array, columns in the order of `features`.
X = df[features].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Scaling Down

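`sklearn.preprocessing.StandardScaler` standardizes each feature to zero mean and unit variance, so columns with large raw values (such as `User_ID`) end up on the same scale as the 0/1 indicator columns. Keeping all features in a small numeric range can also help numerical computations on large matrices.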

In [None]:
sc = StandardScaler()
# Fit on the training split only: fitting on the full X would let the held-out
# rows influence the mean/variance used for scaling (data leakage).
scaler = sc.fit(X_train)
# NOTE: this cell overwrites X_train / X_test with their scaled versions, so it
# must be run exactly once after the train/test split.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## RandomizedSearchCV

Using RandomizedSearchCV to tune the hyperparameters effectively and choose the best estimator configuration.

In [None]:
# Candidate values for the randomized hyper-parameter search.
# Only the three settings below are searched; max_features ('auto' was noted as
# best), max_leaf_nodes, max_depth and bootstrap were considered earlier and are
# not part of the grid.
n_estimators = [500, 800, 1300]        # number of trees in the forest
min_samples_split = [40, 60, 100]      # minimum samples required to split a node
min_samples_leaf = [8, 10, 15]         # minimum samples required at a leaf node

random_grid = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

#### Base Estimator configuration

In [None]:
# Base estimator handed to the hyper-parameter search: a seeded random forest
# capped at 5000 leaf nodes per tree, trained on 4 parallel jobs with progress
# logging switched on. All other settings stay at their defaults.
rfr = RandomForestRegressor(
    max_leaf_nodes=5000,
    n_jobs=4,
    random_state=42,
    verbose=True,
)

In [None]:
start_time = time.time()

# random_state seeds the sampling of parameter combinations so that a fresh
# "Restart & Run All" evaluates the same candidates and picks the same winner.
CV_rfr = RandomizedSearchCV(estimator=rfr, param_distributions=random_grid, cv=2,
                            random_state=42)
CV_rfr.fit(X_train, y_train)

# Wall-clock duration of the whole search, in minutes.
print("--- %s min ---" % ((time.time() - start_time)/60))

In [None]:
# Summary of the finished search.
print("best_estimator_", CV_rfr.best_estimator_)
print("best_params_", CV_rfr.best_params_)
# get_params is a method: it has to be called, otherwise only the bound-method
# repr is printed instead of the parameters.
print("get_params", CV_rfr.get_params())
# Other fitted attributes worth inspecting when debugging the search:
# best_index_, cv_results_, n_features_in_, n_splits_

In [None]:
rfr = CV_rfr.best_estimator_

# Below is the best estimator achieved so far. 
# Feel free to throw in some comments if you think you can help us improve! 
# Would love to hear from you all.

# rfr = RandomForestRegressor(max_leaf_nodes=5000, min_samples_split=60,
#                             n_estimators=1300, n_jobs=4, random_state=42,
#                             verbose=True)


In [None]:
start_time = time.time()
# If rfr is CV_rfr.best_estimator_, the search has already refit it on X_train
# (refit defaults to True); the fit is kept so that a manually configured
# estimator assigned to rfr is trained here as well.
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
# RMSE on the held-out split. The square root is taken explicitly because the
# `squared=False` shortcut of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases.
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print("--- %s seconds ---" % (time.time() - start_time))

## Test dataset

Read the test dataset and preprocess it with the same steps as the train set.

In [None]:
test=pd.read_csv(r'../input/black-friday-sales-prediction/test.csv')
test.head()

In [None]:
test = DataCleaning(test)

In [None]:
test['Product_ID'] = label.transform(test['Product_ID']) 

In [None]:
test.info()

In [None]:
test.columns

#### Handling Product_Category_x fields

In [None]:
# Add all-zero indicator columns for the (slot, category) pairs listed below, so
# that every slot exposes categories 1..20, as the combining step expects.
# NOTE(review): presumably these are the pairs get_dummies did not produce for the
# test data — confirm against test.columns.
for absent_col in ('Product_Category_1_19', 'Product_Category_1_20',
                   'Product_Category_2_1', 'Product_Category_2_19', 'Product_Category_2_20',
                   'Product_Category_3_1', 'Product_Category_3_2', 'Product_Category_3_7',
                   'Product_Category_3_19', 'Product_Category_3_20'):
    test[absent_col] = 0

# The *_0 indicators only flag the 0 placeholder written for missing values in
# DataCleaning, so they are discarded.
test = test.drop(columns=['Product_Category_2_0', 'Product_Category_3_0'])

In [None]:
df4=pd.DataFrame()

In [None]:
# For every category id, add up its indicator from the three Product_Category slots
# of the test frame and store the result in df4 as 'Product_Category_<id>'.
for category_id in myL:
    slot_1, slot_2, slot_3 = (test['Product_Category_%d_%s' % (slot, category_id)] for slot in (1, 2, 3))
    df4['Product_Category_' + category_id] = slot_1 + slot_2 + slot_3

In [None]:
test = test.drop(myL2, axis=1)
test

In [None]:
test = test.merge(df4, left_index=True, right_index=True)
test

In [None]:
test.info()

In [None]:
test[features]

### Scaling down

Scale the test dataset with the scaler that was fitted earlier.

In [None]:
# Select the model's feature columns, in training order, before scaling.
# Scaling the whole `test` frame would discard this selection and rely on the
# frame happening to have the same columns in the same order as `features`.
test2 = test[features].values
test2 = scaler.transform(test2)

## Predicting

Predicting Purchase values for Test dataset

In [None]:
# Re-read the raw identifier columns (the Product_ID in `test` has been label
# encoded), attach the predicted Purchase values, and write the submission file
# with Purchase as the first column.
output = pd.read_csv(
    r'../input/black-friday-sales-prediction/test.csv',
    usecols=['User_ID', 'Product_ID'],
)
output['Purchase'] = rfr.predict(test2)
output2 = output.loc[:, ['Purchase', 'User_ID', 'Product_ID']]
output2.to_csv('submission.csv', index=False)

In [None]:
os.listdir()

### submission.csv is ready!

So far, we've achieved an RMSE of 2679 on the test dataset.
We would appreciate it if you could share your comments and help us improve our model.

