# **Walmart: Trip Type Classification**

# Importing files from google drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
downloaded = drive.CreateFile({'id':"1907ZpGGcxQx-ddQMk92E_dWZhSwNX9cH"})   # replace the id with id of file you want to access
downloaded.GetContentFile('train.csv')        # replace the file name with your file

In [None]:
downloaded = drive.CreateFile({'id':"1T8OerWn18N0lL1Oo1Gb_ayjbWX01Z-Sg"})   # replace the id with id of file you want to access
downloaded.GetContentFile('test.csv')        # replace the file name with your file

In [None]:
downloaded = drive.CreateFile({'id':"1ugSoizEczwmNhxwB4AeKHp8YH5lvsqR4"})   # replace the id with id of file you want to access
downloaded.GetContentFile('sample_submission.csv')        # replace the file name with your file

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# DATA

### Reading data

In [None]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

## Data Description

### **Problem statement**: categorizing shopping trip types based on the items that customers purchased. 
To give a few hypothetical examples of trip types: a customer may make a small daily dinner trip, a weekly large grocery trip, a trip to buy gifts for an upcoming holiday, or a seasonal trip to buy clothes.

Walmart has categorized the trips contained in this data into 38 distinct types using a proprietary method applied to an extended set of data. You are challenged to recreate this categorization/clustering with a more limited set of features. This could provide new and more robust ways to categorize trips.

The training set (train.csv) contains a large number of customer visits with the TripType included. You must predict the TripType for each customer visit in the test set (test.csv). Each visit may only have one TripType. You will not be provided with more information than what is given in the data (e.g. what the TripTypes represent or more product information).

### **Data fields**

#### *TripType* - a categorical id representing the type of shopping trip the customer made. This is the ground truth that you are predicting. TripType_999 is an "other" category.
#### *VisitNumber* - an id corresponding to a single trip by a single customer
#### *Weekday* - the weekday of the trip
#### *Upc* - the UPC number of the product purchased
#### *ScanCount* - the number of the given item that was purchased. A negative value indicates a product return.
#### *DepartmentDescription* - a high-level description of the item's department
#### *FinelineNumber* - a more refined category for each of the products, created by Walmart

# Exploring Data

## Checking for duplication

## Deduplication

In [None]:
train_data=train_data.drop_duplicates()

In [None]:
train_data.head()

In [None]:
train_data.shape

In [None]:
train_data.info()

# Dealing with missing values

**FinelineNumber has the most missing values; filling them with the most frequent value in that column.**

In [None]:
train_data.FinelineNumber.value_counts()

In [None]:
# Fill missing FinelineNumber values with the column's most frequent value,
# computed from the data rather than hard-coded.
most_frequent_fineline=train_data['FinelineNumber'].mode().iloc[0]
train_data['FinelineNumber']=train_data['FinelineNumber'].fillna(most_frequent_fineline)

In [None]:
train_data.info()

**Dropping the rest of the rows with missing values, which are very few compared to the total size of the dataset**

In [None]:
train_data.dropna(inplace=True)

In [None]:
train_data.shape

In [None]:
train_data.head()

In [None]:
sns.heatmap(train_data.isna())

## Univariate analysis

### TripType

In [None]:
train_data.TripType.nunique()

38 different class labels 

In [None]:
train_data.TripType.unique()

In [None]:
plt.figure(figsize=(12,6))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data.TripType)

In [None]:
plt.figure(figsize=(12,6))
sns.distplot(train_data.TripType,hist=False)

**Imbalanced data**: the distribution of the target variable is skewed; there are many classes with very few data points.
TripTypes 36 to 40 have the most data points.

### VisitNumber

In [None]:
train_data.VisitNumber.nunique()

In [None]:
train_data.shape[0]

VisitNumber is a categorical variable

In [None]:
plt.figure(figsize=(12,6))
sns.distplot(train_data.VisitNumber)

***VisitNumber shows approximately uniform distribution***

In [None]:
train_data.head()

### Weekday distribution 

In [None]:
plt.figure(figsize=(12,6))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data.Weekday)

In [None]:
train_data.groupby('Weekday')['Weekday'].count()

In [None]:
def weekday_to_num(x):
  """Map a weekday name to its position in the week (Monday=0 ... Sunday=6).

  Returns None when x is not one of the seven capitalised weekday names.
  """
  weekday_order=('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')
  for position,day_name in enumerate(weekday_order):
    if x==day_name:
      return position
  return None

In [None]:
train_data['weekday_num']=train_data.Weekday.apply(weekday_to_num)

In [None]:
train_data.head()

In [None]:
train_data.Weekday.unique()

In [None]:
fig,axs=plt.subplots(figsize=(10,6))
sns.distplot(train_data['weekday_num'],ax=axs)
ticks=list(range(0,7))
axs.set_xticks(ticks)
x_tick_label=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
axs.set_xticklabels(x_tick_label)
plt.show()

***As expected, visits are more likely on weekends. Thursday is the day with the lowest number of visits.***

### ScanCount

In [None]:
train_data.ScanCount.nunique()

In [None]:
train_data.ScanCount.value_counts()

In [None]:
# Reduce only the ScanCount column instead of taking the minimum of every column.
train_data.ScanCount.min()

In [None]:
# Reduce only the ScanCount column instead of taking the maximum of every column.
train_data.ScanCount.max()

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(train_data.ScanCount,bins=39)

***ScanCount values range from -12 to 71. 71 seems to be an outlier, so to get a clearer view the distribution is plotted in two parts.***

In [None]:
plt.figure(figsize=(10,6))
sns.distplot(train_data[train_data.ScanCount<=10].ScanCount)


In [None]:
plt.figure(figsize=(10,6))
sns.distplot(train_data[train_data.ScanCount>10].ScanCount,bins=4)


In [None]:
plt.figure(figsize=(10,6))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data[train_data.ScanCount<=10].ScanCount)

In [None]:
plt.figure(figsize=(10,6))
# Pass the column by keyword: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data[train_data.ScanCount>10].ScanCount)

***As we can see, a ScanCount of 1 is by far the most common, and the number of data points decreases as ScanCount increases. There are also few data points for returned products, since negative ScanCounts are rare.***

In [None]:
train_data.head()

### DepartmentDescription	

In [None]:
train_data.DepartmentDescription.nunique()	

In [None]:
train_data.DepartmentDescription.value_counts()	

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(x='DepartmentDescription',data=train_data)
plt.xticks(rotation=90)
plt.show()

In [None]:
train_data.groupby('DepartmentDescription')['DepartmentDescription'].count()

In [None]:
department_list=list(train_data['DepartmentDescription'].unique())

In [None]:
department_enumerate=list(enumerate(department_list))

In [None]:
department_dict={v:k for k,v in department_enumerate}

In [None]:
department_dict.values()

In [None]:
def department_num(x):
  """Return the integer code stored for department name x in department_dict.

  Raises KeyError when x is not a key of department_dict.
  """
  department_code=department_dict[x]
  return department_code
  

In [None]:
train_data['Department_num']=train_data['DepartmentDescription'].apply(department_num)

In [None]:
train_data.head()

In [None]:
fig,axs=plt.subplots(figsize=(18,8))
# Take the bin/tick count from the encoded department list so the number of
# ticks always equals the number of labels (a mismatch makes set_xticklabels fail).
n_departments=len(department_list)
sns.distplot(train_data['Department_num'],bins=n_departments,ax=axs)
axs.set_xticks(list(range(n_departments)))
# department_list[i] is the department that was encoded as i, so labels line up with codes.
axs.set_xticklabels(department_list)
plt.xticks(rotation=90)
plt.show()



The distribution of DepartmentDescription is **skewed**: the categories **PERSONAL CARE, DSD GROCERIES, DAIRY, PRODUCE, GROCERY DRY GOODS** dominate the data.

### FinelineNumber

In [None]:
train_data.nunique().FinelineNumber

Too many categories in FinelineNumber

In [None]:
plt.figure(figsize=(10,6))
sns.distplot(train_data.FinelineNumber,hist=False)

we may say that this is a multimodal distribution. certain values appear more frequently than others.

### Upc

In [None]:
train_data.Upc.nunique()

In [None]:
train_data.shape

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(train_data.Upc,hist=False)

In [None]:
train_data.head()

# featurization

### creating a new feature 'num_of_products_for_VisitNumber' which shows the number of products purchased for each 'VisitNumber' based on 'Upc'

In [None]:
# seeing for each VisitNumber how many products were purchased based on Upc number of product purchased
products_per_visit=train_data.groupby(['VisitNumber'])['Upc'].count()

In [None]:
products_per_visit_dict=dict(products_per_visit)

In [None]:
train_data['num_of_products_for_VisitNumber']=train_data['VisitNumber'].apply(lambda x:products_per_visit_dict.get(x,0))

In [None]:
# train_data.drop(columns=['num_of_products'],inplace=True)

In [None]:
train_data.head()

In [None]:
train_data.num_of_products_for_VisitNumber.nunique()

In [None]:
plt.figure(figsize=(12,6))
sns.distplot(train_data.num_of_products_for_VisitNumber,bins=99)

As expected, the distribution of the number of products purchased for each VisitNumber is right skewed: buying a small number of products is more probable.

Seems like num_of_products purchased shows lognormal distribution, checking for same.

In [None]:
plt.figure(figsize=(12,6))
sns.distplot(np.log(train_data.num_of_products_for_VisitNumber),hist=False)

num_of_products does not show lognormal distribution.

Checking if num_of_products is somehow related to the day on which shopping was done.

In [None]:
train_data.head()

In [None]:
sns.jointplot(y='num_of_products_for_VisitNumber',x='weekday_num',data=train_data)

In [None]:
plt.figure(figsize=(8,8))
sns.boxplot(x='weekday_num',y='num_of_products_for_VisitNumber',data=train_data)

On weekends and on mondays there seem to be more products bought per VisitNumber

In [None]:
train_data.FinelineNumber.nunique()

In [None]:
train_data.Department_num.unique()

In [None]:
groupby_dept=train_data.groupby(['Department_num'])

fineline_dict is a dictionary where the key is Department_num and the value is the number of unique FinelineNumber values in that department

In [None]:
# Map each Department_num to the number of distinct FinelineNumber values in it.
# Aggregating over the groups that exist avoids assuming there are exactly 68 departments.
fineline_dict=groupby_dept['FinelineNumber'].nunique().to_dict()



In [None]:
sns.jointplot(y='num_of_products_for_VisitNumber',x='TripType',data=train_data[train_data['TripType']<900])

num_of_products seem to increase as category number of TripType increase.

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='num_of_products_for_VisitNumber',data=train_data)

**We do see a trend here: the boxplots of num_of_products for the various TripTypes don't fully overlap, hence num_of_products might prove to be a distinctive feature**

In [None]:
train_data.head()

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='Department_num',data=train_data)

**The boxplots of Department_num for the various TripTypes also don't fully overlap, hence Department_num might prove to be a distinctive feature**

In [None]:
weekday_num_of_products=dict(train_data.groupby('weekday_num')['Upc'].count())

In [None]:
train_data['num_of_products_for_weekday']=train_data['weekday_num'].apply(lambda x:weekday_num_of_products.get(x))

In [None]:
train_data.head()

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='num_of_products_for_weekday',data=train_data)

**'num_of_products_for_weekday' does not seem to be very useful; removing it**

In [None]:
train_data.drop(columns=['num_of_products_for_weekday'],inplace=True)

In [None]:
Department_num_of_products=dict(train_data.groupby('Department_num')['Upc'].count())

In [None]:
train_data['num_of_products_for_department']=train_data['Department_num'].apply(lambda x:Department_num_of_products.get(x))

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='num_of_products_for_department',data=train_data)

**'num_of_products_for_department' seems to be useful. keeping it.**

In [None]:
train_data.FinelineNumber.nunique()

In [None]:
Fineline_num_of_products=dict(train_data.groupby('FinelineNumber')['Upc'].count())

In [None]:
train_data['num_of_products_for_fineline']=train_data['FinelineNumber'].apply(lambda x:Fineline_num_of_products.get(x))

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='num_of_products_for_fineline',data=train_data)

**'num_of_products_for_fineline' does not seem to be very useful; removing it**

In [None]:
train_data.drop(columns=['num_of_products_for_fineline'],inplace=True)

In [None]:
train_data.head()

In [None]:
train_data.Weekday.value_counts()

In [None]:
train_data.DepartmentDescription.value_counts()

In [None]:
train_data.head()

## **one hot encoding Weekdays**

In [None]:
one_hot_encoded_weekday=pd.get_dummies(train_data['Weekday'],drop_first=False)

In [None]:
one_hot_encoded_weekday.head()

In [None]:
train_data=pd.concat([train_data,one_hot_encoded_weekday],axis=1)

In [None]:
train_data.head()

# Splitting data into X and Y and then into train and cross validation and test sets.

**adding 'FinelineCat' feature**

In [None]:
Y=train_data.TripType

In [None]:
X=train_data.drop(columns=['TripType'])

In [None]:
X['FinelineCat']=pd.cut(X['FinelineNumber'],bins=50,labels=False)

In [None]:
X.head()

In [None]:
Y.head()

In [None]:
train_data.shape

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split into train+cv and test, then 80/20 again into train and cv.
# A fixed random_state makes the splits (and everything derived from them) reproducible.
x_train,x_test,y_train,y_test=train_test_split(X,Y,stratify=Y,test_size=0.2,random_state=42)
x_train,x_cv,y_train,y_cv=train_test_split(x_train,y_train,stratify=y_train,test_size=0.2,random_state=42)

In [None]:
print(x_train.shape,x_cv.shape,x_test.shape)

In [None]:
print(y_train.shape,y_cv.shape,y_test.shape)

# Response encoding for the rest of the categorical features

### **Creating a new temporary dataframe from x_train and y_train for getting lookup table for response coding of some categorical features**

In [None]:
x_train_res=x_train.copy()

In [None]:
x_train_res['class']=y_train.values

In [None]:
y_train.unique()

In [None]:
x_train_res.head()

### **Functions for response encoding** 

In [None]:
#function to get lookup dictionary based on train data only
from tqdm import tqdm
def get_lookup_dict(alpha,feature):
  """Build the response-coding lookup table for one categorical feature.

  Reads the module-level training frame x_train_res (which carries the
  target in its 'class' column) and the module-level y_train.

  Parameters
  ----------
  alpha : smoothing strength; each class count gets alpha*10 added to it.
  feature : name of the column in x_train_res to encode.

  Returns
  -------
  dict mapping every value of the feature seen in x_train_res to a list of
  smoothed class probabilities, ordered like y_train.unique().
  """
  class_order = list(y_train.unique())
  n_classes = len(class_order)
  value_count = x_train_res[feature].value_counts()
  # One pass over the data: rows are feature values, columns are class labels.
  class_counts = pd.crosstab(x_train_res[feature], x_train_res['class']).reindex(columns=class_order, fill_value=0)
  lookup_dict = dict()
  for i, denominator in value_count.items():
    counts_for_value = class_counts.loc[i]
    # The denominator adds the same pseudo-count once per class so that every
    # vector sums to 1 (the former constant 90 only did so for 9 classes).
    lookup_dict[i] = [(counts_for_value.loc[k] + alpha*10)/ (denominator + alpha*10*n_classes) for k in class_order]

  return lookup_dict


In [None]:
from tqdm import tqdm
def get_encoded_feature(alpha,feature,df,lookup_dict):
  """Response-encode one column of df using a pre-computed lookup table.

  Parameters
  ----------
  alpha : unused; kept so existing calls keep working.
  feature : name of the column of df to encode.
  df : frame whose rows are encoded, in order.
  lookup_dict : mapping from feature value to its class-probability vector,
    as returned by get_lookup_dict.

  Returns
  -------
  list with one probability vector per row of df. Values missing from
  lookup_dict get a uniform vector of the same length as the stored ones
  (38 entries when lookup_dict is empty).
  """
  # Take the vector length from the table instead of hard-coding the class count.
  n_classes = len(next(iter(lookup_dict.values()))) if lookup_dict else 38
  uniform_vec = [1/n_classes]*n_classes
  # One dict lookup per row; the table's keys are exactly the values seen in training.
  return [lookup_dict.get(value, uniform_vec) for value in df[feature]]

### **response encoding 'DepartmentDescription'**

In [None]:
lookup_dict_DD=get_lookup_dict(1,'DepartmentDescription')

In [None]:
alpha = 1
train_department_feature_responseCoding=np.array(get_encoded_feature(alpha, "DepartmentDescription", x_train,lookup_dict_DD))

In [None]:
test_department_feature_responseCoding=np.array(get_encoded_feature(alpha, "DepartmentDescription", x_test,lookup_dict_DD))

In [None]:
cv_department_feature_responseCoding=np.array(get_encoded_feature(alpha, "DepartmentDescription", x_cv,lookup_dict_DD))

In [None]:
train_department_feature_responseCoding.shape

In [None]:
test_department_feature_responseCoding.shape

In [None]:
cv_department_feature_responseCoding.shape

### **response encoding 'ScanCount'**

In [None]:
train_data.ScanCount.nunique()

In [None]:
lookup_dict_SS=get_lookup_dict(1,'ScanCount')

In [None]:
train_ScanCount_feature_responseCoding=np.array(get_encoded_feature(alpha, "ScanCount", x_train,lookup_dict_SS))

In [None]:
test_ScanCount_feature_responseCoding=np.array(get_encoded_feature(alpha, "ScanCount", x_test,lookup_dict_SS))

In [None]:
cv_ScanCount_feature_responseCoding=np.array(get_encoded_feature(alpha, "ScanCount", x_cv,lookup_dict_SS))

In [None]:
train_ScanCount_feature_responseCoding.shape

In [None]:
test_ScanCount_feature_responseCoding.shape

In [None]:
cv_ScanCount_feature_responseCoding.shape

### **response encoding 'FinelineCat'**

In [None]:
train_data.FinelineNumber.nunique()

'FinelineNumber' has too many categories. we will try to discretise it further into small number of categories

In [None]:
train_data.FinelineNumber

In [None]:
train_data['FinelineCat']=pd.cut(train_data['FinelineNumber'],bins=50,labels=False)

In [None]:
plt.figure(figsize=(9,6))
sns.distplot(train_data['FinelineCat'])

we will see if this new feature is useful in classification

In [None]:
plt.figure(figsize=(20,8))
sns.boxplot(x='TripType',y='FinelineCat',data=train_data)

Not very useful, but we will response encode this feature anyway

we will need to add this feature to X first

In [None]:
lookup_dict_FNC=get_lookup_dict(1,'FinelineCat')

In [None]:
train_FinelineCat_feature_responseCoding=np.array(get_encoded_feature(alpha, "FinelineCat", x_train,lookup_dict_FNC))

In [None]:
test_FinelineCat_feature_responseCoding=np.array(get_encoded_feature(alpha, "FinelineCat", x_test,lookup_dict_FNC))

In [None]:
cv_FinelineCat_feature_responseCoding=np.array(get_encoded_feature(alpha, "FinelineCat", x_cv,lookup_dict_FNC))

In [None]:
train_FinelineCat_feature_responseCoding.shape

In [None]:
test_FinelineCat_feature_responseCoding.shape

In [None]:
cv_FinelineCat_feature_responseCoding.shape

# Stacking all featurized train, test and cv data and preparing final data.

In [None]:
x_train.shape

In [None]:
x_train.head()

In [None]:
# Keep only the model inputs; rebinding instead of inplace=True leaves the split frame untouched.
x_train=x_train.drop(columns=['Weekday','Upc','DepartmentDescription','FinelineNumber','weekday_num','Department_num','FinelineCat'])

In [None]:
x_train.head()

In [None]:
x_train.values

In [None]:
x_tr=np.hstack((x_train.values,train_ScanCount_feature_responseCoding,train_department_feature_responseCoding,train_FinelineCat_feature_responseCoding,))

In [None]:
x_tr.shape

In [None]:
y_train.shape

In [None]:
# Keep only the model inputs; rebinding instead of inplace=True leaves the split frame untouched.
x_test=x_test.drop(columns=['Weekday','Upc','DepartmentDescription','FinelineNumber','weekday_num','Department_num','FinelineCat'])

In [None]:
x_test.head()

In [None]:
x_te=np.hstack((x_test.values,test_ScanCount_feature_responseCoding,test_department_feature_responseCoding,test_FinelineCat_feature_responseCoding,))

In [None]:
# Keep only the model inputs; rebinding instead of inplace=True leaves the split frame untouched.
x_cv=x_cv.drop(columns=['Weekday','Upc','DepartmentDescription','FinelineNumber','weekday_num','Department_num','FinelineCat'])

In [None]:
x_cv=np.hstack((x_cv.values,cv_ScanCount_feature_responseCoding,cv_department_feature_responseCoding,cv_FinelineCat_feature_responseCoding,))

In [None]:
x_cv.shape

In [None]:
y_cv.shape

## saving the three preprocessed and featurized train, test and cv datasets for future importing.

In [None]:
#np.save('x_train_final',x_tr)

In [None]:
#np.save('x_test_final',x_te)

In [None]:
#np.save('x_cv_final',x_cv)

In [None]:
#np.save('y_train_final',y_train.values)

In [None]:
#np.save('y_test_final',y_test.values)

In [None]:
#np.save('y_cv_final',y_cv.values)

In [None]:
# Import under an alias so the PyDrive client bound to `drive` earlier in the
# notebook is not overwritten.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

## loading .npy files

In [None]:
from scipy import sparse

In [None]:
x_cv=sparse.csr_matrix(np.load('/content/drive/My Drive/x_cv_final.npy'))


In [None]:
x_test=sparse.csr_matrix(np.load('/content/drive/My Drive/x_test_final.npy'))

In [None]:
x_train=sparse.csr_matrix(np.load('/content/drive/My Drive/x_train_final.npy'))

In [None]:
y_cv=np.load('/content/drive/My Drive/y_cv_final.npy')

In [None]:
y_test=np.load('/content/drive/My Drive/y_test_final.npy')

In [None]:
y_train=np.load('/content/drive/My Drive/y_train_final.npy')

In [None]:
y_train.shape[0]+y_cv.shape[0]

In [None]:
x_train.shape[0]+x_cv.shape[0]

In [None]:
print(x_train.shape,x_cv.shape,x_test.shape)

In [None]:
print(y_train.shape,y_cv.shape,y_test.shape)

In [None]:
408621+102156

In [None]:
408621+102156

Joining train and cv datasets since we are going to use sklearn's cross validation implementation and not manual

In [None]:
from scipy.sparse import vstack


In [None]:
x_train=vstack((x_train,x_cv))

In [None]:
x_train.shape

In [None]:
# Append the cv labels to the train labels as a 1-D array, the shape scikit-learn
# estimators expect for y (a column vector triggers a conversion warning).
y_train=np.concatenate((np.ravel(y_train),np.ravel(y_cv)))

In [None]:
y_train.shape

In [None]:
y_train=y_train.astype('int')

In [None]:
y_train.dtype

# Performing RandomizedSearchCV to find the best hyperparameters for RandomForestClassifier

In [None]:
random_grid = {'max_depth': [None],
               'min_samples_leaf': [100,1000],
               'min_samples_split': [100,1000],
               'n_estimators': [100,500]}

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf = RandomForestClassifier()
# Randomised search over random_grid with 3-fold cross validation on all available cores.
# n_iter is left at its default of 10, which already covers the 8 combinations in the grid.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, random_state=42, n_jobs = -1)
# Fit on the combined train+cv matrix, which has one row per label in y_train;
# x_tr is the train-only array from the featurization section and does not match y_train.
rf_random.fit(x_train, np.ravel(y_train))

best parameters:

In [None]:
rf_random.best_params_


In [None]:
rf_random.best_estimator_

In [None]:
from sklearn.metrics import log_loss

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Training RandomForestClassifier with best parameters

In [None]:

# Random forest with the hyperparameters chosen above; the seed makes the fit repeatable.
r_cfl=RandomForestClassifier(n_estimators=500,max_depth=None,min_samples_leaf=100,min_samples_split=100, random_state=42,n_jobs=-1)
# Flatten the labels: y_train may be an (n, 1) column vector at this point.
r_cfl.fit(x_train,np.ravel(y_train))


In [None]:
pred=r_cfl.predict(x_cv)

# Training CalibratedClassifier on top of our base Classifier to get calibrated probabilities

In [None]:
# Wrap the forest in a sigmoid calibrator to get calibrated class probabilities.
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(x_train, np.ravel(y_train))
# Score on the held-out test split: x_cv was stacked into x_train before fitting,
# so a loss measured on x_cv would be computed on rows the model was trained on.
predict_y = sig_clf.predict_proba(x_test)
# `eps` is omitted: it is deprecated in recent scikit-learn releases.
loss=log_loss(y_test, predict_y, labels=sig_clf.classes_)

In [None]:
loss

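## **The RandomForestClassifier with parameters 'max_depth' = None, 'min_samples_leaf' = 100, 'min_samples_split' = 100, 'n_estimators' = 500 gives a log-loss of 1.85 on the cross validation data. Note: the cross validation rows were merged into the training set before fitting, so this figure is not an out-of-sample estimate.**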

# Scoring a completely random model to get a baseline log-loss

In [None]:
# Baseline: a "model" that assigns random probabilities to every class.
cv_data_len = x_cv.shape[0]
class_labels = np.unique(y_cv)

# Seeded generator so the baseline number is repeatable between runs.
rng = np.random.default_rng(42)
# One row of random scores per CV sample, normalised so that each row sums to 1.
rand_probs = rng.random((cv_data_len, class_labels.size))
cv_predicted_y = rand_probs / rand_probs.sum(axis=1, keepdims=True)
print("Log loss on Cross Validation Data using Random Model",log_loss(y_cv,cv_predicted_y, labels=class_labels))

# saving the random forest models(calibrated and uncalibrated) in binary files.

In [None]:
import pickle
# save the model to disk
filename = 'rf_cv_log_loss_1.85.sav'
# The context manager closes the file as soon as the dump finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(r_cfl, model_file)
 
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(X_test, Y_test)
#print(result)

In [None]:
filename = 'rf_calibrated_cv_log_loss_1.85.sav'
# The context manager closes the file as soon as the dump finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(sig_clf, model_file)

## **The baseline log-loss of a completely random model is 3.94**

## **The log-loss of a totally random model is 3.94 and that of our RandomForestClassifier is 1.85, i.e. less than half of the random model's. Our model seems to be performing well.**