In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped with the attached Kaggle datasets.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Author: Yicheng Jin

# **Step1 Obtain the data**

In [None]:
# Read the competition's training CSV into a DataFrame.
train_csv_path = '../input/otto-group-product-classification-challenge/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training frame.
data.describe()

In [None]:
# (n_rows, n_columns) of the training frame.
data.shape # author's recorded output: (61878, 95)

# **Step2 Data Visualization**

In [None]:
import seaborn as sns

# Pass the labels through the explicit `x` keyword. Recent seaborn releases treat the
# first positional argument as `data`, so a bare positional Series is no longer
# interpreted as the variable to count.
ax = sns.countplot(x=data['target'])
ax.set_title('Class distribution in the full training set')
plt.show()

**Plotting the label counts shows that the classes are unevenly distributed. If all the data were used as-is, the model would be biased towards the more frequent classes, so a resampling method should be adopted.**

# **Step3 Processing of Data**

In [None]:
#Use random undersampling

# Features: every column except the row identifier and the label.
x = data.drop(columns=['id', 'target'])
# Labels: the `target` column.
y = data['target']

In [None]:
from imblearn.under_sampling import RandomUnderSampler  # random undersampling API

# Fixed seed so the rows kept by the undersampler are the same on every run.
rus = RandomUnderSampler(random_state=0)
resampled = rus.fit_resample(x, y)
X_resampled, y_resampled = resampled

In [None]:
# Sanity check: features and labels must have the same number of rows after undersampling.
print(X_resampled.shape,y_resampled.shape) # author's recorded output: (17361, 93) (17361,), i.e. 17361 rows x 93 columns

**Observe the data label distribution after undersampling**

In [None]:
# Pass the labels through the explicit `x` keyword. Recent seaborn releases treat the
# first positional argument as `data` rather than as the variable to count.
ax = sns.countplot(x=y_resampled)
ax.set_title('Class distribution after random undersampling')
plt.show()

**After undersampling, the label distribution is very even. Next, the label values are converted into numerical values using LabelEncoder.**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it, so the two steps are explicit.
le = LabelEncoder()
le.fit(y_resampled)
y_resampled = le.transform(y_resampled)
# Display the encoded labels.
y_resampled

# **Step4 Split data to training set and cv set**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the undersampled data as a validation ("cv") set.
# random_state makes the split (and every metric computed from it) reproducible on
# Restart & Run All; stratify keeps the balanced class proportions in both parts, so
# every class is guaranteed to be present in the validation labels.
Xtrain,Xcv,ytrain,ycv = train_test_split(X_resampled,y_resampled,
                                         test_size = 0.2,
                                         random_state = 0,
                                         stratify = y_resampled)


In [None]:
# Sizes of the training and validation feature matrices.
print(Xtrain.shape,Xcv.shape) # author's recorded output: (13888, 93) (3473, 93)

# **Step5 Model Training**

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with default hyperparameters.
# oob_score=True additionally computes the out-of-bag accuracy during fit.
# random_state / n_jobs match the tuned forests later in the notebook, so the baseline
# numbers are reproducible and directly comparable with them.
rfc = RandomForestClassifier(oob_score=True,
                             random_state=0,
                             n_jobs=-1)
rfc.fit(Xtrain,ytrain)

In [None]:
# Mean accuracy of the baseline forest on the held-out validation split.
rfc.score(Xcv,ycv)

In [None]:
# Out-of-bag score from training (available because the forest was built with oob_score=True).
rfc.oob_score_

In [None]:
# Distribution of the predicted classes on the validation split.
# Pass the predictions through the explicit `x` keyword. Recent seaborn releases treat
# the first positional argument as `data` rather than as the variable to count.
ax = sns.countplot(x=rfc.predict(Xcv))
ax.set_title('Predicted class distribution on the validation set')
plt.show()

**Using Logloss to evaluate the model**

In [None]:
from sklearn.metrics import log_loss
# Deliberately kept commented out: the author reports that this call raises an error,
# because rfc.predict(Xcv) yields hard class labels rather than per-class scores.
# log_loss(ycv,rfc.predict(Xcv),eps=1e-15,normalize=True)  # eps is an input parameter of the log-loss evaluation, here 10^-15

**An error will be reported here, because `log_loss` expects `y_pred` to contain one column of scores per class, whereas `rfc.predict` returns a 1-D vector of class labels.
Thus we need to use OneHotEncoder to convert the outputs of this multi-class problem into one-hot form**

In [None]:
# Preview only (the result is not assigned): reshape(-1,1) turns the 1-D label vector into a column vector.
ycv.reshape(-1,1)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Pin the encoder's categories to the forest's own class order, so the one-hot columns
# always line up with the columns returned by rfc.predict_proba.
one_hot = OneHotEncoder(categories=[rfc.classes_])

# Fit on the true labels once, then only *transform* the predictions. Re-fitting the
# encoder on the predictions would yield fewer, misaligned columns whenever some class
# is never predicted.
# .toarray() produces the dense matrix without relying on the `sparse` / `sparse_output`
# constructor argument, whose name differs between scikit-learn versions.
ycv_onehot = one_hot.fit_transform(ycv.reshape(-1,1)).toarray()
ypred_onehot = one_hot.transform(rfc.predict(Xcv).reshape(-1,1)).toarray()

In [None]:
# Display the one-hot encoded validation labels to check that the conversion worked.
ycv_onehot

In [None]:
# Log loss of the hard (0/1) one-hot predictions.
# `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and older
# releases already default to 1e-15. With recent releases the clipping constant is
# chosen automatically, so the value for hard 0/1 predictions may differ slightly from
# the recorded one.
log_loss(ycv_onehot,ypred_onehot,normalize=True) # author's recorded output: 7.707328449771324

**The current log_loss is 7.707, but the ypred here is obtained directly through rfc.predict, so its values are only 0 and 1. We can use rfc.predict_proba to get the probability of each category instead, which reduces the log_loss**

In [None]:
# Per-class probabilities predicted by the baseline forest for the validation split.
ypred_proba = rfc.predict_proba(Xcv)

In [None]:
# Display the probabilities: one row per sample and one column per class, similar to
# the one-hot layout, so no reshaping is needed.
ypred_proba

In [None]:
# Log loss of the probabilistic predictions.
# `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and older
# releases already default to 1e-15.
log_loss(ycv_onehot,ypred_proba,normalize=True) # author's recorded output: 0.7558184520641458, much smaller (7.707 -> 0.756)

# **Step6 Model tuning**
# 

In [None]:
## Determine which hyperparameters need to be adjusted
# n_estimators, max_features, max_depth, min_samples_leaf

# **6.1. n_estimators**

In [None]:
# Candidate values for n_estimators: 10, 20, ..., 190
tune_params = range(10,200,10)

# One slot per candidate: out-of-bag accuracy and validation log loss
accuracy_t = np.zeros(len(tune_params))
error_t = np.zeros(len(tune_params))

# Sweep n_estimators while every other hyperparameter stays fixed
for i, param in enumerate(tune_params):
    rfc2 = RandomForestClassifier(oob_score=True,
                                 n_estimators = param,
                                 max_depth=10,
                                 max_features =10,
                                 min_samples_leaf=10,
                                 random_state =0,
                                 n_jobs=-1)
    rfc2.fit(Xtrain,ytrain)
    # Out-of-bag accuracy of this forest
    accuracy_t[i] = rfc2.oob_score_

    # Validation log loss from the predicted class probabilities.
    # `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and
    # older releases already default to 1e-15.
    ypred = rfc2.predict_proba(Xcv)
    error_t[i] = log_loss(ycv_onehot,ypred,normalize=True)

    print(error_t[i])
    

In [None]:
#Visualization of optimization results
# Side-by-side curves: validation log loss (left) and out-of-bag accuracy (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4))

for ax, curve, ylabel in zip(axes, (error_t, accuracy_t), ('error_t', 'accuracy_t')):
    ax.plot(tune_params, curve)
    ax.set_xlabel('n_estimators')
    ax.set_ylabel(ylabel)
    ax.grid(True)

plt.show()

# Analysis: Based on the visualization results, n_estimators=175 is chosen

# 6.2 max_features

In [None]:
#Determine the value range of max_features
# Candidate values for max_features: 5, 10, ..., 35
tune_params = range(5,40,5)

# One slot per candidate: out-of-bag accuracy and validation log loss
accuracy_t = np.zeros(len(tune_params))
error_t = np.zeros(len(tune_params))

# Sweep max_features with n_estimators fixed at the value chosen in 6.1
for i, param in enumerate(tune_params):
    rfc3 = RandomForestClassifier(oob_score=True,
                                 n_estimators = 175,
                                 max_depth=10,
                                 max_features =param,
                                 min_samples_leaf=10,
                                 random_state =0,
                                 n_jobs=-1)
    rfc3.fit(Xtrain,ytrain)
    # Out-of-bag accuracy of this forest
    accuracy_t[i] = rfc3.oob_score_

    # Validation log loss from the predicted class probabilities.
    # `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and
    # older releases already default to 1e-15.
    ypred = rfc3.predict_proba(Xcv)
    error_t[i] = log_loss(ycv_onehot,ypred,normalize=True)

    print(error_t[i])

# Visualization: validation log loss (left) and out-of-bag accuracy (right)
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4))
axes[0].plot(tune_params,error_t)
axes[1].plot(tune_params,accuracy_t)

axes[0].set_xlabel('max_features')
axes[0].set_ylabel('error_t')
axes[1].set_xlabel('max_features')
axes[1].set_ylabel('accuracy_t')

axes[0].grid(True)
axes[1].grid(True)

plt.show()

# Analysis: The error gradually decreases and then stabilizes as max_features increases, whereas accuracy first increases and then decreases. I therefore choose max_features=15, where accuracy has not yet dropped much.


# 6.3 max_depth

In [None]:
#Determine the value range of max_depth
# Candidate values for max_depth: 10, 20, 30, 40
tune_params = range(10,50,10)

# One slot per candidate: out-of-bag accuracy and validation log loss
accuracy_t = np.zeros(len(tune_params))
error_t = np.zeros(len(tune_params))

# Sweep max_depth with n_estimators and max_features fixed at the values chosen above
for i, param in enumerate(tune_params):
    rfc4 = RandomForestClassifier(oob_score=True,
                                 n_estimators = 175,
                                 max_depth=param,
                                 max_features =15,
                                 min_samples_leaf=10,
                                 random_state =0,
                                 n_jobs=-1)
    rfc4.fit(Xtrain,ytrain)
    # Out-of-bag accuracy of this forest
    accuracy_t[i] = rfc4.oob_score_

    # Validation log loss from the predicted class probabilities.
    # `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and
    # older releases already default to 1e-15.
    ypred = rfc4.predict_proba(Xcv)
    error_t[i] = log_loss(ycv_onehot,ypred,normalize=True)

    print(error_t[i])

# Visualization: validation log loss (left) and out-of-bag accuracy (right)
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4))
axes[0].plot(tune_params,error_t)
axes[1].plot(tune_params,accuracy_t)

axes[0].set_xlabel('max_depth')
axes[0].set_ylabel('error_t')
axes[1].set_xlabel('max_depth')
axes[1].set_ylabel('accuracy_t')

axes[0].grid(True)
axes[1].grid(True)

plt.show()

# Analysis: According to the results, a max_depth of around 30 is the most appropriate. If it is too high, the model will overfit.

# 6.4 min_samples_leaf

In [None]:
#Determine the value range of min_samples_leaf
# Candidate values for min_samples_leaf: 1, 3, 5, 7, 9
tune_params = range(1,10,2)

# One slot per candidate: out-of-bag accuracy and validation log loss
accuracy_t = np.zeros(len(tune_params))
error_t = np.zeros(len(tune_params))

# Sweep min_samples_leaf with the other three hyperparameters fixed at the values chosen above
for i, param in enumerate(tune_params):
    rfc5 = RandomForestClassifier(oob_score=True,
                                 n_estimators = 175,
                                 max_depth=30,
                                 max_features =15,
                                 min_samples_leaf=param,
                                 random_state =0,
                                 n_jobs=-1)
    rfc5.fit(Xtrain,ytrain)
    # Out-of-bag accuracy of this forest
    accuracy_t[i] = rfc5.oob_score_

    # Validation log loss from the predicted class probabilities.
    # `eps` is no longer passed: it is deprecated in recent scikit-learn releases, and
    # older releases already default to 1e-15.
    ypred = rfc5.predict_proba(Xcv)
    error_t[i] = log_loss(ycv_onehot,ypred,normalize=True)

    print(error_t[i])

# Visualization: validation log loss (left) and out-of-bag accuracy (right)
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4))
axes[0].plot(tune_params,error_t)
axes[1].plot(tune_params,accuracy_t)

axes[0].set_xlabel('min_samples_leaf')
axes[0].set_ylabel('error_t')
axes[1].set_xlabel('min_samples_leaf')
axes[1].set_ylabel('accuracy_t')

axes[0].grid(True)
axes[1].grid(True)

plt.show()

min_samples_leaf := 1

# Train an optimal rfc_best with min_samples_leaf=1, max_depth=30, max_features=15 and n_estimators=175.

In [None]:
# Load the competition test set; the `id` column is dropped from the matrix passed to
# the model and is kept in test_data for the submission file.
test_data = pd.read_csv('../input/otto-group-product-classification-challenge/test.csv')
Xtest = test_data.drop(['id'],axis=1)
# head must be *called*: the bare attribute `Xtest.head` only displays the bound-method
# repr instead of the first rows.
Xtest.head()

In [None]:
# Hyperparameters selected one at a time in Step 6.
best_params = dict(n_estimators=175,
                   max_depth=30,
                   max_features=15,
                   min_samples_leaf=1)

# Final model: fitted on the whole undersampled set (training + validation parts).
rfc_best = RandomForestClassifier(oob_score=True,
                                  random_state=0,
                                  n_jobs=-1,
                                  **best_params)
rfc_best.fit(X_resampled, y_resampled)


In [None]:
# Out-of-bag score of the final model.
rfc_best.oob_score_    # author's recorded values: 0.763536866359447  ---->  0.7752433615575139

In [None]:
# Hard class predictions for the test set (not used by the submission built below).
y_test_pred = rfc_best.predict(Xtest)
# Per-class probabilities for the test set; these are what the submission is built from.
y_test_proba = rfc_best.predict_proba(Xtest)

In [None]:
# Display the predicted test-set probabilities.
y_test_proba

# **Step7 Generate submission**

In [None]:
# Submission column names: Class_1 ... Class_9, one per probability column.
class_columns = [f'Class_{k}' for k in range(1, 10)]
final = pd.DataFrame(y_test_proba, columns=class_columns)
final

add an id column

In [None]:
# Put the test-set ids in front of the probability columns.
# The guard makes the cell safe to re-run: DataFrame.insert raises if the column
# already exists.
if 'id' not in final.columns:
    final.insert(loc=0,column='id',value = test_data['id'])
final

In [None]:
# Write the submission file to the working directory; index=False omits the DataFrame index column.
final.to_csv('./final.csv',index=False)