In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

In [3]:
# Exploratory lookup: list the names scikit-learn exposes at package level.
sklearn.__all__

['calibration',
 'cluster',
 'covariance',
 'cross_decomposition',
 'datasets',
 'decomposition',
 'dummy',
 'ensemble',
 'exceptions',
 'experimental',
 'externals',
 'feature_extraction',
 'feature_selection',
 'gaussian_process',
 'inspection',
 'isotonic',
 'kernel_approximation',
 'kernel_ridge',
 'linear_model',
 'manifold',
 'metrics',
 'mixture',
 'model_selection',
 'multiclass',
 'multioutput',
 'naive_bayes',
 'neighbors',
 'neural_network',
 'pipeline',
 'preprocessing',
 'random_projection',
 'semi_supervised',
 'svm',
 'tree',
 'discriminant_analysis',
 'impute',
 'compose',
 'clone',
 'get_config',
 'set_config',
 'config_context',
 'show_versions']

In [4]:
# Exploratory lookup: list every attribute of sklearn.model_selection.
# dir() also returns dunder and underscore-prefixed names, not only the public API.
dir(sklearn.model_selection)

['BaseCrossValidator',
 'BaseShuffleSplit',
 'GridSearchCV',
 'GroupKFold',
 'GroupShuffleSplit',
 'KFold',
 'LeaveOneGroupOut',
 'LeaveOneOut',
 'LeavePGroupsOut',
 'LeavePOut',
 'ParameterGrid',
 'ParameterSampler',
 'PredefinedSplit',
 'RandomizedSearchCV',
 'RepeatedKFold',
 'RepeatedStratifiedKFold',
 'ShuffleSplit',
 'StratifiedGroupKFold',
 'StratifiedKFold',
 'StratifiedShuffleSplit',
 'TimeSeriesSplit',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_search',
 '_split',
 '_validation',
 'check_cv',
 'cross_val_predict',
 'cross_val_score',
 'cross_validate',
 'learning_curve',
 'permutation_test_score',
 'train_test_split',
 'typing',
 'validation_curve']

In [5]:
# Task: decide whether each shipment reached the customer on time.
# Read the training features, the training labels and the unlabelled test
# features, in that order, straight from the hosted CSV files.
x_train, y_train, x_test = map(
    pd.read_csv,
    (
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/y_train.csv",
        "https://raw.githubusercontent.com/Datamanim/datarepo/main/shipping/X_test.csv",
    ),
)

# Show (rows, columns) for the three frames as a quick sanity check.
x_train.shape, y_train.shape, x_test.shape


((6598, 11), (6598, 2), (4401, 11))

In [6]:
# Column dtypes and non-null counts for the training features.
# NOTE(review): Customer_care_calls is reported as object dtype although the
# previewed rows show digits — it may contain non-numeric entries; confirm
# before treating it as a number.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   6598 non-null   int64 
 1   Warehouse_block      6598 non-null   object
 2   Mode_of_Shipment     6598 non-null   object
 3   Customer_care_calls  6598 non-null   object
 4   Customer_rating      6598 non-null   int64 
 5   Cost_of_the_Product  6598 non-null   int64 
 6   Prior_purchases      6598 non-null   int64 
 7   Product_importance   6598 non-null   object
 8   Gender               6598 non-null   object
 9   Discount_offered     6598 non-null   int64 
 10  Weight_in_gms        6598 non-null   int64 
dtypes: int64(6), object(5)
memory usage: 567.1+ KB


In [7]:
# Count missing values per column of the training features.
x_train.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
dtype: int64

In [8]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms
0,6045,A,Flight,4,3,266,5,high,F,5,1590
1,44,F,Ship,3,1,174,2,low,M,44,1556
2,7940,F,Road,4,1,154,10,high,M,10,5674
3,1596,F,Ship,4,3,158,3,medium,F,27,1207
4,4395,A,Flight,5,3,175,3,low,M,7,4833


In [35]:
# Preview the first rows of the labels: an ID column plus the
# Reached.on.Time_Y.N target.
y_train.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
0,6045,0
1,44,1
2,7940,1
3,1596,1
4,4395,1


In [9]:
# ID is excluded from the model inputs; it is only carried along to key the
# prediction files written at the end of the notebook.
drop_col = ['ID']
x_train_drop = x_train.drop(columns=drop_col)
x_test_drop = x_test.drop(columns=drop_col)

# One-hot encode the object-dtype columns; numeric columns pass through as-is.
x_train_dummies = pd.get_dummies(x_train_drop)
x_test_dummies = pd.get_dummies(x_test_drop)

# Train and test are encoded independently, so their dummy columns can differ
# whenever a category value occurs in only one of the two files. Align the test
# frame to the training layout: indicator columns missing from test are added
# as all zeros, columns never seen in training are dropped, and the column
# order matches the frame the model is fitted on. Plain column selection
# (x_test_dummies[x_train_dummies.columns]) would raise KeyError in the
# missing-column case; when every column is present the result is identical.
x_test_dummies = x_test_dummies.reindex(columns=x_train_dummies.columns, fill_value=0)


In [11]:
# Pull the target out as a Series, leaving y_train's ID column behind.
# NOTE(review): this relies on y_train rows being in the same order as x_train
# (the previewed IDs match) — confirm for the full files.
y = y_train.loc[:, 'Reached.on.Time_Y.N']

In [12]:
# Hold out 33% of the labelled rows for validation; the seed pins the shuffle.
# Mind the capitalisation: X_train / X_test are splits of the labelled data and
# are different objects from the lowercase x_train / x_test frames read from CSV.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train_dummies,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Fit a random forest on the training split. Only the seed is set; every other
# hyperparameter is left at the library default.
rf = RandomForestClassifier(random_state=23).fit(X_train, Y_train)
rf

RandomForestClassifier(random_state=23)

In [29]:
# Score the rows the forest was fitted on (inputs to the train-side metrics).
# Column 1 of predict_proba is the second entry of rf.classes_ — presumably
# label 1, given the 0/1 targets in the label preview; confirm.
X_train_prob = rf.predict_proba(X_train)[:, 1]
X_train_pd = rf.predict(X_train)

In [30]:
# Score the held-out validation split (inputs to the test-side metrics).
# Column 1 of predict_proba is the second entry of rf.classes_ — presumably
# label 1, given the 0/1 targets in the label preview; confirm.
X_test_prob = rf.predict_proba(X_test)[:, 1]
X_test_pd = rf.predict(X_test)

In [21]:
# Accuracy on the fitted rows vs. the held-out rows. The recorded output shows
# a perfect training score next to a much lower held-out score, i.e. the forest
# fits the training split far better than it generalises.
(
    accuracy_score(y_true=Y_train, y_pred=X_train_pd),
    accuracy_score(y_true=Y_test, y_pred=X_test_pd),
)

(1.0, 0.6395775941230487)

In [23]:
# F1 score on the fitted rows vs. the held-out rows.
(
    f1_score(y_true=Y_train, y_pred=X_train_pd),
    f1_score(y_true=Y_test, y_pred=X_test_pd),
)

(1.0, 0.6744089589382)

In [25]:
# Recall on the fitted rows vs. the held-out rows.
(
    recall_score(y_true=Y_train, y_pred=X_train_pd),
    recall_score(y_true=Y_test, y_pred=X_test_pd),
)

(1.0, 0.630721489526765)

In [26]:
# Precision on the fitted rows vs. the held-out rows.
(
    precision_score(y_true=Y_train, y_pred=X_train_pd),
    precision_score(y_true=Y_test, y_pred=X_test_pd),
)

(1.0, 0.7245989304812834)

In [31]:
# ROC AUC on the fitted rows vs. the held-out rows; this metric takes the
# predicted probabilities rather than the hard labels.
(
    roc_auc_score(y_true=Y_train, y_score=X_train_prob),
    roc_auc_score(y_true=Y_test, y_score=X_test_prob),
)

(1.0, 0.7261997118475008)

In [33]:
# Score the unlabelled test features: hard labels plus the probability column
# at index 1. rf is the forest fitted on the training split only; the visible
# cells do not refit it on the full labelled set before this step.
x_test_dummies_prob = rf.predict_proba(x_test_dummies)[:, 1]
x_test_dummies_pd = rf.predict(x_test_dummies)

In [38]:
# Pair each original test ID with its predicted label, write the pairs to CSV
# without the index column, then preview the first rows.
df = x_test[['ID']].assign(**{'Reached.on.Time_Y.N': x_test_dummies_pd})
df.to_csv('03result.csv', index=False)
df.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
0,6811,1
1,4320,0
2,5732,0
3,7429,1
4,2191,1


In [39]:
# Same layout as the label file, but the target column holds the predicted
# probability instead of the hard label. This rebinds df, so the frame built
# for the label file is no longer reachable under that name.
df = x_test[['ID']].assign(**{'Reached.on.Time_Y.N': x_test_dummies_prob})
df.to_csv('03result_prob.csv', index=False)
df.head()

Unnamed: 0,ID,Reached.on.Time_Y.N
0,6811,0.53
1,4320,0.39
2,5732,0.38
3,7429,0.52
4,2191,1.0
