In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from imblearn.under_sampling import RandomUnderSampler
import statsmodels.api as sm

### 데이터 읽기 및 확인

In [2]:
df = pd.read_csv("./data/E_commerce.csv")
df.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [3]:
df.shape

(10999, 12)

In [4]:
df.dtypes

ID                      int64
Warehouse_block        object
Mode_of_Shipment       object
Customer_care_calls     int64
Customer_rating         int64
Cost_of_the_Product     int64
Prior_purchases         int64
Product_importance     object
Gender                 object
Discount_offered        int64
Weight_in_gms           int64
Reached.on.Time_Y.N     int64
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


In [6]:
df.describe()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
count,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0
mean,5500.0,4.054459,2.990545,210.196836,3.567597,13.373216,3634.016729,0.596691
std,3175.28214,1.14149,1.413603,48.063272,1.52286,16.205527,1635.377251,0.490584
min,1.0,2.0,1.0,96.0,2.0,1.0,1001.0,0.0
25%,2750.5,3.0,2.0,169.0,3.0,4.0,1839.5,0.0
50%,5500.0,4.0,3.0,214.0,3.0,7.0,4149.0,1.0
75%,8249.5,5.0,4.0,251.0,4.0,10.0,5050.0,1.0
max,10999.0,7.0,5.0,310.0,10.0,65.0,7846.0,1.0


In [7]:
df["Reached.on.Time_Y.N"].value_counts()

Reached.on.Time_Y.N
1    6563
0    4436
Name: count, dtype: int64

In [8]:
df["Mode_of_Shipment"].value_counts()

Mode_of_Shipment
Ship      7462
Flight    1777
Road      1760
Name: count, dtype: int64

In [9]:
df["Warehouse_block"].value_counts()

Warehouse_block
F    3666
D    1834
A    1833
B    1833
C    1833
Name: count, dtype: int64

In [10]:
df["Gender"].value_counts()

Gender
F    5545
M    5454
Name: count, dtype: int64

In [11]:
df["Product_importance"].value_counts()

Product_importance
low       5297
medium    4754
high       948
Name: count, dtype: int64

### 미사용 피처 제거

In [12]:
ndf = df.drop(["ID", "Gender"], axis = 1)

In [13]:
ndf.head()

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,D,Flight,4,2,177,3,low,44,1233,1
1,F,Flight,4,5,216,2,low,59,3088,1
2,A,Flight,2,2,183,4,low,48,3374,1
3,B,Flight,3,3,176,4,medium,10,1177,1
4,C,Flight,2,2,184,3,medium,46,2484,1


### 범주형 데이터 처리

In [14]:
ndf_obj = ndf[["Warehouse_block", "Mode_of_Shipment", "Product_importance"]]

In [15]:
ndf = ndf.drop(["Warehouse_block", "Mode_of_Shipment", "Product_importance"], axis = 1)

In [16]:
ndf_obj = pd.concat([pd.get_dummies(ndf_obj["Warehouse_block"], dtype = "int64"),
          pd.get_dummies(ndf_obj["Mode_of_Shipment"], dtype = "int64"),
          pd.get_dummies(ndf_obj["Product_importance"], dtype = "int64")], axis = 1)

### 데이터 분할

In [17]:
x = pd.concat([ndf, ndf_obj], axis = 1).drop("Reached.on.Time_Y.N", axis = 1)
y = ndf["Reached.on.Time_Y.N"]

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, stratify = y, random_state = 10)

In [19]:
len(x_train), len(x_test)

(8249, 2750)

### 랜덤언더샘플링 적용

In [20]:
x_train_re = x_train.copy()
y_train_re = y_train.copy()

x_tmp_name = [f"X{i}" for i in range(1, 18)]
y_tmp_name = ["y1"]

In [21]:
x_train_re.columns = x_tmp_name
y_train_re.columns = y_tmp_name

In [22]:
x_train_under, y_train_under = RandomUnderSampler(random_state = 11).fit_resample(x_train_re, y_train_re)

In [23]:
print("랜덤샘플러 적용 전", x_train_re.shape, y_train_re.shape)
print("랜덤샘플러 적용 후", x_train_under.shape, y_train_under.shape)

랜덤샘플러 적용 전 (8249, 17) (8249,)
랜덤샘플러 적용 후 (6654, 17) (6654,)


In [24]:
x_train_under.columns = list(x_train)
y_train_under.columns = list(y_train)

### 스케일링 적용

In [25]:
scale_cols = x_train_under.iloc[:, :6].columns.tolist()

In [26]:
ss = StandardScaler()
scaled_train = pd.DataFrame(ss.fit_transform(x_train_under[scale_cols]), columns = scale_cols)
scaled_test = pd.DataFrame(ss.transform(x_test[scale_cols]), columns = scale_cols)

In [27]:
scaled_train = pd.concat([scaled_train, x_train_under.drop(scale_cols, axis = 1).reset_index(drop = True)], axis = 1)
scaled_test = pd.concat([scaled_test, x_test.drop(scale_cols, axis = 1).reset_index(drop = True)], axis = 1)

In [28]:
scaled_train.columns

Index(['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms', 'A', 'B', 'C',
       'D', 'F', 'Flight', 'Road', 'Ship', 'high', 'low', 'medium'],
      dtype='object')

In [29]:
scaled_test.columns

Index(['Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product',
       'Prior_purchases', 'Discount_offered', 'Weight_in_gms', 'A', 'B', 'C',
       'D', 'F', 'Flight', 'Road', 'Ship', 'high', 'low', 'medium'],
      dtype='object')

### 모델 학습 및 평가

In [30]:
logi = LogisticRegression()
logi.fit(scaled_train, y_train_under)

In [31]:
print(logi.score(scaled_train, y_train_under))
print(logi.score(scaled_test, y_test))

0.6896603546738804
0.6432727272727272


### CSV 파일 생성

In [40]:
logi.predict(scaled_test)

array([1, 1, 1, ..., 0, 1, 0], dtype=int64)

In [32]:
pred_proba = pd.DataFrame(logi.predict_proba(scaled_test).reshape(-1, 2))

In [33]:
pred_proba = pred_proba.drop(1, axis = 1)

In [34]:
pred_proba

Unnamed: 0,0
0,0.446429
1,0.371544
2,0.313840
3,0.424459
4,0.510290
...,...
2745,0.707488
2746,0.702992
2747,0.751413
2748,0.472191


In [35]:
result = pd.concat([pd.Series(y_test.index), pred_proba], axis = 1)

In [36]:
result.columns = ["테스트데이터의 ID", "정시도착 여부 예측확률"]

In [37]:
result

Unnamed: 0,테스트데이터의 ID,정시도착 여부 예측확률
0,5778,0.446429
1,10985,0.371544
2,1225,0.313840
3,6502,0.424459
4,7337,0.510290
...,...,...
2745,8656,0.707488
2746,6337,0.702992
2747,4695,0.751413
2748,5932,0.472191


In [38]:
result.to_csv("./3.28시험_황상일.csv")

In [39]:
pd.read_csv("./3.28시험_황상일.csv")

Unnamed: 0.1,Unnamed: 0,테스트데이터의 ID,정시도착 여부 예측확률
0,0,5778,0.446429
1,1,10985,0.371544
2,2,1225,0.313840
3,3,6502,0.424459
4,4,7337,0.510290
...,...,...,...
2745,2745,8656,0.707488
2746,2746,6337,0.702992
2747,2747,4695,0.751413
2748,2748,5932,0.472191
