# 문제4 - ML 모델링
- 아래 E-Commerce Shipping Data의 train set을 참조하여 고객이 주문한 물품의 정시 도착 여부를 예측하시오
- ID와 예측치는 csv 파일로 저장하시오 (EC_result.csv)

- **(주의사항) ML 모델에서 X값은 rank-2 numpy 배열이나 DataFrame만 들어간다.**
- **(주의사항) ML 모델에서 y값은 rank-1 numpy 배열이나 Series만 들어간다.** 
- **(주의사항) ML 모델에서 X, y값으로 list는 사용할 수 없다.**

In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import os

# Directory holding the three CSV files; the trailing slash is part of the
# string so that a file name can simply be appended to it.
path = '/Users/jsha/gjai/python_basic/pytest_basic/'

# Load the training features, the training labels and the test features.
X_train, y_train, X_test = (
    pd.read_csv(path + file_name)
    for file_name in ('EC_X_train.csv', 'EC_y_train.csv', 'EC_X_test.csv')
)

# Report (rows, columns) of each frame as a quick sanity check.
for frame in (X_train, y_train, X_test):
    print(frame.shape)

(8799, 11)
(8799, 2)
(2200, 11)


In [133]:
# Order both training frames by 'ID'.
# Rows of X_train and y_train must pair up one-to-one, so sort both the same
# way just in case the files were not written in the same order.
X_train, y_train = (df.sort_values(by='ID') for df in (X_train, y_train))

# Columns stored with dtype `object` are treated as the categorical features.
object_mask = X_train.dtypes == object
categorical_cols = X_train.columns[object_mask]

# Show the frequency of every category, one column at a time.
separator = '-' * 50
for col in categorical_cols:
    print(X_train[col].value_counts())
    print(separator)

F    2941
A    1489
B    1465
C    1458
D    1446
Name: Warehouse_block, dtype: int64
--------------------------------------------------
Ship      5913
Flight    1449
Road      1437
Name: Mode_of_Shipment, dtype: int64
--------------------------------------------------
low       4247
medium    3783
high       769
Name: Product_importance, dtype: int64
--------------------------------------------------
F    4456
M    4343
Name: Gender, dtype: int64
--------------------------------------------------


In [154]:
# categorical columns --> one-hot encoding
# pd.get_dummies returns its result as a DataFrame.
X_train_df = pd.get_dummies(X_train, columns=categorical_cols)
X_test_df = pd.get_dummies(X_test, columns=categorical_cols)

# The two frames are encoded independently, so a category that is missing
# from one of them would produce a different set (or order) of dummy columns
# and break the scaler/model fitted on the training layout.  Reindex the test
# frame onto the training columns: dummies absent from the test data are
# filled with 0, and columns never seen during training are dropped.
X_test_df = X_test_df.reindex(columns=X_train_df.columns, fill_value=0)

print(type(X_train_df))
print(X_train_df.shape)
print(X_test_df.shape)

<class 'pandas.core.frame.DataFrame'>
(8799, 20)
(2200, 20)


In [155]:
# Normalise the features with min-max scaling.
# The scaler is fitted on the training frame only and then applied to both
# frames; transform() hands back numpy ndarrays, not DataFrames.
mms = MinMaxScaler().fit(X_train_df)
X_train_scaled, X_test_scaled = (
    mms.transform(encoded) for encoded in (X_train_df, X_test_df)
)
print(type(X_train_scaled))

<class 'numpy.ndarray'>


## 모델1. LogisticRegression

In [139]:
# Model 1: logistic regression -- fit on the training data, then predict the
# test set.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Column 0 of the scaled arrays is left out of the feature matrix
# (presumably the scaled 'ID' column -- confirm against the CSV layout).
# The target is the last column of y_train.
train_features = X_train_scaled[:, 1:]
test_features = X_test_scaled[:, 1:]
train_target = y_train.iloc[:, -1]

lr = LogisticRegression()
lr.fit(train_features, train_target)
# Score is computed on the same data the model was fitted on.
print(lr.score(train_features, train_target))
y_test_pred = lr.predict(test_features)

0.6401863848164564


In [137]:
# Pair every test ID with its predicted label.  The IDs come from X_test
# (the frame the predictions were made from); both columns are passed as
# plain arrays so they are matched by position rather than by index label.
result_df = pd.DataFrame({
    'ID': X_test['ID'].to_numpy(),
    'predict_reached_on_time': y_test_pred,
})

# index=False is a to_csv option: it keeps the row index out of the file so
# the CSV contains only the ID and the prediction.
result_df.to_csv('EC_result.csv', index=False)

# os.path.exists takes only the path; confirm the file was actually written.
if os.path.exists('EC_result.csv'):
    print('yes. good!!')

yes. good!!


## 모델2. SVC
- svm의 svc

In [140]:
# Model 2: support vector classifier, scored on the training data.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Features: every scaled column except column 0; target: last column of
# y_train.
svc_features = X_train_scaled[:, 1:]
svc_target = y_train.iloc[:, -1]

# fit() returns the estimator itself, so construction and fitting can chain.
svc = SVC().fit(svc_features, svc_target)
print(svc.score(svc_features, svc_target))

0.6896238208887373


## 모델3. SGDClassifier
- Stochastic Gradient Descent

In [141]:
# Model 3: linear classifier trained with stochastic gradient descent,
# scored on the training data.
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

# Features: every scaled column except column 0; target: last column of
# y_train.
sgd_features = X_train_scaled[:, 1:]
sgd_target = y_train.iloc[:, -1]

# No random_state is set, so the printed score may vary between runs.
sgdc = SGDClassifier().fit(sgd_features, sgd_target)
print(sgdc.score(sgd_features, sgd_target))

0.6608705534719854


## 모델4. DecisionTreeClassifier
- overfitting이 될것.
- 이를 막기 위해 max_depth 옵션으로 트리의 깊이를 제한한다. (아래 코드에서는 max_depth=3 사용)

In [153]:
# Model 4: decision tree, scored on the training data.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Features: every scaled column except column 0; target: last column of
# y_train.
tree_features = X_train_scaled[:, 1:]
tree_target = y_train.iloc[:, -1]

# The depth is capped at 3 to keep the tree from overfitting.
dtc = DecisionTreeClassifier(max_depth=3).fit(tree_features, tree_target)
print(dtc.score(tree_features, tree_target))

0.6820093192408229


## 모델6. AdaBoostClassifier
- ensemble 모델
- https://scikit-learn.org/stable/modules/ensemble.html#adaboost

In [None]:
# Model 6: AdaBoost ensemble, evaluated with 5-fold cross-validation instead
# of the training score.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Features: every scaled column except column 0; target: last column of
# y_train.
boost_features = X_train_scaled[:, 1:]
boost_target = y_train.iloc[:, -1]

abc = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(abc, boost_features, boost_target, cv=5)

# Last expression of the cell: the notebook displays the mean fold score.
scores.mean()

## 모델7. RandomForest
- ensemble 모델
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=randomforest#sklearn.ensemble.RandomForestClassifier

In [162]:
# Model 7: random forest, evaluated with 5-fold cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Imported here so this cell runs on its own instead of depending on the
# AdaBoost cell having been executed first.
from sklearn.model_selection import cross_val_score

# The model is evaluated on the e-commerce training data, so no synthetic
# data set is generated here.
# Features: every scaled column except column 0; target: last column of
# y_train.
forest_features = X_train_scaled[:, 1:]
forest_target = y_train.iloc[:, -1]

# Shallow trees (max_depth=2) and a fixed seed for repeatable results.
rfc = RandomForestClassifier(max_depth=2, random_state=0)
scores = cross_val_score(rfc, forest_features, forest_target, cv=5)

# Last expression of the cell: the notebook displays the mean fold score.
scores.mean()

0.5984773760917876