In [70]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [52]:
# moons dataset
from sklearn.datasets import make_moons


# Seed the generator so the synthetic two-moons sample (and every score
# computed from it) is identical on each Restart & Run All.
X, y = make_moons(n_samples=500, noise=0.15, random_state=42)
# Hold out 20% of the samples for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [300]:
def display_score(y_test, y_pred):
    """Print the F1, ROC AUC and accuracy of ``y_pred`` measured against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted values; passed unchanged to each metric function.

    Returns
    -------
    None
        The three scores are printed, one per line, and nothing is returned.
    """
    # The metric functions are resolved from the notebook namespace at call
    # time, so they must have been imported before the first call.
    f1, roc, acc = (
        metric(y_test, y_pred)
        for metric in (f1_score, roc_auc_score, accuracy_score)
    )
    print(f'f1: {f1}\nroc: {roc}\nacc: {acc}')

In [253]:
# Directory containing the exam sample CSVs, relative to the working directory.
example_path = './datasets/exam_sample'
# Read the three CSVs in order: training features, training labels, and the
# x_test.csv features.
# NOTE(review): this rebinds X_train / y_train, which the moons cell also
# assigns, so cells that expect the moons arrays depend on execution order.
X_train, y_train, target_x_test = map(
    pd.read_csv,
    (
        f'{example_path}/x_train.csv',
        f'{example_path}/y_train.csv',
        f'{example_path}/x_test.csv',
    ),
)

In [278]:
# Inspect the label frame read from y_train.csv (columns: cust_id, gender).
y_train

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
3495,3495,1
3496,3496,1
3497,3497,0
3498,3498,0


In [254]:
# Summary statistics of the numeric feature columns. The count row shows which
# columns have missing values (a count below the 3500 rows).
X_train.describe()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,내점일수,내점당구매건수,주말방문비율,구매주기
count,3500.0,3500.0,3500.0,1205.0,3500.0,3500.0,3500.0,3500.0
mean,1749.5,91919250.0,19664240.0,24078220.0,19.253714,2.834963,0.307246,20.958286
std,1010.507298,163506500.0,31992350.0,47464530.0,27.174942,1.912368,0.289752,24.748682
min,0.0,-52421520.0,-2992000.0,5600.0,1.0,1.0,0.0,0.0
25%,874.75,4747050.0,2875000.0,2259000.0,2.0,1.666667,0.027291,4.0
50%,1749.5,28222700.0,9837000.0,7392000.0,8.0,2.333333,0.25641,13.0
75%,2624.25,106507900.0,22962500.0,24120000.0,25.0,3.375,0.44898,28.0
max,3499.0,2323180000.0,706629000.0,563753000.0,285.0,22.083333,1.0,166.0


In [255]:
# Column dtypes and non-null counts: used to spot the object-typed columns
# that need encoding and the columns that need imputation.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  3500 non-null   int64  
 1   총구매액     3500 non-null   int64  
 2   최대구매액    3500 non-null   int64  
 3   환불금액     1205 non-null   float64
 4   주구매상품    3500 non-null   object 
 5   주구매지점    3500 non-null   object 
 6   내점일수     3500 non-null   int64  
 7   내점당구매건수  3500 non-null   float64
 8   주말방문비율   3500 non-null   float64
 9   구매주기     3500 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 273.6+ KB


In [287]:
import copy
# Build a new frame (X_train itself is left untouched) holding the features
# plus the target column; the labels are matched to rows by index.
train_df = X_train.assign(gender=y_train['gender'])

In [246]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 shuffle split on the target, seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) index pair directly instead of
# looping over a one-element generator.
train_index, test_index = next(split.split(train_df, train_df['gender']))

# split() yields positional row indices, so select with .iloc. Using .loc
# would treat the integers as index labels, which is only correct while the
# frame has a default 0..n-1 index.
strat_train_set = train_df.iloc[train_index]
strat_test_set = train_df.iloc[test_index]


In [248]:
# Separate features from the target for both stratified subsets. cust_id is
# dropped together with the target column.
# NOTE(review): this rebinds train_df to a frame without 'gender', so the
# stratified-split cell cannot be rerun afterwards without rebuilding train_df.
train_df = strat_train_set.drop(columns=['cust_id', 'gender'])
test_df = strat_test_set.drop(columns=['cust_id', 'gender'])

# Labels are copied so they are independent of the subset frames.
train_df_labels = strat_train_set['gender'].copy()
test_df_labels = strat_test_set['gender'].copy()

In [262]:
# Use the complete training data: features without cust_id and 주구매지점,
# and the full gender column as labels.
# NOTE(review): this overwrites the train_df / train_df_labels produced from
# the stratified subset, so from here on no rows are held out.
train_df = X_train.drop(columns=['cust_id', '주구매지점'])
train_df_labels = y_train.loc[:, 'gender'].copy()

In [279]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric columns: imputed, then standardized.
metric_columns = ['총구매액','최대구매액','환불금액','내점일수','내점당구매건수','주말방문비율','구매주기']
# Only 주구매상품 is one-hot encoded. 주구매지점 is deliberately not listed;
# columns not named in the ColumnTransformer are dropped by default.
cat_columns = ['주구매상품']

metric_pipeline = Pipeline([
    # Missing numeric values are replaced with 0.
    # NOTE(review): presumably a missing 환불금액 means "no refund" - confirm.
    ('fillna', SimpleImputer(strategy='constant', fill_value=0)),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    # handle_unknown='ignore' encodes categories that were not seen during
    # fit as all-zero rows instead of raising, so the fitted pipeline can be
    # applied to held-out data containing new category values.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

full_pipeline = ColumnTransformer([
    ('metric', metric_pipeline, metric_columns),
    ('cat', cat_pipeline, cat_columns)
])

# Fit the preprocessing on the training features only.
prepared_train_df = full_pipeline.fit_transform(train_df)
# NOTE: held-out data must go through full_pipeline.transform(...), never
# fit_transform, so that imputation, scaling and encoding learned from the
# training data are reused and the output columns line up.

In [268]:
from sklearn.ensemble import RandomForestClassifier

# 500 shallow trees (at most 16 leaves each), trained on all cores.
# random_state is fixed so the fitted forest, and the scores derived from it,
# are reproducible across runs.
rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
rnd_clf.fit(prepared_train_df, train_df_labels)



RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [305]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares fitted directly on the gender labels.
lin_reg = LinearRegression().fit(prepared_train_df, train_df_labels)

# Predictions on the same rows the model was fitted on, so the RMSE below is
# a training error, not a generalization estimate.
lin_y_pred = lin_reg.predict(prepared_train_df)

# display_score is not applied here: the regression output is continuous,
# not a class label.
lin_mse = mean_squared_error(train_df_labels, lin_y_pred)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.461498102509965

In [322]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from  sklearn.svm import SVC

# Three different base learners combined by majority vote.
log_clf = LogisticRegression()
# The forest is the stochastic member of the ensemble; seeding it makes the
# ensemble's predictions reproducible across runs.
# NOTE(review): this rebinds rnd_clf, replacing the forest fitted earlier
# under the same name.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)
    ],
    # 'hard' voting: the predicted class is the one chosen by most estimators.
    voting='hard'
)
voting_clf.fit(prepared_train_df, train_df_labels)


VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [324]:
from sklearn.metrics import accuracy_score
# Fit each base learner and the ensemble, then report accuracy per model.
# NOTE(review): predictions are made on the same rows used for fitting, so
# these are training accuracies and will look optimistic for flexible models.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # fit() returns the estimator itself, so fitting and predicting can be chained.
    y_pred = clf.fit(prepared_train_df, train_df_labels).predict(prepared_train_df)
    print(type(clf).__name__, accuracy_score(train_df_labels, y_pred))

LogisticRegression 0.6625714285714286
RandomForestClassifier 0.9994285714285714
SVC 0.6874285714285714
VotingClassifier 0.7154285714285714


In [269]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the forest. The scorer returns the negated MSE
# (higher is better), hence the sign flip before taking the square root.
# With 0/1 labels and 0/1 predictions the MSE equals the misclassification
# rate, so these values are the square root of the per-fold error rate.
scores = cross_val_score(
    rnd_clf, prepared_train_df, train_df_labels,
    scoring='neg_mean_squared_error', cv=10,
)
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print summary statistics for a collection of scores.

    Parameters
    ----------
    scores : array-like
        Values to summarize, e.g. one score per cross-validation fold.

    Returns
    -------
    None
        The ``DataFrame.describe()`` table of ``scores`` is printed.
    """
    summary = pd.DataFrame(scores).describe()
    print(summary)
# Summarize the per-fold scores (count, mean, std, quartiles).
display_scores(tree_rmse_scores)    

               0
count  10.000000
mean    0.585900
std     0.021998
min     0.547723
25%     0.575682
50%     0.586757
75%     0.603557
max     0.614120


In [284]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# Bagging of 500 decision trees, each trained on a bootstrap sample of 100
# rows. oob_score=True scores every row using only the trees that did not see
# it. random_state is fixed so the OOB score and the metrics below are
# reproducible across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True,
    random_state=42,
)

bag_clf.fit(prepared_train_df, train_df_labels)
# Out-of-bag accuracy; the printed label means "validation set result".
print('검증세트 결과: ',bag_clf.oob_score_)
# NOTE(review): these predictions are on the rows used for fitting, so the
# scores printed by display_score are training-set scores.
y_pred = bag_clf.predict(prepared_train_df)
display_score(train_df_labels, y_pred)

검증세트 결과:  0.648
f1: 0.4453240969816922
roc: 0.6125935234977788
acc: 0.6797142857142857


In [270]:

# Score the forest on the rows it was fitted on (training-set scores).
# NOTE(review): rnd_clf is assigned in several cells, so which model is scored
# here depends on the order the cells were executed in.
y_pred_rf = rnd_clf.predict(prepared_train_df)
display_score(train_df_labels,y_pred_rf)

f1: 0.43939393939393945
roc: 0.6128468163042631
acc: 0.6828571428571428


In [75]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

# Bagging of 500 decision trees, each trained on a bootstrap sample of 100
# rows, with out-of-bag scoring enabled. random_state is fixed so the OOB
# score and the held-out metrics are reproducible across runs.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True,
    random_state=42,
)

# NOTE(review): X_train / y_train must be the moons split here. The CSV
# loading cell assigns the same names, so this cell depends on execution order.
bag_clf.fit(X_train, y_train)
# Out-of-bag accuracy; the printed label means "validation set result".
print('검증세트 결과: ',bag_clf.oob_score_)
# Evaluate on the held-out part of the split.
y_pred = bag_clf.predict(X_test)
display_score(y_test, y_pred)


검증세트 결과:  0.9625
f1: 0.9320388349514563
roc: 0.93
acc: 0.93


In [76]:
from sklearn.ensemble import RandomForestClassifier

# 500 shallow trees (at most 16 leaves each), trained on all cores.
# random_state is fixed so the scores and feature importances below are
# reproducible across runs.
rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42
)
# NOTE(review): X_train / y_train must be the moons split here. The CSV
# loading cell assigns the same names, so this cell depends on execution order.
rnd_clf.fit(X_train, y_train)

# Evaluate on the held-out part of the split.
y_pred_rf = rnd_clf.predict(X_test)
display_score(y_test,y_pred_rf)

# Importance of each input feature, labelled '1' and '2' in column order.
for name, score in zip(['1','2'], rnd_clf.feature_importances_):
    print(name, score)


f1: 0.970873786407767
roc: 0.97
acc: 0.97
1 0.43556808985407874
2 0.5644319101459212
