In [1]:
import numpy as np
import pandas as pd
import random 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
def makeData(columns, datas, count):
    """Build a synthetic lunch-satisfaction table with ``count`` random rows.

    Parameters
    ----------
    columns : list of str
        Column labels (and column order) of the returned DataFrame.
    datas : dict
        Maps 'week', 'weather', 'restorant' and 'result' to the list of
        candidate values for that column. Lists may have any non-zero length.
    count : int
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
        One row per sample. 'temperate' is a random integer in [5, 35];
        every other field is drawn uniformly from its candidate list.
    """
    # Collect plain dicts and build the frame once; appending with
    # df.loc[len(df)] re-allocates on every row and leaves numeric columns
    # with object dtype.
    records = [
        {
            # random.choice follows the actual list length instead of a
            # hard-coded upper index, so the candidate lists can change size.
            'week': random.choice(datas['week']),
            'weather': random.choice(datas['weather']),
            'temperate': random.randint(5, 35),
            'restorant': random.choice(datas['restorant']),
            'result': random.choice(datas['result']),
        }
        for _ in range(count)
    ]
    return pd.DataFrame(records, columns=columns)

In [3]:
# Schema of the synthetic table and the number of rows to generate
columns = ['week','weather','temperate','restorant','result']
count = 200

# Candidate values for every categorical column
datas = {
    'week': ['월','화','수','목','금'],
    'weather': ['맑음','흐림','비'],
    'restorant': ['칼국수','대구탕','햄버거','김치찌개','냉면'],
    'result': ['만족','불만족'],
}

# makeData draws from the stdlib `random` module; seed it so that
# Restart & Run All regenerates the same table (and the same metrics).
random.seed(1)
data_df = makeData(columns, datas, count)

In [4]:
# select columns
feature_names = ['week','weather','temperate','restorant']
target_name = ["result"]

# Keep only the modelling columns, drop rows with any NaN, and only then
# rebuild the index. Resetting after the drop (and only once) guarantees
# that the feature and target frames below share the same 0..n-1 index.
data_df = (
    data_df[feature_names + target_name]
    .dropna(axis=0, how="any")
    .reset_index(drop=True)
)

# split feature & target
# dfX is an explicit copy because later cells assign new columns to it.
dfX = data_df[feature_names].copy()
dfy = data_df[target_name]

In [5]:
# Replace each categorical feature column with the integer codes that a
# freshly fitted LabelEncoder assigns to its values.
label_list = ['week','weather','restorant']
for label in label_list:
    encoder = LabelEncoder()
    dfX[label] = encoder.fit_transform(dfX[label])

In [6]:
# OneHotEncoding
# Each listed column is replaced by one indicator column per category,
# named "<column>_<idx>", appended on the right of dfX.
onehot_list = ['week','weather','restorant']
for onehot in onehot_list:
    # OneHotEncoder expects a 2-D (n_samples, 1) array. Series.as_matrix()
    # was removed in pandas 1.0, so use .values instead.
    encoded = OneHotEncoder().fit_transform(
        dfX[onehot].values[:, np.newaxis]).toarray()
    # Size the names from the encoder output rather than from data_df, and
    # without rebinding the notebook-level `count` (the row count).
    column_list = [onehot + "_" + str(idx) for idx in range(encoded.shape[1])]
    dfX2 = pd.DataFrame(encoded, columns=column_list, index=dfX.index)
    # Drop the source column and append its indicator columns in one step.
    dfX = pd.concat([dfX.drop(onehot, axis=1), dfX2], axis=1)

In [7]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    dfX, dfy,
    test_size=0.2,
    random_state=1,
)

# Entropy-criterion decision tree, limited to depth 14 and at least 10
# samples per leaf, fitted on the training split.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=14,
    min_samples_leaf=10,
)
model.fit(X_train, y_train)

In [8]:
# Predict once on the held-out rows and reuse the result for every metric.
y_pred = model.predict(X_test)

print("accuracy score : {}".format(accuracy_score(y_test, y_pred)))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy score : 0.45
[[ 9  8]
 [14  9]]
             precision    recall  f1-score   support

         만족       0.39      0.53      0.45        17
        불만족       0.53      0.39      0.45        23

avg / total       0.47      0.45      0.45        40

