# 의사결정나무 예제

## 타이타닉 데이터

In [None]:
import pandas as pd
import numpy as np
from statsmodels.graphics.mosaicplot import mosaic
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
# Read the Titanic dataset from the working directory and display it.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")
titanic

### Exploratory analysis: table

In [None]:
# Contingency table: survival outcome (rows) against sex (columns).
pd.crosstab(index=titanic['Survived'], columns=titanic['Sex'])

In [None]:
# Contingency table: survival outcome (rows) against class (columns).
pd.crosstab(index=titanic['Survived'], columns=titanic['Class'])

In [None]:
# Contingency table: survival outcome (rows) against age group (columns).
pd.crosstab(index=titanic['Survived'], columns=titanic['Age'])

### Exploratory analysis: mosaic plot

In [None]:
# Mosaic plot of Age split by Survived.
mosaic(data=titanic, index=["Age", "Survived"])

In [None]:
# Mosaic plot of Sex split by Survived.
mosaic(data=titanic, index=["Sex", "Survived"])

In [None]:
# Mosaic plot of Class split by Survived.
mosaic(data=titanic, index=["Class", "Survived"])

In [None]:
# Factor to number: encode each categorical label column as integers so the
# table can be fed to the scikit-learn tree below.
#
# Chained Series.replace(label, int) calls only end up with an integer dtype
# through pandas' silent object -> int downcast, which is deprecated since
# pandas 2.2. The encoding is therefore done with an explicit lookup followed
# by an explicit int64 cast.
label_codes = {
    'Survived': {'No': 0, 'Yes': 1},
    'Age': {'Child': 0, 'Adult': 1},
    'Sex': {'Male': 0, 'Female': 1},
    'Class': {'First': 1, 'Second': 2, 'Third': 3, 'Crew': 4},
}

for column, codes in label_codes.items():
    # Values without an entry (e.g. already-encoded integers when this cell
    # is re-run) are passed through unchanged, so the cell stays idempotent.
    # astype raises if an unexpected, non-numeric label is still present.
    titanic[column] = (
        titanic[column]
        .map(lambda value, codes=codes: codes.get(value, value))
        .astype('int64')
    )

In [None]:
# Data for tree modeling: every column except the target is a predictor.
X = titanic.drop(columns='Survived')
y = titanic['Survived']
# Feature labels for plot_tree. They are materialised as a plain list because
# scikit-learn 1.3.0 rejects a pandas Index for feature_names.
xname = list(X.columns)
# Display labels for the two target codes, in ascending order (0, then 1).
yname = ['Die','Survived']

In [None]:
# Decision Tree: depth capped at 3, and a split is only kept when it lowers
# impurity by at least 0.003.
DT = tree.DecisionTreeClassifier(
    max_depth=3,
    min_impurity_decrease=0.003,
)
DT_fit = DT.fit(X, y)

# Open an 11x9-inch figure; plot_tree then draws on the current axes.
plt.subplots(figsize=(11, 9))
tree.plot_tree(
    DT_fit,
    feature_names=xname,
    class_names=yname,
    filled=True,
)

## 구매 예측 예제

In [None]:
import pandas as pd
import numpy as np
# Read the purchase dataset from the working directory and display it.
direct2 = pd.read_csv(filepath_or_buffer='direct2.csv')
direct2

In [None]:
# Number of missing values in each column.
direct2.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then report the
# remaining (rows, columns).
direct2 = direct2.dropna(axis=0, how='any')
direct2.shape

In [None]:
# Predictors are all columns except 'Buy'; 'Buy' itself is the target.
X = direct2.drop(columns='Buy')
y = direct2.loc[:, 'Buy']

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
# Purchase tree: a node must hold at least 100 samples to be split, and a
# split must lower impurity by at least 0.0005.
# Alternative configuration kept for reference:
#   model_tree = DecisionTreeClassifier(max_depth=2)
model_tree = DecisionTreeClassifier(
    min_samples_split=100,
    min_impurity_decrease=0.0005,
)
fit_tree = model_tree.fit(X, y)

In [None]:
import matplotlib.pyplot as plt

# Wide canvas for drawing the fitted purchase tree.
fig, ax = plt.subplots(figsize=(24, 12))
# Feature labels as a plain list: scikit-learn 1.3.0 rejects a pandas Index
# for feature_names.
xname = list(X.columns)
yname = ['Nonbuyer','Buyer']
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes" state.
plot_tree(fit_tree, feature_names=xname, class_names=yname, filled=True,
          fontsize=10, ax=ax)

In [None]:
# Two example customers to score, one value per predictor column.
smith = [35, 1, 1, 2, 500, 3, 3, 18, 1]
johnson = [36, 0, 1, 1, 550, 2, 1, 15, 1]

# One row per customer, labelled with the predictor names.
people = pd.DataFrame(
    data=[smith, johnson],
    columns=[
        'Age', 'Gender', 'Married', 'Children', 'Income',
        'Ccard', 'Recent', 'Climate', 'Urban',
    ],
)
people

In [None]:
# Predicted probability of the class at index 1 for each example customer.
fit_tree.predict_proba(people)[:, 1]

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement and returns the same kind of display object.
from sklearn.metrics import RocCurveDisplay

# NOTE: the curve is computed on the same X, y the tree was fitted on.
roc_tree = RocCurveDisplay.from_estimator(fit_tree, X, y)
roc_tree.figure_.suptitle("ROC curve")

# 신경망 분석 예제

In [None]:
import pandas as pd
import numpy as np
# Reload the purchase dataset for the neural-network example and display it.
direct2 = pd.read_csv(filepath_or_buffer='direct2.csv')
direct2

In [None]:
# Per-column count of missing values.
direct2.isna().sum()

In [None]:
# Keep only complete rows, then show the resulting (rows, columns).
direct2 = direct2.dropna(axis=0, how='any')
direct2.shape

In [None]:
# Target is the 'Buy' column; the remaining columns are the predictors.
y = direct2.loc[:, 'Buy']
X = direct2.drop(columns='Buy')

In [None]:
# Example customers to score with the network, one value per predictor.
smith = [35, 1, 1, 2, 500, 3, 3, 18, 1]
johnson = [36, 0, 1, 1, 550, 2, 1, 15, 1]

people = pd.DataFrame(
    [smith, johnson],
    columns=[
        'Age', 'Gender', 'Married',
        'Children', 'Income', 'Ccard',
        'Recent', 'Climate', 'Urban',
    ],
)
people

In [None]:
# Stack the two example customers underneath the training predictors so both
# can be pushed through the same scaling step.
temp = pd.concat(objs=[X, people], axis='index')
temp

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the predictors before the neural-network fit.
# The scaler's mean/std are learned from the training rows (X) only, so the
# appended example customers do not influence the statistics; the transform
# is then applied to the combined table so it can still be split by position.
scaler = StandardScaler()
scaler.fit(X)
temp_scaled = scaler.transform(temp)
# Take the labels from temp itself rather than a hard-coded copy of the
# column list. The default RangeIndex is kept on purpose: temp's own index
# is not carried over, and the following cells slice by position.
temp_scaled = pd.DataFrame(temp_scaled, columns=temp.columns)
temp_scaled

In [None]:
# Everything except the last two rows: the scaled training predictors.
X_scaled = temp_scaled.iloc[:-2]
X_scaled

In [None]:
# The last two rows: the scaled example customers.
people = temp_scaled.iloc[-2:]
people

In [None]:
# Multi-layer perceptron with a single hidden layer of 5 units.
# hidden_layer_sizes is written as a real one-element tuple: "(5)" without
# the trailing comma is just the integer 5 in parentheses.
model_nn = MLPClassifier(
    hidden_layer_sizes=(5,),
    random_state=1234,
    max_iter=2000,
)
fit_nn = model_nn.fit(X_scaled, y)

In [None]:
# Predicted probability of the class at index 1 for the scaled customers.
fit_nn.predict_proba(people)[:, 1]

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement and returns the same kind of display object.
from sklearn.metrics import RocCurveDisplay

# NOTE: the curve is computed on the same X_scaled, y the network was fitted on.
roc_nn = RocCurveDisplay.from_estimator(fit_nn, X_scaled, y)
roc_nn.figure_.suptitle("ROC curve")

# SVM 분석 예제

In [None]:
import pandas as pd
import numpy as np
# Reload the purchase dataset for the SVM example and display it.
direct2 = pd.read_csv(filepath_or_buffer='direct2.csv')
direct2

In [None]:
# Remove rows containing any missing value and show the new (rows, columns).
direct2 = direct2.dropna(axis=0, how='any')
direct2.shape

In [None]:
# Separate the 'Buy' target from the predictor columns.
X = direct2.drop(columns='Buy')
y = direct2.loc[:, 'Buy']

In [None]:
# Example customers to score with the SVM, one value per predictor.
smith = [35, 1, 1, 2, 500, 3, 3, 18, 1]
johnson = [36, 0, 1, 1, 550, 2, 1, 15, 1]

# Rows are customers; columns carry the predictor names.
people = pd.DataFrame(
    data=[smith, johnson],
    columns=['Age', 'Gender', 'Married', 'Children', 'Income',
             'Ccard', 'Recent', 'Climate', 'Urban'],
)
people

In [None]:
from sklearn.svm import SVC

# Polynomial-kernel SVM with C=0.1; probability=True is set so that
# predict_proba can be called on the fitted model.
# Alternative configuration kept for reference:
#   model_svm = SVC(kernel='linear', probability=True, random_state=1234)
# NOTE(review): X is passed in unscaled here; that may contribute to the slow
# fit noted below — confirm.
model_svm = SVC(
    kernel='poly',
    C=0.1,
    probability=True,
    random_state=1234,
)
fit_svm = model_svm.fit(X, y)  # very slow

In [None]:
# Predicted probability of the class at index 1 for each example customer.
fit_svm.predict_proba(people)[:, 1]

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# is its replacement and returns the same kind of display object.
from sklearn.metrics import RocCurveDisplay

# NOTE: the curve is computed on the same X, y the SVM was fitted on.
roc_svm = RocCurveDisplay.from_estimator(fit_svm, X, y)
roc_svm.figure_.suptitle("ROC curve")