# Decision Tree Practice

## Import packages

In [1]:
import pandas as pd
import numpy as np
import random
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.externals.six import StringIO   

## Data Generation
Create data for 1000 basketball teams.<br>
Each team has 5 attributes (which determine whether the team makes the playoffs):<br>
- 例行賽勝率(win_rate)
- 例行賽平均得分(win_pts)
- 例行賽平均失分(lose_pts)
- 例行賽平均洋將得分(foreign_pts)
- 中華隊球員數(CT_player)
<br>
<br>
使用以上五個attributes，預測該支球隊能不能進入季後賽。

## Data Rules
<img src="data/my_plot.jpg" width="700">

In [2]:
def enter_playoff(data):
    """Label one team as playoff-bound (1) or not (0) using the hand-written rules.

    Args:
        data: Indexable sequence read positionally as
            [win_rate, win_pts, lose_pts, foreign_pts, CT_player],
            which is the order in which create_data builds its row.

    Returns:
        int: 1 if the team makes the playoffs under the rules, otherwise 0.
    """
    if data[0] > 0.6:
        if data[1] > 85:
            # High win rate and high scoring: a low points-allowed figure OR
            # enough foreign-player points is sufficient.
            qualifies = data[2] < 80 or data[3] > 35
        else:
            # High win rate but lower scoring: both conditions are required.
            qualifies = data[2] < 80 and data[3] > 40
    else:
        # Lower win rate: all three remaining conditions must hold.
        qualifies = data[2] < 83 and data[3] > 30 and data[4] > 3
    return 1 if qualifies else 0

def create_data(writer):
    """Generate one random team, label it, and write it out as a single row.

    Args:
        writer: Object exposing ``writerow`` (a ``csv.writer`` in this notebook).

    The written row is
    [win_rate, win_pts, lose_pts, foreign_pts, CT_player, label],
    where label is the value enter_playoff returns for the first five entries.
    """
    # The list literal is evaluated left to right, so the random draws happen
    # in the same order as the columns: four uniforms, then one randint.
    features = [
        round(random.uniform(0.3, 0.7), 2),   # win_rate
        round(random.uniform(80, 90), 2),     # win_pts
        round(random.uniform(82, 90), 2),     # lose_pts
        round(random.uniform(25, 45), 2),     # foreign_pts
        random.randint(0, 5),                 # CT_player (already an int)
    ]
    writer.writerow(features + [enter_playoff(features)])

# Seed the generator so that Restart & Run All regenerates the same data set.
random.seed(42)

# newline='' is what the csv module asks for on files handed to csv.writer
# (without it, extra blank rows can appear on platforms that translate '\n').
# The encoding is explicit because the header is non-ASCII and the file is
# read back with pandas.read_csv, which decodes as UTF-8 by default.
with open('data/team.csv', 'w', newline='', encoding='utf-8') as csvfile:
    filewriter = csv.writer(csvfile, delimiter=',')
    # Header columns, in order: win rate, points scored, points allowed,
    # foreign-player points, national-team players, made playoffs (label).
    filewriter.writerow(['勝率', '得分', '失分', '洋將得分', '中華隊球員', '進季後賽'])
    # 1000 teams in total; a later cell uses the first 700 rows for training.
    for i in range(1000):
        create_data(filewriter)

In [3]:
# Load the generated teams back into a DataFrame and preview the first ten rows.
data = pd.read_csv(filepath_or_buffer='data/team.csv')
data.head(n=10)

Unnamed: 0,勝率,得分,失分,洋將得分,中華隊球員,進季後賽
0,0.32,87.41,86.13,36.85,1,0
1,0.37,82.35,86.12,33.67,2,0
2,0.56,87.0,86.32,30.86,5,0
3,0.48,89.87,85.03,32.47,4,0
4,0.42,84.61,82.92,42.48,4,1
5,0.3,80.13,83.13,41.79,2,0
6,0.51,87.7,89.86,36.49,5,0
7,0.69,84.15,87.22,39.17,5,0
8,0.67,89.17,82.42,35.16,5,1
9,0.44,84.22,87.36,35.45,0,0


In [4]:
# Separate the label column ('進季後賽', made playoffs) from the features.
answer = data['進季後賽'].values
# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop(label, 1)`) is rejected by pandas 2.x.
data = data.drop(columns='進季後賽')

In [5]:
# Positional hold-out split: the first 700 rows train the model and the
# remaining rows are kept for testing. Features and labels are cut at the
# same position so they stay aligned.
train_x, test_x = data.iloc[:700], data.iloc[700:]
train_y, test_y = answer[:700], answer[700:]

In [6]:
# Fit a depth-limited tree. random_state pins the choice between equally good
# candidate splits so the fitted tree is the same on every run.
dtree = DecisionTreeClassifier(max_depth=4, random_state=42)
dtree.fit(train_x, train_y)

# With out_file=None, export_graphviz returns the DOT source as a string,
# so no intermediate StringIO buffer is needed.
dot_data = export_graphviz(dtree,
                           out_file=None,
                           filled=True,
                           feature_names=list(data),
                           class_names=['Not Enter', 'Enter'],
                           special_characters=True)

# Render the DOT source with pydotplus and save it as a PDF.
# NOTE(review): the Result section embeds data/tree.png, which this cell does
# not produce — confirm how that image is generated.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("data/tree.pdf")

True

## Result

在 tree 的上半部份跟原本自己畫的很像，下半部的比較順序則有一些差異，不過抓到的臨界點（小於多少或大於多少）跟原本定的差不多。

<img src="data/tree.png" width="700">

由於 training 和 testing 資料是根據相同的規則產生的，且彼此間沒有矛盾<br>
因此在 testing data 的預測上相當準確。

In [7]:
# Predict playoff labels for the held-out rows with the fitted tree.
y_predict = dtree.predict(X=test_x)

In [8]:
from sklearn.metrics import accuracy_score

# Fraction of held-out teams whose predicted label equals the rule-generated label.
accuracy_score(y_true=test_y, y_pred=y_predict)

1.0