# 지도학습 - Classification

### 라이브러리 선언 / 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd 

# 모델 라이브러리 선언
from sklearn import svm

## 훈련/테스트 데이터 자동 분리
from sklearn.model_selection import train_test_split

# 모델 정확도 라이브러리 선언
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Read the CSV file into a DataFrame variable
csData = pd.read_csv("../dataset/customer.csv")
# Show the first rows as the cell output
csData.head()

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip


# 1. 데이터불러오기 / 타입통합

In [2]:
# Read the CSV file into a DataFrame variable
# (same path as the load in the first cell, so this replaces csData with a fresh copy)
csData = pd.read_csv("../dataset/customer.csv")
# Show the first rows as the cell output
csData.head()

Unnamed: 0,balance,stock,label
0,30000000,22500000,normal
1,280000000,48000000,diamond
2,300000000,40666666,diamond
3,54000000,28000000,normal
4,768000000,32000000,vip


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the data
csData.describe()

Unnamed: 0,balance,stock
count,20000.0,20000.0
mean,560225500.0,38761650.0
std,197896700.0,15412750.0
min,30000000.0,17500000.0
25%,296000000.0,26666670.0
50%,636000000.0,35500000.0
75%,720000000.0,47333330.0
max,800000000.0,80000000.0


In [4]:
# Check which distinct label values occur (first occurrence of each is kept)
labels = csData["label"].drop_duplicates()
print(labels)

# Add a numeric label-code column so the label can be used in the
# correlation analysis
labelDict = {
    "normal": 0,
    "diamond": 1,
    "vip": 2,
}
csData["labelcode"] = csData["label"].map(labelDict)
csData.head()

0     normal
1    diamond
4        vip
Name: label, dtype: object


Unnamed: 0,balance,stock,label,labelcode
0,30000000,22500000,normal,0
1,280000000,48000000,diamond,1
2,300000000,40666666,diamond,1
3,54000000,28000000,normal,0
4,768000000,32000000,vip,2


In [5]:
# Disabled alternative: scale the "balance" and "stock" columns with
# scikit-learn's MinMaxScaler. Left commented out; the notebook uses the
# hand-written minMaxNorm function instead.
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# features=["balance","stock"]
# scaler.fit(csData.loc[:,features])
# x_train_norm = scaler.transform(csData.loc[:,features])

# 2. 특성선정 / 데이터 분리

In [6]:
def minMaxNorm(indata):
    """Rescale ``indata`` linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        indata: A pandas Series or NumPy array of numbers (any container that
            offers vectorised ``min()`` / ``max()`` and element-wise arithmetic).

    Returns:
        ``(indata - min) / (max - min)`` computed element-wise. When every
        value is identical (the range is 0) the input is returned unchanged,
        because the ratio would be a division by zero.
    """
    # Use the container's own vectorised reductions instead of the Python
    # built-ins max()/min(): they avoid a slow element-by-element loop and,
    # for a pandas Series, skip NaN rather than producing an order-dependent
    # (typically all-NaN) result.
    minValue = indata.min()
    valueRange = indata.max() - minValue

    if valueRange == 0:
        # Constant input: nothing to scale, hand the data back as-is.
        return indata

    # Shift so the minimum sits at 0, then divide by the range.
    return (indata - minValue) / valueRange

# Add min-max-normalised copies of the two numeric feature columns.
csData["balance_norm"] = minMaxNorm(csData.balance)
csData["stock_norm"] = minMaxNorm(csData.stock)

# Correlate the numeric columns only. The text column "label" cannot be
# correlated: older pandas silently dropped it, while pandas >= 2.0 raises
# an error, so select the numeric columns explicitly.
csData.select_dtypes(include="number").corr()

Unnamed: 0,balance,stock,labelcode,balance_norm,stock_norm
balance,1.0,0.565942,0.883144,1.0,0.565942
stock,0.565942,1.0,0.824174,0.565942,1.0
labelcode,0.883144,0.824174,1.0,0.883144,0.824174
balance_norm,1.0,0.565942,0.883144,1.0,0.565942
stock_norm,0.565942,1.0,0.824174,0.565942,1.0


In [7]:
### feature, label 컬럼 설정
featuresCol = ['balance_norm','stock_norm']
labelCol = ['label']

### feature 데이터, label 데이터 분리
featuresData = csData.loc[:,featuresCol]
labelData = csData.loc[:,labelCol]

### train_test_split 함수를 활용해 feature / label 데이터 분리 7:3
trainingData_features, \
testData_features, \
trainingData_label, \
testData_label = \
    train_test_split(featuresData, labelData, test_size = 0.3,
    random_state = 1)

print(trainingData_features.shape)
print(testData_features.shape)
print(trainingData_label.shape)
print(testData_label.shape)

(14000, 2)
(6000, 2)
(14000, 1)
(6000, 1)


# 3. 모델선언 및 학습

In [8]:
# Define the model: a support-vector classifier with a fixed seed
model_method = svm.SVC(random_state=1)

# Train it on the training features and their answer labels
model = model_method.fit(
    X=trainingData_features,
    y=trainingData_label["label"],
)

# 4. 예측

In [9]:
## Predict labels for the test features
predict = model.predict(testData_features)
# Show the predicted label array as the cell output
predict

array(['diamond', 'diamond', 'diamond', ..., 'diamond', 'vip', 'diamond'],
      dtype=object)

# 5. 데이터 정리

In [10]:
### Convert the prediction result into a single-column DataFrame
predictData = pd.DataFrame(predict, columns=["predict"])

### Select the test rows (the answer key) from the full data and renumber
### them from 0 so they line up with the predictions
labelData = csData.loc[testData_label.index, :].reset_index(drop=True)

### Merge the answer key and the predictions side by side.
### NOTE: the misspelled name "finalReuslt" is kept because later cells use it.
finalReuslt = pd.concat([labelData, predictData], axis="columns")

In [11]:
# Show the first rows of the merged answer-key / prediction table
finalReuslt.head()

Unnamed: 0,balance,stock,label,labelcode,balance_norm,stock_norm,predict
0,744000000,38000000,diamond,1,0.927273,0.328,diamond
1,724000000,32000000,diamond,1,0.901299,0.232,diamond
2,704000000,27333333,diamond,1,0.875325,0.157333,diamond
3,240000000,30500000,normal,0,0.272727,0.208,normal
4,258000000,28500000,normal,0,0.296104,0.176,normal


# 6. 결과 검증

In [12]:
# Score the predictions against the true labels
y_true = finalReuslt["label"]
y_pred = finalReuslt["predict"]
ac_score = accuracy_score(y_true, y_pred)
cl_report = classification_report(y_true, y_pred)

## Report the results
print("Accuracy =", ac_score)
print("result =\n", cl_report)

Accuracy = 0.9953333333333333
result =
               precision    recall  f1-score   support

     diamond       1.00      1.00      1.00      3483
      normal       0.99      0.99      0.99      1803
         vip       0.99      0.99      0.99       714

    accuracy                           1.00      6000
   macro avg       1.00      0.99      0.99      6000
weighted avg       1.00      1.00      1.00      6000



### [실습 - 모델변화]
기존 데이터(customer.csv)를 활용하여
SVM 로직 외에 
Decision Tree 알고리즘을 활용하여
구현하세요

### [실습 - 데이터변화]
github (hyokwan) 내 WA_Fn_UseC_-Sales-Win-Loss.csv 파일을 
불러와서 어떤 캠페인에서 loss가 발생할지 또는 win 할지
예측하는 로직을 구현하고
머신러닝_홍길동 이름의 파일로 저장 후
(haiteam@kopo.ac.kr) 로 메일 전송

In [13]:
# joblib is used to save / load the trained model.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone package first and fall back to the old
# location only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [14]:
# Save the trained model to a file in the current directory
joblib.dump(model, "./trainingset.model")

['./trainingset.model']

In [15]:
# Load the model back from the file written by joblib.dump.
# NOTE: joblib files are pickle-based; only load model files you trust.
loaded_model = joblib.load("./trainingset.model")

In [16]:
# Predict with the reloaded model on the held-out test features.
# (The original call passed `feature_test`, a name that is never defined in
# this notebook, and failed with:
#   NameError: name 'feature_test' is not defined)
loaded_model.predict(testData_features)