In [None]:
import pandas as pd
import numpy as np


from io import StringIO

import missingno as msno

import sklearn
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_validate, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, make_scorer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



import matplotlib.pyplot as plt

import seaborn as sns

In [None]:
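### Evaluating classification models
# Accuracy : the share of predictions that match the actual labels
# Problem  : for binary classification it can misrepresent model performance
# Why      : class imbalance in the data
# Remedy   : Precision, Recall, and the F1 score (their harmonic mean)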

In [None]:
### Building blocks for evaluating a classification model
# Confusion-matrix cells. A false positive (predict 1 for an actual 0) is a
# Type I error; a false negative (predict 0 for an actual 1) is a Type II error.
cell_labels = {
    'tp': 'TP(target = 1, predict = 1)',
    'tn': 'TN(target = 0, predict = 0)',
    'fn': 'FN(target = 1, predict = 0) -> type 2 error',
    'fp': 'FP(target = 0, predict = 1) -> type 1 error',
}
for label in cell_labels.values():
    print(label)


target = [1,0,0,1,1,1,0,1,1,1]
prediction = [0,1,1,1,1,0,1,0,1,0]

tp = tn = fn = fp = 0

# Every (actual, predicted) pair belongs to exactly one cell, so the
# branches are mutually exclusive.
for actual, predicted in zip(target, prediction):
    if actual == 1 and predicted == 1:
        tp += 1
    elif actual == 0 and predicted == 0:
        tn += 1
    elif actual == 1 and predicted == 0:
        fn += 1
    elif actual == 0 and predicted == 1:
        fp += 1


print()

print(cell_labels['tp'], tp)
print(cell_labels['tn'], tn)
print(cell_labels['fn'], fn)
print(cell_labels['fp'], fp)


In [None]:
# The same quantities, computed with scikit-learn's metric functions.
for label, metric in (
    ('accuracy - ', accuracy_score),
    ('recall - ', recall_score),
    ('precision - ', precision_score),
    ('f1 - ', f1_score),
):
    print(label, metric(target, prediction))
print()
print('confusion_matrix - \n', confusion_matrix(target, prediction))

In [None]:
# Precision : TP / (TP + FP)
# Example where precision matters relatively more: spam-mail filtering
# Recall : TP / (TP + FN)
# Examples where recall matters relatively more: medicine (cancer diagnosis), finance (fraud detection)

In [None]:
# Load the Titanic training data.
titanic_frm = pd.read_csv('output/titanic_train.csv')

# Class balance of the label. Printed explicitly because a notebook only
# auto-displays the last expression of a cell.
print(titanic_frm['Survived'].value_counts())
titanic_frm

In [None]:
## Separate the label under study (Survived) from the remaining columns
titanic_feature = titanic_frm.drop(columns=['Survived'])
titanic_target = titanic_frm['Survived']

print('target type - ', type(titanic_target))
print('feature type - ', type(titanic_feature))

# titanic_target.values would give the underlying array if needed

In [None]:
# Preprocessing requirement: remove the unneeded features
# PassengerId, Name and Ticket.
titanic_frm_drop = titanic_frm.drop(columns=['PassengerId', 'Name', 'Ticket'])

# Missing values: Age -> column mean, Cabin -> 'N', Embarked -> 'N'.
imputer = SimpleImputer(strategy='mean')
titanic_frm_drop['Age'] = imputer.fit_transform(titanic_frm_drop[['Age']])
# Fill from the frame being built (the original referenced an undefined
# name `titanic`, which raised NameError).
titanic_frm_drop['Cabin'] = titanic_frm_drop['Cabin'].fillna('N')
titanic_frm_drop['Embarked'] = titanic_frm_drop['Embarked'].fillna('N')

# TODO: label-encode Cabin and Embarked.
titanic_frm_drop

In [None]:
def drop_features(frm) :
    """Return a copy of ``frm`` without the PassengerId, Name and Ticket columns.

    The input frame is left untouched, so the calling cell can be re-run
    without a ``KeyError`` for columns an earlier run already removed.

    Parameters
    ----------
    frm : pandas.DataFrame
        Frame containing the three columns to remove.

    Returns
    -------
    pandas.DataFrame
        New frame holding every other column of ``frm``.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``frm``.
    """
    return frm.drop(columns=['PassengerId', 'Name', 'Ticket'])


In [None]:
# Remove the unneeded columns from the feature frame and display the result.
feature_subset = drop_features(titanic_feature)
feature_subset

In [None]:
# Fill missing Age with the column mean and missing Cabin / Embarked with 'N'
def pre_processing(frm) :
    """Return a copy of ``frm`` with the gaps in Age, Cabin and Embarked filled.

    Each column is reassigned instead of calling ``fillna(inplace=True)`` on a
    column selection: that chained form is deprecated in recent pandas and has
    no effect on the frame when Copy-on-Write is enabled.

    Parameters
    ----------
    frm : pandas.DataFrame
        Frame with ``Age``, ``Cabin`` and ``Embarked`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frm`` where missing ``Age`` values hold the mean of the
        observed ages and missing ``Cabin`` / ``Embarked`` values hold ``'N'``.
    """
    filled = frm.copy()
    filled['Age'] = filled['Age'].fillna(filled['Age'].mean())
    filled['Cabin'] = filled['Cabin'].fillna('N')
    filled['Embarked'] = filled['Embarked'].fillna('N')
    return filled

In [None]:
# Fill the missing values, then inspect the Cabin column.
feature_subset = pre_processing(feature_subset)
# Not the last expression of the cell, so this value is not displayed.
feature_subset['Cabin'].values
# First character of every Cabin value; this is what the cell displays.
feature_subset['Cabin'].str[:1]

In [None]:
def label_encoder(frm):
    """Return a copy of ``frm`` with Sex, Cabin and Embarked label-encoded.

    ``Cabin`` is first reduced to its leading character, then each of the
    three columns is replaced by the integer codes of a fresh
    ``LabelEncoder``.

    The ``return`` now sits after the loop; previously it was inside the loop
    body, so the function returned after encoding only ``Sex``.

    Parameters
    ----------
    frm : pandas.DataFrame
        Frame with ``Sex``, ``Cabin`` and ``Embarked`` columns. Assumes the
        columns hold strings with no missing values (e.g. after
        ``pre_processing``) -- TODO confirm for other callers.

    Returns
    -------
    pandas.DataFrame
        Copy of ``frm`` with the three columns replaced by integer codes.
    """
    encoded = frm.copy()
    encoded['Cabin'] = encoded['Cabin'].str[:1]

    for feature in ['Sex', 'Cabin', 'Embarked']:
        # One encoder per column so each column gets its own code mapping.
        encoded[feature] = LabelEncoder().fit_transform(encoded[feature])

    return encoded


In [None]:
# Label-encode the categorical columns.
feature_subset = label_encoder(feature_subset)
feature_subset
# missingno matrix plot of the encoded frame -- presumably to confirm that no
# missing values remain.
msno.matrix(feature_subset)

In [None]:
# Split the data into training and test sets (20% held out for testing);
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(feature_subset,
                                                    titanic_target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=200)

In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Build and fit the logistic-regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Predict labels for the test set
lr_y_pred = lr_model.predict(X_test)

In [None]:
# Build and fit the decision-tree model.
# A fixed random_state makes re-running the cell grow the same tree, so the
# reported metrics are reproducible.
dt_model = DecisionTreeClassifier(random_state=200)
dt_model.fit(X_train, y_train)

# Predict labels for the test set
dt_y_pred = dt_model.predict(X_test)

In [None]:
def metrics_eval(target, prediction):
    """Print accuracy, recall, precision, F1 and the confusion matrix.

    Parameters
    ----------
    target : array-like
        Actual labels.
    prediction : array-like
        Predicted labels, compared against ``target``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    scorers = (
        ('accuracy - ', accuracy_score),
        ('recall - ', recall_score),
        ('precision - ', precision_score),
        ('f1 - ', f1_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(target, prediction))
    print()
    print('confusion_matrix - \n', confusion_matrix(target, prediction))

In [None]:
print('LogisticRegression -' )
print()
# The helper is named metrics_eval; the misspelled call raised NameError.
metrics_eval(y_test, lr_y_pred)
print()
# Accuracy by hand: correct predictions (the confusion-matrix diagonal) over
# all predictions. Derived from the actual matrix instead of hard-coded
# counts, so it stays right when the data or the split changes.
lr_confusion = confusion_matrix(y_test, lr_y_pred)
print('acc - ', np.trace(lr_confusion) / lr_confusion.sum())

In [None]:
print('DecisionTreeClassifier -' )
print()
# The helper is named metrics_eval; the misspelled call raised NameError.
metrics_eval(y_test, dt_y_pred)

In [None]:
## Cross-validation
fold = KFold(n_splits=20)

# One scorer per metric; each key shows up as 'test_<key>' in the result.
scoring = {
    name: make_scorer(metric)
    for name, metric in [
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ]
}

result = cross_validate(lr_model, X_train, y_train, scoring=scoring, cv=fold)
result.keys()

In [None]:
# Mean accuracy over the cross-validation folds.
result['test_accuracy'].mean()

In [None]:
# Load scikit-learn's breast-cancer dataset and list the fields it provides.
cancer = load_breast_cancer()
cancer.keys()

In [None]:
# Checklist of the steps carried out in the following cells.
for step in (
    "1. 프레임 만들기(feature, target) 포함",
    "2. target에 대한 균형 여부 확인",
    "3. 데이터 세트 분리",
    "4. RandomForestClassifier",
    "5. 평가지표 확인",
):
    print(step)

In [None]:
# Feature matrix as a DataFrame, with the label appended as a 'target' column.
cancer_frm = pd.DataFrame(
    cancer['data'],
    columns=cancer['feature_names'],
).assign(target=cancer['target'])
cancer_frm

In [None]:
print('데이터의 균형 여부 확인 - ')
cancer_target = cancer_frm['target']
cancer_features =  cancer_frm.drop(['target'], axis = 1)

# The cell announces a class-balance check, so actually show the number of
# samples per class.
cancer_target.value_counts()
    


In [None]:
# Hold out 30% of the breast-cancer samples for testing.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the earlier
# Titanic cells also use.
X_train, X_test, y_train, y_test = train_test_split(cancer.data,
                                                    cancer.target,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=200)


In [None]:
# A fixed random_state makes the forest, and every clone of it used by the
# cross-validation and grid-search cells, reproducible across re-runs.
rf_model = RandomForestClassifier(random_state=200)
rf_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test split.
rf_y_pred = rf_model.predict(X_test)

In [None]:
# Print accuracy, recall, precision, F1 and the confusion matrix for the forest.
metrics_eval(y_test , rf_y_pred)

In [None]:
print('교차검증 - ')
fold = KFold(n_splits=20)

# Each key becomes a 'test_<key>' entry in the cross_validate result, so the
# spelling matters: 'preecision' produced 'test_preecision'.
scoring = {
    'accuracy' : make_scorer(accuracy_score),
    'precision' :  make_scorer(precision_score),
    'recall' : make_scorer(recall_score),
    'f1' : make_scorer(f1_score)
}

result = cross_validate(rf_model, X_train, y_train,
                        cv = fold, 
                        scoring = scoring
                       )
result.keys()

In [None]:
# Mean recall over the cross-validation folds.
result['test_recall'].mean()

In [None]:
# Tune hyperparameters with GridSearchCV as a way to raise recall
# n_estimators - number of trees
# max_features - maximum number of features to select
# max_depth    - maximum depth of each tree
    

In [None]:
# Hyperparameter grid: 4 x 4 x 3 = 48 candidate combinations.
param = {
    'n_estimators' : [50, 100, 150, 200] , 
    'max_features' : [6, 8, 15, 20] , 
    'max_depth'    : [4, 6, 8] 
}

# Search the grid with cv = 20 and rank the candidates by recall.
# refit = True asks GridSearchCV to retrain the best candidate after the search.
grid_search_model = GridSearchCV(rf_model,
                                 param_grid = param,
                                 cv = 20,
                                 refit = True,
                                 scoring = 'recall')
grid_search_model.fit(X_train, y_train)

In [None]:
# Per-candidate results recorded by the search.
grid_search_model.cv_results_

In [None]:
# Parameter combination selected by the search.
grid_search_model.best_params_

In [None]:
# Score of the selected combination (the search was scored on recall).
grid_search_model.best_score_

In [None]:
# Small 3 x 3 sample matrix, fed to Binarizer in the next cell.
matrix = [
    [-1,-1,2],
    [2,0,0],
    [0,1.1,1.2]
]

In [None]:
from sklearn.preprocessing import Binarizer

# Binarize the sample matrix with a threshold of 1.1 and print the result
# (Binarizer maps values above the threshold to 1 and the rest to 0).
holder = Binarizer(threshold=1.1)
print(holder.fit_transform(matrix))

In [None]:
# Step 1: reload the Titanic training data from CSV and preview the first rows.
print("1. 데이터 로드")
print()
titanic_frm = pd.read_csv("output//titanic_train.csv")
titanic_frm.head()

In [None]:
# Step 2: separate the Survived label from the remaining columns.
print('2. target , feature로 데이터 분리')

titanic_feature = titanic_frm.drop(columns=['Survived'])
titanic_target = titanic_frm['Survived']

print('target  type - ' , type(titanic_target))
print('feature type - ' , type(titanic_feature))

In [None]:
# Run the three preprocessing helpers in order: drop the unneeded columns,
# fill the missing values, label-encode the categorical columns.
feature_subset = drop_features(titanic_feature)
feature_subset = pre_processing(feature_subset)
feature_subset = label_encoder(feature_subset)
feature_subset

In [None]:
# Split into training and test sets (20% held out), with a fixed seed.
print('학습과 테스트 분리 - ')
print()
X_train , X_test , y_train , y_test = train_test_split(feature_subset , 
                                                       titanic_target , 
                                                       test_size = 0.2 , 
                                                       random_state = 100)

In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape , X_test.shape , y_train.shape , y_test.shape

In [None]:
# Fit a logistic-regression model and predict labels for the test set.
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

In [None]:
# Get the predicted probabilities for the test set and compare their type and
# shape with those of the predicted labels.
print('확률예측 값 - predict_proba()')
predict_proba_result = logistic_model.predict_proba(X_test)
print('type  - ' , type(predict_proba_result))
print('shape - ' , predict_proba_result.shape)
print('y_pred type  - ' , type(y_pred))
print('y_pred shape - ' , y_pred.shape)

In [None]:
# First three rows of the predicted probabilities.
predict_proba_result[0:3]

In [None]:
# First three predicted labels.
y_pred[:3]

In [None]:
# Append the predicted label as an extra column next to each probability row,
# then show the first ten rows.
pred_prob_concat = np.concatenate([predict_proba_result , y_pred.reshape(-1,1)] , axis=1)
pred_prob_concat[:10]

In [None]:
# Custom decision threshold to apply to the predicted probabilities.
user_th = 0.3
# Column 1 of predict_proba -- presumably the positive-class probability --
# reshaped into a single column before being passed to Binarizer.
predict_proba_positive = predict_proba_result[ : , 1].reshape(-1, 1)
# Binarize the probabilities at user_th to obtain the custom predictions.
user_pred = Binarizer(threshold = user_th) \
                .fit_transform(predict_proba_positive)

In [None]:
# Compare the metrics of predict()'s labels with those obtained from the
# custom 0.3 threshold.
print('default - ')
print()
metrics_eval(y_test , y_pred)
print()
print('user th - 0.3 ')
print()
metrics_eval(y_test , user_pred)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# precision_recall_curve is not covered by the notebook's top-level
# sklearn.metrics import, so bring it into scope here (it raised NameError).
from sklearn.metrics import precision_recall_curve

# Column 1 of predict_proba, kept 1-D this time, is the score passed to
# precision_recall_curve.
predict_proba_positive = predict_proba_result[ : , 1]
precision, recall, th  = precision_recall_curve(y_test , predict_proba_positive)
print('precision type - ' , type(precision))
print('recall    type - ' , type(recall))
print('th        type - ' , type(th))
print('precision shape - ' , precision.shape)
print('recall    shape - ' , recall.shape)
print('th        shape - ' , th.shape)

In [None]:
# Plot precision and recall against the decision threshold.
plt.figure(figsize=(15,5))


# precision and recall are sliced to the length of th so that the x and y
# arrays have matching sizes.
plt.plot(th, precision[ 0 : th.shape[0]] , linestyle="--", label="precision")
plt.plot(th, recall[ 0 : th.shape[0]]    , linestyle="-" , label="recall")

plt.xlabel('threshold ratio')
plt.ylabel('precision and recall value')

plt.grid()
plt.legend()
plt.show()
plt.close()