# Simple Classifier 작성

In [1]:
import numpy as np
# 상속받아 단순한 Classifier 생성하는 패키지
from sklearn.base import BaseEstimator

# 단순 분류 모델 작성
class SimpleClassifier(BaseEstimator) :
    """Rule-based baseline classifier that looks only at the 'Sex' column.

    Nothing is learned from the training data: ``predict`` returns 0 for
    every row whose 'Sex' value equals 1 and 1 for every other row.
    """

    def fit(self, x, y = None) :
        """Do nothing and return the estimator itself.

        Parameters
        ----------
        x : ignored
        y : ignored, optional

        Returns
        -------
        SimpleClassifier
            ``self``, following the scikit-learn convention so that
            ``fit`` can be chained and used by meta-estimators.
        """
        return self

    def predict(self, x) :
        """Predict 0 where 'Sex' equals 1, and 1 everywhere else.

        Parameters
        ----------
        x : pandas.DataFrame
            Must contain a 'Sex' column.

        Returns
        -------
        numpy.ndarray
            Float array of shape (number of rows, 1) holding 0.0 or 1.0.
        """
        # Compare the whole column at once instead of looping row by row.
        sex_is_one = x['Sex'].to_numpy() == 1
        # Keep the (n, 1) column shape that existing callers receive.
        return np.where(sex_is_one, 0.0, 1.0).reshape(-1, 1)

# 패키지

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 데이터

In [3]:
# The copy module stays imported, as in the original cell
import copy

# Read the Titanic CSV from the current working directory
df_raw = pd.read_csv('./titanic.csv')
# Work on an independent deep copy so that df_raw keeps the data as loaded
df = df_raw.copy(deep = True)
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# 데이터 전처리

### 결측치 처리

In [4]:
# Replace missing Age values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Age'] acts on an intermediate Series, which is deprecated in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Mark missing Cabin values with the placeholder 'N'
df['Cabin'] = df['Cabin'].fillna('N')
# Keep only the first character of each Cabin value
df['Cabin'] = df['Cabin'].str[:1]
# Mark missing Embarked values with the placeholder 'N'
df['Embarked'] = df['Embarked'].fillna('N')
# Total count of missing values remaining in the whole frame
df.isna().sum().sum()

0

### Encoding

In [5]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
# Encoding
def encode(data) :
    """Label-encode every object-dtype column of ``data``.

    The columns are overwritten on the frame that was passed in, and that
    same frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns are replaced by integer codes.

    Returns
    -------
    pandas.DataFrame
        The input frame, modified in place.
    """
    # Column labels are collected up front, before any column is rewritten
    object_columns = data.select_dtypes(include = 'object').columns
    # One encoder instance is reused; fit_transform refits it for each column
    label_encoder = LabelEncoder()
    for column in object_columns :
        data[column] = label_encoder.fit_transform(data[column])

    return data

# Run encode on df, rebind the returned frame and preview the first rows
df = df.pipe(encode)
df.head(n = 5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,7,3
1,2,1,1,190,0,38.0,1,0,596,71.2833,2,0
2,3,1,3,353,0,26.0,0,0,669,7.925,7,3
3,4,1,1,272,0,35.0,1,0,49,53.1,2,3
4,5,0,3,15,1,35.0,0,0,472,8.05,7,3


### 불필요한 변수 제거

In [6]:
# Remove the columns that are not kept for the later steps.
# The result is assigned back instead of dropping in place, and
# errors='ignore' skips labels that are already gone, so re-running this
# cell does not raise KeyError.
df = df.drop(columns = ['PassengerId', 'Name', 'Ticket'], errors = 'ignore')
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

# 데이터 분할

In [7]:
# train_test_split
from sklearn.model_selection import train_test_split
# Target column, and the feature frame made of every other column
df_y = df['Survived']
df_x = df.drop(columns = 'Survived')
# Split with test_size = 0.2 and a fixed random_state
X_train, X_test, Y_train, Y_test = train_test_split(
    df_x,
    df_y,
    test_size = 0.2,
    random_state = 0,
)
# Print the shape of each of the four pieces
for split_label, split_data in (
    ('train data X size :', X_train),
    ('train data Y size :', Y_train),
    ('test data X size :', X_test),
    ('test data Y size :', Y_test),
) :
    print(split_label, split_data.shape)

train data X size : (712, 8)
train data Y size : (712,)
test data X size : (179, 8)
test data Y size : (179,)


# Simple Classifier

In [8]:
from sklearn.metrics import accuracy_score

# Build the baseline model, run its fit step, then predict on the test features
classifier = SimpleClassifier()
classifier.fit(X_train, Y_train)
y_pred = classifier.predict(X_test)
# Accuracy of the predictions against the test labels, shown with four decimals
simple_accuracy = accuracy_score(Y_test, y_pred)
print('Simple Classifier 정확도 : {:.4f}'.format(simple_accuracy))

Simple Classifier 정확도 : 0.7877


* 정확도는 불균형한 레이블 데이터 세트에서 성능 수치로 사용되면 안 된다.

# Confusion Matrix

In [9]:
# Confusion matrix of the test labels against the baseline predictions
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true = Y_test, y_pred = y_pred)

array([[92, 18],
       [20, 49]], dtype=int64)