# 평가 함수 작성

In [1]:
def clf_eval(Y_test, y_pred) :
    """Print the confusion matrix, accuracy, precision and recall of a prediction.

    Args:
        Y_test: True labels.
        y_pred: Predicted labels; passed together with ``Y_test`` to each
            scikit-learn metric function.

    Returns:
        None. The results are written to standard output.
    """
    # Imported inside the function so it works regardless of cell execution
    # order: the notebook-level import of these names only happens in a later
    # cell, and calling clf_eval before that cell would raise NameError.
    from sklearn.metrics import (
        accuracy_score,
        confusion_matrix,
        precision_score,
        recall_score,
    )

    cm = confusion_matrix(Y_test, y_pred)
    accuracy = accuracy_score(Y_test, y_pred)
    precision = precision_score(Y_test, y_pred)
    recall = recall_score(Y_test, y_pred)
    # Print the results (labels: confusion matrix / accuracy / precision / recall)
    print('오차행렬\n', cm)
    print('\n정확도 : {:.4f}\n정밀도 : {:.4f}\n재현율 : {:.4f}'.format(accuracy, precision, recall))

# 패키지

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 데이터

In [3]:
# Load the raw data from titanic.csv (path is relative to the working directory)
df_raw = pd.read_csv('./titanic.csv')
# Preview the first three rows
df_raw.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
# copy
import copy
# Work on an independent copy so df_raw keeps the data as loaded.
# DataFrame.copy() is deep by default, so no generic copy.deepcopy is needed.
df = df_raw.copy()
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


# 데이터 전처리

In [5]:
# Data overview: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Summary statistics; include='all' covers the non-numeric columns as well
df.describe(include = 'all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### 결측치 처리

In [7]:
# Count the missing values in each column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

##### Age 평균 대체

In [8]:
# Replace missing ages with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place call triggers a
# warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
# Confirm that no missing ages remain
df['Age'].isna().sum()

0

##### Cabin 'N' 값 대체

In [9]:
# Replace missing cabins with the placeholder 'N'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Cabin']: that chained in-place call triggers
# a warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Cabin'] = df['Cabin'].fillna('N')
df['Cabin'].value_counts()

N              687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: Cabin, Length: 148, dtype: int64

##### Cabin 첫 글자 대체

In [10]:
# Reduce every Cabin value to its first character
df['Cabin'] = df['Cabin'].str.slice(stop = 1)
# Frequency of each remaining one-character value
df['Cabin'].value_counts()

N    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64

##### Embarked 'N' 값 대체

In [11]:
# Replace missing embarkation values with the placeholder 'N'.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Embarked']: that chained in-place call
# triggers a warning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna('N')
df['Embarked'].value_counts()

S    644
C    168
Q     77
N      2
Name: Embarked, dtype: int64

##### 결과 확인

In [12]:
# Total number of missing values across all columns
df.isna().sum().sum()

0

### Data Encoding

In [17]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
# Encoding
def encode(data) :
    """Label-encode every object-dtype column of ``data``.

    Each object-dtype column is overwritten with the codes returned by
    ``LabelEncoder.fit_transform``. The frame passed in is modified in place
    and that same object is returned.
    """
    label_encoder = LabelEncoder()
    object_columns = data.select_dtypes(include = 'object').columns
    for column in object_columns :
        # fit_transform refits the encoder, so one instance serves every column
        data[column] = label_encoder.fit_transform(data[column])

    return data

# Label-encode the object-dtype columns of df, then preview the first rows
df = encode(df)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,7,3
1,2,1,1,190,0,38.0,1,0,596,71.2833,2,0
2,3,1,3,353,0,26.0,0,0,669,7.925,7,3
3,4,1,1,272,0,35.0,1,0,49,53.1,2,3
4,5,0,3,15,1,35.0,0,0,472,8.05,7,3


### 불필요한 변수 제거

In [18]:
# Remove the PassengerId, Name and Ticket columns from df in place
df.drop(columns = ['PassengerId', 'Name', 'Ticket'], inplace = True)
# Show the columns that remain
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

# 모델링

### 데이터 분할

In [19]:
# train_test_split
from sklearn.model_selection import train_test_split
# Separate the target variable (Survived) from the explanatory variables
df_y = df['Survived']
df_x = df.drop(columns = 'Survived')
# Split the data: 20% goes to the test set, random_state fixes the shuffle
X_train, X_test, Y_train, Y_test = train_test_split(
    df_x,
    df_y,
    test_size = 0.2,
    random_state = 11,
)
# Report the shape of each part
print('train data X size :', X_train.shape)
print('train data Y size :', Y_train.shape)
print('test data X size :', X_test.shape)
print('test data Y size :', Y_test.shape)

train data X size : (712, 8)
train data Y size : (712,)
test data X size : (179, 8)
test data Y size : (179,)


### Modeling

In [21]:
# 성능 평가
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Logistic Regression
from sklearn.linear_model import LogisticRegression as LR
# Build the liblinear-solver model and fit it on the training split
# (fit returns the estimator itself, so the calls can be chained)
lr = LR(solver = 'liblinear').fit(X_train, Y_train)
# Predict the test split and print the evaluation results
y_pred = lr.predict(X_test)
clf_eval(Y_test, y_pred)

오차행렬
 [[108  10]
 [ 14  47]]

정확도 : 0.8659
정밀도 : 0.8246
재현율 : 0.7705
