## 데이터 전체 프로세스 (실습) - 타이타닉 데이터

I. 라이브러리 선언 및 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [3]:
train_df = pd.read_csv('train.csv', encoding = 'utf-8')
test_df = pd.read_csv('test.csv', encoding = 'utf-8')

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
train_df.describe(include = 'all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


II. 기초가공

 - SibSp 와 Parch 를 더해서 Family를 만든다 
 - 1인 가구도 있으니 Family값에는 1을 더한다

In [19]:
# 기초가공 : Family 변수 생성

train_df2 = train_df.copy() 

def get_family(df) :
    df['Family'] = df['SibSp'] + df['Parch'] + 1
    return df

get_family(train_df2).head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1


III. 이상치 제거

 - Fare 항목에 500이 넘는 값은 제외한다
 - train_df2에 조건을 걸어 500이하의 값만 남긴다
 - 제거 후 describe와 histplot 확인

In [None]:
# 수치형 데이터 분포 확인

sns.pairplot(train_df2[['Age', 'Fare', 'Family']])

In [31]:
# 이상치 정리 (Fare가 500보다 작은 값만 남기기)

train_df2 = train_df2[train_df2['Fare'] < 512]
train_df2[['Fare']].describe()

Unnamed: 0,Fare
count,888.0
mean,30.582164
std,41.176366
min,0.0
25%,7.8958
50%,14.4542
75%,30.77185
max,263.0


IV. 결측치 제거

 - Age에 없는 값들은 평균으로 넣어준다 (mean으로 평균값 확인) / 수치형 데이터
 - Embarked의 경우 S 항목이 가장 많았기 때문에 S로 대체 할 예정 / 범주형 데이터

In [37]:
# 수치형 데이터

def get_non_missing(df) :
    Age_mean = train_df2['Age'].mean()
    df['Age'] = df['Age'].fillna(Age_mean)
    df['Emabarked'] = df['Embarked'].fillna('S')

get_non_missing(train_df2)

train_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 888 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  888 non-null    int64  
 1   Survived     888 non-null    int64  
 2   Pclass       888 non-null    int64  
 3   Name         888 non-null    object 
 4   Sex          888 non-null    object 
 5   Age          888 non-null    float64
 6   SibSp        888 non-null    int64  
 7   Parch        888 non-null    int64  
 8   Ticket       888 non-null    object 
 9   Fare         888 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     886 non-null    object 
 12  Family       888 non-null    int64  
 13  Emabarked    888 non-null    object 
dtypes: float64(2), int64(6), object(6)
memory usage: 104.1+ KB


V. 전처리

 - 범주형 데이터 (Encoding)
   - 레이블 인코딩 Pclass, Sex
   - 원핫 인코딩 Embarked

- 수치형 데이터 (Scaling)
    - Age, Ticket, Family(Sibsp + Parch)
    - MinMaxScaling, StandardScaling

In [54]:
# 수치형 데이터 Scaling 

def get_numeric_sc(df) :
    
    # sd_sc : Fare, mm_sc : Age, Family
    
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    
    sd_sc = StandardScaler()
    mm_sc = MinMaxScaler()
    
    sd_sc.fit(df[['Fare']])
    df['Fare_sc'] = sd_sc.transform(df[['Fare']])
    
    mm_sc.fit(df[['Age', 'Family']])
    df[['Age_sc', 'Family_sc']] = mm_sc.transform(df[['Age', 'Family']])

In [42]:
# 범주형 데이터 Encoding
#   - 레이블 인코딩 Pclass, Sex
#   - 원핫 인코딩 Embarked

In [58]:
def get_category(df) :
    
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    
    le = LabelEncoder()
    le2 = LabelEncoder()
    
    le.fit(df[['Pclass']])
    df['Pclass_le'] = le.transform(df[['Pclass']])
    
    le2.fit(df[['Sex']])
    df['Sex_le'] = le2.transform(df[['Sex']])       
    
    
    
    oe = OneHotEncoder()
    
    df = df.reset_index()

    
    oe.fit(df[['Embarked']])
    embarked_csr = oe.transform(df[['Embarked']])
    
    # OneHotEncoder 적용 후 CSR값 때문에 데이터프레임으로 만들어주는 작업 필요
    embarked_csr_df = pd.DataFrame(embarked_csr.toarray(), columns = oe.get_feature_names_out())
    
    # 기존 데이터프레임과 신생 데이터프레임 concat
    df = pd.concat([df, embarked_csr_df], axis = 1)
    
    
    
    return df
    
    
train_df2 = get_category(train_df2)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [59]:
train_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         888 non-null    int64  
 1   PassengerId   888 non-null    int64  
 2   Survived      888 non-null    int64  
 3   Pclass        888 non-null    int64  
 4   Name          888 non-null    object 
 5   Sex           888 non-null    object 
 6   Age           888 non-null    float64
 7   SibSp         888 non-null    int64  
 8   Parch         888 non-null    int64  
 9   Ticket        888 non-null    object 
 10  Fare          888 non-null    float64
 11  Cabin         202 non-null    object 
 12  Embarked      886 non-null    object 
 13  Family        888 non-null    int64  
 14  Emabarked     888 non-null    object 
 15  Fare_sc       888 non-null    float64
 16  Age_sc        888 non-null    float64
 17  Family_sc     888 non-null    float64
 18  Pclass_le     888 non-null    

VI. 예측모델 생성

In [62]:
def get_model(df) :
    
    from sklearn.linear_model import LogisticRegression
    
    model_lor = LogisticRegression()
    
    X = df[['Age_sc', 'Fare_sc', 'Family_sc', 'Pclass_le', 'Sex_le', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = df[['Survived']]
    
    model_lor.fit(X, y)
    
    return model_lor
    

In [64]:
# 함수를 활용한 예측모델 생성

model_output = get_model(train_df2)
model_output

  y = column_or_1d(y, warn=True)


In [71]:
# 예측모델 만들기
X = train_df2[['Age_sc', 'Fare_sc', 'Family_sc', 'Pclass_le', 'Sex_le', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
y_true = train_df2[['Survived']]
y_pred = model_output.predict(X)

In [72]:
# 예측모델 평가하기 

from sklearn.metrics import accuracy_score, f1_score

def get_metrics(true, pred) :
    print('정확도', accuracy_score(true, pred))
    print('f1스코어', f1_score(true, pred))

get_metrics(y_true, y_pred)

정확도 0.8018018018018018
f1스코어 0.7300613496932515
