In [96]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [97]:
# 데이터 읽기
titanic_data = pd.read_csv("data-files/titanic-train.csv", encoding="utf-8")
original_titanic_data = titanic_data.copy()

In [98]:
# 요약 정보 보기
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [99]:
titanic_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [100]:
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [101]:
# 컬럼(속성)별로 결측치 (missing value) 확인
titanic_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [102]:
print( titanic_data.shape[0] ) # 행(관측치)의 개수
titanic_data.isnull().sum() / titanic_data.shape[0]

891


PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [103]:
# 결측치 처리 1 : 열삭제
titanic_data.drop("Cabin", axis=1, inplace=True)


In [104]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB


In [105]:
# 결측치 처리 2 : 행삭제
boolean_indexer = ~titanic_data["Embarked"].isnull()
titanic_data = titanic_data[boolean_indexer]

In [106]:
titanic_data.shape

(889, 11)

In [107]:
# meanv = titanic_data["Age"].mean()
# medianv = titanic_data["Age"].median()
# maxv = titanic_data["Age"].max()
# minv = titanic_data["Age"].min()
# meanv, medianv, maxv, minv

subv = titanic_data["Age"].agg([np.mean, np.median, np.max, np.min])
subv

mean      29.642093
median    28.000000
amax      80.000000
amin       0.420000
Name: Age, dtype: float64

In [108]:
# 결측치 처리 3 : 대체값 사용
subv = titanic_data["Age"].agg([np.mean, np.median])

titanic_data["Age"].fillna(subv['mean'], inplace=True)

In [109]:
titanic_data['Age'].isna().sum()

0

In [110]:
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [111]:
# index 초기화
# titanic_data.reset_index() # 원래 인덱스 데이터를 일반 컬럼으로 이동
titanic_data.reset_index(drop=True, inplace=True) # drop=True : 원래 인덱스 데이터는 제거

In [112]:
titanic_data.index
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.5+ KB


In [113]:
# 불필요한 컬럼 제거
titanic_data.drop(['PassengerId', 'Ticket'], axis=1, inplace=True)

In [114]:
titanic_data.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

In [115]:
# (분석에 효과적인) 새로운 컬럼 추가
titanic_data["Parch"] + titanic_data["SibSp"]
titanic_data[['Parch', "SibSp"]].sum(axis=1)

# titanic_data["FamilySize"] = titanic_data["Parch"] + titanic_data["SibSp"]
titanic_data["FamilySize"] = titanic_data[['Parch', "SibSp"]].sum(axis=1)

In [116]:
titanic_data['FamilySize']

0      1
1      1
2      0
3      1
4      0
      ..
884    0
885    0
886    3
887    0
888    0
Name: FamilySize, Length: 889, dtype: int64

In [117]:
# 불필요한 컬럼 제거
titanic_data.drop(["Parch", "SibSp"], axis=1, inplace=True)

In [118]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,FamilySize
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,1
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,1
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,0


In [119]:
# Pclass별 생존 데이터

titanic_data['Pclass'].unique() # unique : 컬럼의 값을 중복을 제거하고 반환
titanic_data['Pclass'].value_counts() # 각 값별로 개수를 반환

titanic_data.groupby("Pclass")["Survived"].sum() # Pclass 별로 Survived의 합계 계산
titanic_data.groupby("Pclass")["Survived"].mean() # Pclass 별로 Survived의 평균 계산 (생존률)


Pclass
1    0.626168
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [120]:
# 성별 생존 데이터
titanic_data['Sex'].value_counts()
titanic_data.groupby('Sex')["Survived"].count()

titanic_data.groupby('Sex')["Survived"].mean()

Sex
female    0.740385
male      0.188908
Name: Survived, dtype: float64

In [121]:
# 나이별 생존 데이터
titanic_data.groupby("Age")["Survived"].mean()

# pd.cut : 범주를 나누고 각 데이터에 해당하는 범주 할당
age_group = pd.cut(titanic_data["Age"], bins=[0, 15, 50, 200], labels=["child", "adult", "elder"])
titanic_data["AgeGroup"] = age_group

titanic_data.groupby("AgeGroup")["Survived"].mean()

AgeGroup
child    0.590361
adult    0.363392
elder    0.333333
Name: Survived, dtype: float64

In [122]:
# Embarked별 생존 데이터

titanic_data["Embarked"].value_counts()

titanic_data.groupby("Embarked")["Survived"].mean()

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64

In [123]:
# 가족규모별 생존 데이터

titanic_data["FamilySize"].value_counts()

titanic_data.groupby("FamilySize")["Survived"].mean()

titanic_data["Single"] = titanic_data["FamilySize"].map(lambda v: "Single" if v == 0 else "Family")
titanic_data.groupby("Single")["Survived"].mean()

Single
Family    0.505650
Single    0.300935
Name: Survived, dtype: float64

In [124]:
# 요금(Fare)별 생존 데이터
titanic_data["Fare"].describe()

fare_group = pd.cut(titanic_data["Fare"], bins=[0, 8, 31, 600], labels=["low", "middle", "high"])
fare_group.value_counts()
titanic_data["FareGroup"] = fare_group

titanic_data.groupby("FareGroup")["Survived"].mean()

FareGroup
low       0.225664
middle    0.376168
high      0.577273
Name: Survived, dtype: float64

In [125]:
titanic_data.groupby(["Pclass", "Survived"])["Fare"].mean()

Pclass  Survived
1       0           64.684007
        1           95.840984
2       0           19.412328
        1           22.055700
3       0           13.669364
        1           13.694887
Name: Fare, dtype: float64

In [126]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,FamilySize,AgeGroup,Single,FareGroup
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,1,adult,Family,low
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,1,adult,Family,high
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,0,adult,Single,low
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,1,adult,Family,high
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,0,adult,Single,middle


In [127]:
# 나이 결측치를 전체평균으로 대체 -> 부분 그룹별 평균으로 대체
titanic_data["Name"]
title = titanic_data["Name"].str.extract("([A-Za-z]+)\.") # .앞에 1개 이상의 영문자 집합 추출
titanic_data["Title"] = title

In [128]:
titanic_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,FamilySize,AgeGroup,Single,FareGroup,Title
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,1,adult,Family,low,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,1,adult,Family,high,Mrs
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,0,adult,Single,low,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,1,adult,Family,high,Mrs
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,0,adult,Single,middle,Mr


In [129]:
age_means = titanic_data.groupby("Title")["Age"].mean()
age_means

age_means_dict = age_means.to_dict()
age_means_dict


{'Capt': 70.0,
 'Col': 58.0,
 'Countess': 33.0,
 'Don': 40.0,
 'Dr': 40.23458467094703,
 'Jonkheer': 38.0,
 'Lady': 48.0,
 'Major': 48.5,
 'Master': 7.080959269662921,
 'Miss': 23.2492560059594,
 'Mlle': 24.0,
 'Mme': 24.0,
 'Mr': 31.740636423402076,
 'Mrs': 34.82996432131207,
 'Ms': 28.0,
 'Rev': 43.166666666666664,
 'Sir': 49.0}

In [130]:
# Title 이름 -> age 평균 변환 테스트
titanic_data["Title"]
# titanic_data["Title"].map(lambda v: age_means_dict[v])
titanic_data["Title"].map(age_means_dict) # map 함수에는 dict 전달하면 key에 해당하는 값 반환
#age_means_dict

0      31.740636
1      34.829964
2      23.249256
3      34.829964
4      31.740636
         ...    
884    43.166667
885    23.249256
886    23.249256
887    31.740636
888    31.740636
Name: Title, Length: 889, dtype: float64

In [141]:
# 처음에 Age가 nan이었던 행 찾기
na_indexer = original_titanic_data["Age"].isna()
original_titanic_data[na_indexer] # Age가 na인 행 뽑기
age_na_index = original_titanic_data[na_indexer].index # age가 na인 행 번호 뽑기

# Age가 na인 곳에 title에 해당하는 평균 나이 저장
titanic_data.loc[age_na_index, "Age"] = titanic_data.loc[age_na_index, "Title"].map(age_means_dict)

In [146]:
titanic_data["Imputed"] = 0

titanic_data.loc[age_na_index, "Imputed"] = 1

titanic_data.head(10)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Embarked,FamilySize,AgeGroup,Single,FareGroup,Title,Imputed
0,0,3,"Braund, Mr. Owen Harris",male,22.0,7.25,S,1,adult,Family,low,Mr,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,71.2833,C,1,adult,Family,high,Mrs,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,7.925,S,0,adult,Single,low,Miss,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,53.1,S,1,adult,Family,high,Mrs,0
4,0,3,"Allen, Mr. William Henry",male,35.0,8.05,S,0,adult,Single,middle,Mr,0
5,0,3,"Moran, Mr. James",male,31.740636,8.4583,Q,0,adult,Single,middle,Mr,1
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,51.8625,S,0,elder,Single,high,Mr,0
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,21.075,S,4,child,Family,middle,Master,0
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,11.1333,S,2,adult,Family,middle,Mrs,0
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,30.0708,C,1,child,Family,middle,Mrs,0
