# Series 타입에 대한 이해
- numpy 타입의 확장된 형태

In [109]:
import numpy as np
import pandas as pd

## series 타입의 생성

In [110]:
# Build a Series from a plain list; pandas attaches a default integer index.
series1 = pd.Series(data=[1, 2, 3])
series1

0    1
1    2
2    3
dtype: int64

In [111]:
# Shape of the Series: a one-element tuple, i.e. it is one-dimensional.
series1.shape

(3,)

- 그렇게 안보이지만 1차원 배열 형태와 동일
- 기본적으로는 배열과 동일하지만, 인덱스가 함께 표현

In [112]:
# Index labels of the Series (a default RangeIndex, since none was supplied).
series1.index

RangeIndex(start=0, stop=3, step=1)

In [113]:
# Underlying data of the Series as a numpy array.
series1.values

array([1, 2, 3])

In [114]:
# Same data as before, but with explicit string labels as the index.
series2 = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
series2

a    1
b    2
c    3
dtype: int64

In [115]:
# Look up a single value by its index label.
series2['a']

1

In [116]:
# Overwrite the value stored under label 'a' (mutates series2 in place).
series2['a'] = 10

In [117]:
# Display the Series to confirm the value under 'a' changed.
series2

a    10
b     2
c     3
dtype: int64

- 기본적으로는 배열과 동일하지만, 데이터를 다루기 편한 형태로 확장되었다. 

## 시리즈의 속성과 기능

In [118]:
# Sample data with one missing value at the end.
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
sample = pd.Series([1,1,2,1,2,3,5,6,1,2,2,3,4,6,3,3,3,4,5,6,6,np.nan])

In [119]:
# Size-related attributes of the Series, printed one value per line.
print(
    len(sample),   # number of elements in the Series
    sample.ndim,   # number of dimensions
    sample.shape,  # shape (size along each dimension)
    sep='\n',
)

22
1
(22,)


In [120]:
# Counting helpers of the Series, printed one result per line.
print(
    sample.size,            # number of elements, missing values included
    sample.count(),         # number of elements, missing values excluded
    sample.notna().sum(),   # same count, obtained by summing the "is not NA" mask
    sample.isna().sum(),    # number of missing values (sum of the "is NA" mask)
    sample.unique(),        # distinct values
    sample.value_counts(),  # frequency of each distinct value (missing values excluded)
    sep='\n',
)

22
21
21
1
[ 1.  2.  3.  5.  6.  4. nan]
3.0    5
6.0    4
2.0    4
1.0    4
4.0    2
5.0    2
dtype: int64


## 기초(기술) 통계량

In [121]:
# Descriptive statistics of the Series, printed one result per line.
print(
    # Spread of the data
    sample.var(),          # variance
    sample.std(),          # standard deviation
    # Central tendency
    sample.mean(),         # mean, missing values excluded
    sample.median(),       # median
    # Extremes
    sample.min(),          # minimum
    sample.max(),          # maximum
    # Quartiles
    sample.quantile(.25),  # value at 25% of the data (1st quartile)
    sample.quantile(.50),  # value at 50% of the data (median / 2nd quartile)
    sample.quantile(.75),  # value at 75% of the data (3rd quartile)
    sample.describe(),     # summary of the statistics above
    sep='\n',
)

3.2142857142857144
1.7928429140015905
3.2857142857142856
3.0
1.0
6.0
2.0
3.0
5.0
count    21.000000
mean      3.285714
std       1.792843
min       1.000000
25%       2.000000
50%       3.000000
75%       5.000000
max       6.000000
dtype: float64


# DataFrame

## CSV(Comma Separated Values)
- 엑셀파일이 아닙니다. 
- 일반 텍스트 파일인데 엑셀에서 해당 형식을 지원
- 데이터의 구분을 콤마를 사용해서 표현
    - 다른 특수문자를 구분자로 사용하는 것도 가능

In [122]:
# Load the training data from a CSV file into a DataFrame.
train = pd.read_csv('./data/train.csv')

In [123]:
# train.head(2) # show only the first 2 rows (default is 5)
train.tail(2) # show only the last 2 rows (default is 5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


## DataFrame?
- 시리즈의 확장된 형태, 여러개의 1차원 시리즈가 모여서 2차원 데이터 프레임을 구성
- 각 열은 하나의 시리즈가 된다. 

In [124]:
# Structural attributes of the DataFrame, printed one result per line.
print(
    train.ndim,     # number of dimensions
    train.shape,    # (rows, columns)
    train.index,    # row labels
    train.values,   # underlying data as a numpy array
    train.columns,  # column labels
    sep='\n',
)

2
(891, 12)
RangeIndex(start=0, stop=891, step=1)
[[1 0 3 ... 7.25 nan 'S']
 [2 1 1 ... 71.2833 'C85' 'C']
 [3 1 3 ... 7.925 nan 'S']
 ...
 [889 0 3 ... 23.45 nan 'S']
 [890 1 1 ... 30.0 'C148' 'C']
 [891 0 3 ... 7.75 nan 'Q']]
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [125]:
# train.describe(include='all')
# Summary statistics for every column (numeric and non-numeric), transposed so
# each original column becomes one row.
train.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
PassengerId,891,,,,446.0,257.354,1.0,223.5,446.0,668.5,891.0
Survived,891,,,,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891,,,,2.30864,0.836071,1.0,2.0,3.0,3.0,3.0
Name,891,891.0,"Boulos, Mr. Hanna",1.0,,,,,,,
Sex,891,2.0,male,577.0,,,,,,,
Age,714,,,,29.6991,14.5265,0.42,20.125,28.0,38.0,80.0
SibSp,891,,,,0.523008,1.10274,0.0,0.0,0.0,1.0,8.0
Parch,891,,,,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Ticket,891,681.0,1601,7.0,,,,,,,
Fare,891,,,,32.2042,49.6934,0.0,7.9104,14.4542,31.0,512.329


In [126]:
# Print per-column non-null counts and dtypes, plus memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## indexing
- 데이터 프레임의 인덱싱은 기본 '열' 인덱싱이다. 
- 즉, 시리즈를 기본으로 인덱싱을 한다.

In [127]:
# train['Survived']
# Passing a list of column names selects those columns as a DataFrame.
train[['Survived', 'Sex']]

Unnamed: 0,Survived,Sex
0,0,male
1,1,female
2,1,female
3,1,female
4,0,male
...,...,...
886,0,male
887,1,female
888,0,female
889,1,male


## 행을 기준으로 index
- iloc
- loc

In [128]:
# Slicing (syntax as in numpy)
# train.iloc[0:10, 1:]
# NOTE: unlike .iloc/numpy, .loc slices by label and includes the end label,
# so 0:10 selects labels 0 through 10.
train.loc[0:10, 'Survived':]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


## 조건 검색

In [129]:
# Applying fancy indexing (boolean index)
# idx = train['Pclass'] == 1
# train.loc[idx, ['Survived']]
# train.loc[train['Pclass'] == 1, ['Survived', 'Sex']]
# Embarkation-port counts for first-class female passengers.
train.loc[
    train['Pclass'].eq(1) & train['Sex'].eq('female'),
    'Embarked',
].value_counts()

S    48
C    43
Q     1
Name: Embarked, dtype: int64

In [130]:
# train['Cabin'].isnull()
# train['Cabin'].isna()
# A pandas DataFrame exposes its columns as attributes.
# train.Cabin.isna()

# train.loc[train.Cabin.notna()]
# train.loc[train.Cabin.notnull(), 'Pclass'].value_counts()
# Number of passengers per ticket class, using attribute-style column access.
train.Pclass.value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [131]:
# train.loc[train.Embarked.isna()] # interesting: why do they share a ticket number?
# train.duplicated(subset=['Ticket'], keep=False)
# Passengers whose ticket number appears more than once although they have no
# siblings/spouse (SibSp) or parents/children (Parch) aboard, ordered by ticket.
train.loc[
    train.duplicated(subset=['Ticket'], keep=False)
    & (train.SibSp == 0)
    & (train.Parch == 0)
].sort_values(by=['Ticket'])
# train.Ticket.duplicated(keep=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
257,258,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,110152,86.500,B77,S
504,505,1,1,"Maioni, Miss. Roberta",female,16.0,0,0,110152,86.500,B79,S
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.500,B77,S
475,476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52.000,A14,S
110,111,0,1,"Porter, Mr. Walter Chamberlain",male,47.0,0,0,110465,52.000,C110,S
...,...,...,...,...,...,...,...,...,...,...,...,...
537,538,1,1,"LeRoy, Miss. Bertha",female,30.0,0,0,PC 17761,106.425,,C
772,773,0,2,"Mack, Mrs. (Mary)",female,57.0,0,0,S.O./P.P. 3,10.500,E77,S
841,842,0,2,"Mudd, Mr. Thomas Charles",male,16.0,0,0,S.O./P.P. 3,10.500,,S
385,386,0,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.500,,S


## 결측치 처리 하는 방법

In [132]:
# Deletion
# train.Embarked.isna().sum()
# train.loc[train.Embarked.isna()]
train.dropna(subset=['Embarked']) # recommended: returns a new DataFrame with the missing rows removed
# train.dropna(subset=['Embarked'], inplace=True) # caution: the rows are removed from the current DataFrame itself

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [133]:
# Filling missing values
# What to fill with? For a real-valued column: the mean, the median, ...
# train.Age.isna().sum() # dropping all 177 rows would be too costly ...
mean = train.Age.mean()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# `train.Age`: the in-place call acts on an intermediate object and is not
# guaranteed to update `train` (pandas warns about it, and under copy-on-write
# it has no effect on the frame).
train['Age'] = train['Age'].fillna(mean)

## 그룹화

In [134]:
# Group the rows by survival flag and count the rows in each group.
grouped = train.groupby(['Survived'])
grouped.size()

Survived
0    549
1    342
dtype: int64

In [135]:
# Group by three keys at once; the result is indexed by (Survived, Sex, Pclass).
grouped = train.groupby(['Survived', 'Sex', 'Pclass'])
print(grouped.size())
# Per-group mean of Age and Fare.
grouped.agg({
    'Age':'mean',
    'Fare': 'mean'
})

Survived  Sex     Pclass
0         female  1           3
                  2           6
                  3          72
          male    1          77
                  2          91
                  3         300
1         female  1          91
                  2          70
                  3          72
          male    1          45
                  2          17
                  3          47
dtype: int64


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Age,Fare
Survived,Sex,Pclass,Unnamed: 3_level_1,Unnamed: 4_level_1
0,female,1,25.666667,110.604167
0,female,2,36.0,18.25
0,female,3,25.206736,19.773093
0,male,1,41.489427,62.89491
0,male,2,33.086745,19.488965
0,male,3,27.948083,12.204469
1,female,1,34.420792,105.978159
1,female,2,28.127118,22.288989
1,female,3,22.930249,12.464526
1,male,1,35.520346,74.63732


## merge(join)
- 두 자료간의 데이터를 합치는 방법
- 여러 소스로부터 가져온 데이터를 분석하기 위해서는 반드시 하나의 데이터 프레임 이어야만 한다.

In [136]:
# Load the two tables that the merge examples combine.
user_device = pd.read_csv('./data/user_device.csv')
user_usage = pd.read_csv('./data/user_usage.csv')

In [137]:
# Show the first two rows, then the total row count.
display( user_device.head(2) )
len(user_device)

Unnamed: 0,use_id,user_id,platform,platform_version,device,use_type_id
0,22782,26980,ios,10.2,"iPhone7,2",2
1,22783,29628,android,6.0,Nexus 5,3


272

In [138]:
# Show the first two rows, then the total row count.
display( user_usage.head(2) )
len(user_usage)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id
0,21.97,4.82,1557.33,22787
1,1710.08,136.88,7267.55,22788


240

### inner join
- 두 자료 모두에 존재하는 데이터만 합친다. 
- 교집합(A n B)
- 판다스의 기본값은 inner join이다. 
- 즉, 샘플에서는 'use_id'가 동일한 데이터만 합쳐진 것이다. 

In [139]:
# Inner join on the shared 'use_id' key (how='inner' is the merge default):
# only rows whose key exists in both tables are kept.
result = user_usage.merge(user_device, how='inner', on='use_id')
display(result.head(2))
len(result)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921,android,4.3,GT-I9505,1
1,1710.08,136.88,7267.55,22788,28714,android,6.0,SM-G930F,1


159

### left join
- left 자료를 기준으로 합쳐 줍니다. 
- A u (A n B)

In [140]:
# Left join on 'use_id': every row of user_usage is kept; device columns are
# missing where user_device has no matching key.
result = user_usage.merge(user_device, how='left', on='use_id')
display(result.head(2))
len(result)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921.0,android,4.3,GT-I9505,1.0
1,1710.08,136.88,7267.55,22788,28714.0,android,6.0,SM-G930F,1.0


240

In [141]:
# Missing values per column after the join.
result.isna().sum()

outgoing_mins_per_month     0
outgoing_sms_per_month      0
monthly_mb                  0
use_id                      0
user_id                    81
platform                   81
platform_version           81
device                     81
use_type_id                81
dtype: int64

### full outer join
- 전체 자료를 중복과 상관없이 합쳐 줍니다. 
- A u B

In [142]:
# Full outer join on 'use_id': rows from both tables are kept, matched where
# the key exists on both sides.
result = user_usage.merge(user_device, how='outer', on='use_id')
display(result.head(2))
len(result)

Unnamed: 0,outgoing_mins_per_month,outgoing_sms_per_month,monthly_mb,use_id,user_id,platform,platform_version,device,use_type_id
0,21.97,4.82,1557.33,22787,12921.0,android,4.3,GT-I9505,1.0
1,1710.08,136.88,7267.55,22788,28714.0,android,6.0,SM-G930F,1.0


353

In [143]:
# Missing values per column after the join.
result.isna().sum()

outgoing_mins_per_month    113
outgoing_sms_per_month     113
monthly_mb                 113
use_id                       0
user_id                     81
platform                    81
platform_version            81
device                      81
use_type_id                 81
dtype: int64

# 데이터 전처리의 기초(붙이고, 자르고, 지우고, 채우고)
-  test2에 있는 survived 항목을 test로 join 합시다. 

In [144]:
# test: the frame that will receive labels; test2: the frame that provides them.
test = pd.read_csv('./data/test.csv')
test2 = pd.read_csv('./data/titanic_sample.csv')

## 중복되는 항목 체크

In [145]:
# Every row whose Name occurs more than once (keep=False flags all occurrences).
test.loc[test.duplicated(subset=['Name'], keep=False)]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


- test에는 중복되는 항목이 없다.

In [146]:
# Every row whose name occurs more than once (keep=False flags all occurrences).
test2.loc[test2.duplicated(subset=['name'], keep=False)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
725,3,1,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,?,Q,13,?,Ireland
726,3,0,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,?,Q,?,?,Ireland
924,3,0,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,?,Q,?,70,?
925,3,0,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,?,S,?,?,?


- `name`필드를 기준으로 조인을 할 것이기 때문에, 중복은 우선 제거
- `train` 자료에 존재하는 승객은 제거를 해주자.
- 그러면, `train` 자료로부터 승객의 이름을 조회해보면 다음과 같다. 

In [147]:
# Look up the two duplicated passenger names in the training data.
train.loc[train['Name'].isin(['Connolly, Miss. Kate', 'Kelly, Mr. James'])]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
289,290,1,3,"Connolly, Miss. Kate",female,22.0,0,0,370373,7.75,,Q
696,697,0,3,"Kelly, Mr. James",male,44.0,0,0,363592,8.05,,S


-  22살의 'Connolly, Miss. Kate'와 44살의 'Kelly, Mr. James'는 `train` 자료에 존재하는 승객이므로, 
- `test2` 자료에서 제거를 해주자.

In [148]:
# Remove the 44-year-old 'Kelly, Mr. James' from test2.
# The age is compared numerically through pd.to_numeric so the match works
# whether `age` was parsed as strings (this file uses '?' as a placeholder) or
# as floats; entries that cannot be parsed become NaN and never match.
drop_idx = test2.loc[
    (test2['name'] == 'Kelly, Mr. James')
    & (pd.to_numeric(test2['age'], errors='coerce') == 44)
].index
test2.drop(index=drop_idx, inplace=True)

In [149]:
# Remove the 22-year-old 'Connolly, Miss. Kate' from test2.
# The age is compared numerically through pd.to_numeric so the match works
# whether `age` was parsed as strings (this file uses '?' as a placeholder) or
# as floats; entries that cannot be parsed become NaN and never match.
drop_idx = test2.loc[
    (test2['name'] == 'Connolly, Miss. Kate')
    & (pd.to_numeric(test2['age'], errors='coerce') == 22)
].index
test2.drop(index=drop_idx, inplace=True)

In [150]:
# Re-run the duplicate-name check; an empty result means no duplicates remain.
test2.loc[test2.duplicated(subset=['name'], keep=False)]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


- 삭제후 중복되는 항목을 다시 조회해보면 중복되는 항목이 없음을 확인할 수 있다. 

## 자르고

In [151]:
# Keep only the label column and the join key.
test2 = test2.loc[:, ['survived', 'name']]

In [152]:
# Display the trimmed frame (pandas truncates long frames in the output).
test2

Unnamed: 0,survived,name
0,1,"Allen, Miss. Elisabeth Walton"
1,1,"Allison, Master. Hudson Trevor"
2,0,"Allison, Miss. Helen Loraine"
3,0,"Allison, Mr. Hudson Joshua Creighton"
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)"
...,...,...
1304,0,"Zabour, Miss. Hileni"
1305,0,"Zabour, Miss. Thamine"
1306,0,"Zakarian, Mr. Mapriededer"
1307,0,"Zakarian, Mr. Ortin"


## 합치고

In [180]:
# Left join: keep every row of `test` and attach the label from `test2` where
# the passenger names match (the key columns are named differently).
testMerge = test.merge(test2, how='left', left_on='Name', right_on='name')

In [181]:
# Display the merged frame (pandas truncates long frames in the output).
testMerge

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,survived,name
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0,"Kelly, Mr. James"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1.0,"Wilkes, Mrs. James (Ellen Needs)"
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0,"Myles, Mr. Thomas Francis"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0,"Wirz, Mr. Albert"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1.0,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0.0,"Spector, Mr. Woolf"
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1.0,"Oliva y Ocana, Dona. Fermina"
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0.0,"Saether, Mr. Simon Sivertsen"
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0.0,"Ware, Mr. Frederick"


## 지우고

In [182]:
# Missing values per column after the merge.
testMerge.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
survived        24
name            24
dtype: int64

- `name`과 `survived` 항목에 결측치가 있음을 확인

In [183]:
# Rows of `test` for which the join found no match (right-side `name` is missing).
testMerge.loc[testMerge['name'].isna()].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,survived,name
19,911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45.0,0,0,2696,7.225,,C,,
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S,,
35,927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C,,
49,941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36.0,0,2,C.A. 37671,15.9,,S,,
52,944,2,"Hocking, Miss. Ellen Nellie""""",female,20.0,2,1,29105,23.0,,S,,


- `survived`항목과 `name` 항목이 비어 있음을 알 수 있다. 
- 왜 합쳐지지 않았는지 확인해보자

In [184]:
# Search test2 by substring to see how one of the unmatched names is spelled there.
test2.loc[test2['name'].str.contains('Assaf Khalil')]

Unnamed: 0,survived,name
647,1,"Assaf Khalil, Mrs. Mariana ('Miriam')"


- `test`자료와 `test2` 자료의 이름이 다르게 들어가 있음을 알 수 있다. 
- 자료를 수집하는 과정에서 발생한 오류였는지 `test`자료의 이름이 틀린건지는 확인할 수 없다. 
- 이런 경우는 문자열을 하나하나 바로 잡는 것보다 지우는게 훨씬 쉽다. 

In [189]:
# Drop the rows that found no match in test2 (mutates testMerge in place, so
# re-running earlier display cells will show the reduced frame).
testMerge.dropna(subset=['name'], inplace=True)
# Re-check the missing-value counts per column.
testMerge.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             78
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          304
Embarked         0
survived         0
name             0
dtype: int64

## 마무리 
- 자료를 합치는 과정에서 발생한 불필요한 컬럼은 제거해주고, 
- 새로운 자료의 타입을 변환하고, 네이밍 컨벤션을 통일 시켜줍시다. 

In [191]:
# Preview the first rows of the merged frame.
testMerge.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,survived,name
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0,"Kelly, Mr. James"
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1.0,"Wilkes, Mrs. James (Ellen Needs)"
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0,"Myles, Mr. Thomas Francis"
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0,"Wirz, Mr. Albert"
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1.0,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"


- `name` 항목은 더 이상 불필요 하므로, 제거해주자. 

In [192]:
# Remove the right-side join key; it duplicates `Name` (mutates testMerge in place).
testMerge.drop(columns=['name'], inplace=True)
testMerge.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0.0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0.0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0.0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1.0


- `survived` 항목을 `train` 데이터의 `Survived` 항목과 타입을 일치시켜 주자 

In [195]:
# The left join turned the label column into floats; cast it back to 64-bit integers.
testMerge['survived'] = testMerge['survived'].astype('int64')
testMerge.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


- 마지막으로 `survived` 항목의 이름을 대문자로 시작하도록 변경해주자. 

In [196]:
# Rename the label column so it follows the capitalised naming of the other columns.
testMerge = testMerge.rename(columns={'survived': 'Survived'})
testMerge.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [199]:
# Write the labeled test set to CSV; index=False leaves the row index out of the file.
testMerge.to_csv('./data/test_with_label.csv', encoding='utf-8', index=False)