In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic-dl/train.csv
/kaggle/input/titanic-dl/test.csv


# 라이브러리 및 데이터 임포트

In [2]:
import numpy as np
import pandas as pd
import torch
import warnings 
warnings.filterwarnings('ignore')

In [3]:
df_train = pd.read_csv('/kaggle/input/titanic-dl/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic-dl/test.csv')

In [4]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Data Imputation

In [5]:
# 전반적인 것을 확인하기 위해 concat 진행
data_total = pd.concat([df_train, df_test])

In [6]:
data_total.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [7]:
data_total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


- Titanic Dataset에서 AGE는 생존을 결정하는 중요한 요소이며, 일반적으로 어린이는 생존 확률이 높고 노인은 생존 확률이 낮다고 논의되어 옴
- 따라서 AGE를 추가적으로 보강해줄 예정!

## AGE 보강 방법
- PClass가 같은 사람끼리 평균 연령이 더 유사하지 않을까?
- 성별과 등급을 기준으로 사람을 그룹화하면 어떨까?
- 특히, Name에서 Mr과 Master는 남성과 남아 아동, Mrs.와 Miss.는 기혼 여성과 미혼 여성을 의미함
    - 이를 사용해서 분석해보자!

In [8]:
display(data_total[(data_total.Age.isnull()) & (data_total.Name.str.contains('Master'))])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
65,66,1.0,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
159,160,0.0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
176,177,0.0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
709,710,1.0,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
244,1136,,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S
339,1231,,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C
344,1236,,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S
417,1309,,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [9]:
np.mean(data_total.Age)

29.881137667304014

- Index: 159
    - 8 Children, pClass == 3, Male
    - 어린이임에도 불구하고, 그냥 평균값으로 넣었다면 약 30세로 들어갔을 것임

In [10]:
print(df_train[df_train.Name.str.contains('Master')]['Age'].mean())

4.574166666666667


- 테스트 데이터에서 훈련 데이터로의 정보 유출을 방지하기 위해 train_data만 사용해줌
- 필터를 사용하여 Name에 'Master'가 포함된 모든 행을 가져온 뒤 .mean() 함수로 훈련 데이터에 있는 남자 어린이(Master) 전체의 평균 연령을 구해줌 (위 코드는 Pclass로 필터링하지 않으므로 Pclass 3만의 평균은 아님)

In [11]:
display((data_total[(data_total.Age.isnull()) & (data_total.Name.str.contains('Master')) & (data_total.Parch==0)]))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
339,1231,,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C


- parch: 본인 제외 승선한 부모/자식의 총 인원 수
    - Master인 경우, 0이 아닌 수를 가져야만 하는데 혹시 모를 경우를 위해 확인!
    - 부모가 없이 여행하는 경우가 단 한 건 있어서, 이 사람은 Master의 최대값인 14를 넣어줌

In [12]:
df_test.loc[df_test.PassengerId==1231,'Age']=14

In [13]:
# Extract the honorific from Name: the run of letters that follows a space and is
# terminated by a period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# The pattern is a raw string so that `\.` is a regex escape and not an invalid
# Python string escape sequence.
df_train['Title'], df_test['Title'] = [
    df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False) for df in [df_train, df_test]
]

In [14]:
df_train.groupby(['Title', 'Pclass'])['Age'].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Title,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Capt,1,70.0,1
Col,1,58.0,2
Countess,1,33.0,1
Don,1,40.0,1
Dr,1,43.75,4
Dr,2,38.5,2
Jonkheer,1,38.0,1
Lady,1,48.0,1
Major,1,48.5,2
Master,1,5.306667,3


- Title이 너무 많음, 중요한 타이틀만 만들기!

In [15]:
# Lookup table that collapses the raw honorifics found in Name into a few coarse groups.
TitleDict = {
    # military ranks
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    # nobility plus the professional titles grouped with it
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Royalty",
    "Rev": "Royalty",
    "Countess": "Royalty",
    # French / alternative spellings folded into the common titles
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    # common titles map to themselves
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

- Dr와 Rev는 정확하게 Royalty는 아니지만, 가능한 한 연령대로 일치시키려고 함

In [16]:
df_train['Title'], df_test['Title'] = [df.Title.map(TitleDict) for df in [df_train, df_test]]

##Let us now reprint the groups
df_train.groupby(['Title', 'Pclass'])['Age'].agg(['mean', 'count'])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
Title,Pclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Master,1,5.306667,3
Master,2,2.258889,9
Master,3,5.350833,24
Miss,1,29.744681,47
Miss,2,22.390625,32
Miss,3,16.123188,69
Mr,1,41.58046,87
Mr,2,32.768293,82
Mr,3,28.724891,229
Mrs,1,40.4,35


In [17]:
# 모든 타이틀이.. 특히 테스트 데이터에 적용되는지 확인!
combined = pd.concat([df_train, df_test], axis=0)
display(df_train[df_train.Title.isnull()])
display(df_test[df_test.Title.isnull()])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C,


In [18]:
## The title 'Dona' is not a key of TitleDict, so mapping left that test row's Title as NaN.
## Select the row by its content rather than by a hard-coded index label, so the fix
## keeps working if the test frame is re-indexed or re-ordered.
is_unmapped_dona = df_test.Title.isnull() & df_test.Name.str.contains(' Dona.', regex=False)
df_test.loc[is_unmapped_dona, 'Title'] = 'Royalty'

- 여성 어린이를 알 수 있는 방법이 없음
    - 'Miss'인데, Parch 값이 0보다 크면(부모/자녀와 함께 탑승) 여성 어린이일 가능성이 높음

In [19]:
print ("Avg age of 'Miss' Title:", round(df_train[df_train.Title=="Miss"]['Age'].mean()))

print ("Avg age of 'Miss' Title travelling without Parents:", round(df_train[(df_train.Title=="Miss") & (df_train.Parch==0)]['Age'].mean()))

print ("Avg age of 'Miss' Title travelling with Parents:", round(df_train[(df_train.Title=="Miss") & (df_train.Parch!=0)]['Age'].mean()), '\n')

Avg age of 'Miss' Title: 22
Avg age of 'Miss' Title travelling without Parents: 28
Avg age of 'Miss' Title travelling with Parents: 12 



- 차이가 굉장히 큼.....

In [20]:
display(combined[combined.Fare.isnull()])

## 인당 요금 확인
for df in [df_train, df_test, combined]:
    df['PeopleInTicket']=df['Ticket'].map(combined['Ticket'].value_counts())
    df['FarePerPerson']=df['Fare']/df['PeopleInTicket']

## Port S와 PClass의 평균 요금 사용해서 채우기
print('Mean fare for this category: ', df_train[(df_train.Embarked=='S') & (df_train.Pclass==3)]['FarePerPerson'].mean())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
152,1044,,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S,Mr


Mean fare for this category:  7.382647773383368


In [21]:
df_test.loc[df_test.Fare.isnull(), ['Fare','FarePerPerson']] = round(df_train[(df_train.Embarked=='S') & (df_train.Pclass==3) & (df_train.PeopleInTicket==1)]['Fare'].mean(),1)

In [22]:
display(combined[combined.Embarked.isnull()])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,PeopleInTicket,FarePerPerson
61,62,1.0,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,Miss,2,40.0
829,830,1.0,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,Mrs,2,40.0


In [23]:
df_train[(df_train.Pclass==1)].groupby('Embarked').agg({'FarePerPerson': 'mean', 'Fare': 'mean', 'PassengerId': 'count'})

Unnamed: 0_level_0,FarePerPerson,Fare,PassengerId
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,39.179223,104.718529,85
Q,30.0,90.0,2
S,30.211849,70.364862,127


In [24]:
df_train[(df_train.Pclass==1) & (df_train.PeopleInTicket==2) & (df_train.Age>18)].groupby('Embarked').agg({'FarePerPerson': 'mean', 'Fare': 'mean', 'PassengerId': 'count'})

Unnamed: 0_level_0,FarePerPerson,Fare,PassengerId
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,35.073648,70.147296,23
S,31.702411,63.404822,32


In [25]:
# Compare ports for passengers resembling the two rows with a missing Embarked value:
# first class, cabin recorded, two people on the ticket, adult female.
print(df_train[(~df_train.Cabin.isnull()) & (df_train.Pclass==1) & (df_train.PeopleInTicket==2) & (df_train.Sex=="female") & (df_train.Age>18)].groupby('Embarked').agg({'FarePerPerson': 'mean', 'Fare': 'mean', 'PassengerId': 'count'}))

##Still port C comes out as a winner in all cases. We will go ahead with this
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: the chained in-place form modifies a temporary object
# (and therefore does nothing) when pandas Copy-on-Write is active.
df_train['Embarked'] = df_train['Embarked'].fillna('C')

          FarePerPerson      Fare  PassengerId
Embarked                                      
C              36.19730  72.39460           10
S              31.61042  63.22084           10


In [26]:
# Age statistics per (Pclass, Sex, Title) group before re-labelling.
# The aggregations are passed as a list (not a set) so the column order of the
# printed table is the same on every run.
print(df_train.groupby(['Pclass','Sex','Title'])['Age'].agg(['mean', 'median', 'count']))

# Re-label a 'Miss' who has a non-zero Parch and shares her ticket with at least one
# other person as "FemaleChild".
for df in [df_train, df_test, combined]:
    df.loc[(df.Title=='Miss') & (df.Parch!=0) & (df.PeopleInTicket>1), 'Title']="FemaleChild"

# Rows in the new group that still have no Age.
display(combined[(combined.Age.isnull()) & (combined.Title=='FemaleChild')])

                       median       mean  count
Pclass Sex    Title                            
1      female Miss       30.0  29.744681     47
              Mrs        40.0  40.400000     35
              Royalty    48.0  43.333333      3
       male   Master      4.0   5.306667      3
              Mr         40.0  41.580460     87
              Officer    56.0  56.600000      5
              Royalty    42.0  42.166667      6
2      female Miss       24.0  22.390625     32
              Mrs        31.5  33.547619     42
       male   Master      1.0   2.258889      9
              Mr         31.0  32.768293     82
              Royalty    46.5  42.000000      8
3      female Miss       18.0  16.123188     69
              Mrs        31.0  33.515152     33
       male   Master      4.0   5.350833     24
              Mr         26.0  28.724891    229


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,PeopleInTicket,FarePerPerson
128,129,1.0,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C,FemaleChild,3,7.452767
180,181,0.0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S,FemaleChild,11,6.322727
229,230,0.0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S,FemaleChild,5,5.09334
409,410,0.0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S,FemaleChild,5,5.09334
485,486,0.0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S,FemaleChild,5,5.09334
792,793,0.0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S,FemaleChild,11,6.322727
863,864,0.0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S,FemaleChild,11,6.322727
888,889,0.0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,FemaleChild,4,5.8625
188,1080,,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S,FemaleChild,11,6.322727


In [27]:
grp = df_train.groupby(['Pclass','Sex','Title'])['Age'].mean()
print(grp)

Pclass  Sex     Title      
1       female  FemaleChild    21.071429
                Miss           33.424242
                Mrs            40.400000
                Royalty        43.333333
        male    Master          5.306667
                Mr             41.580460
                Officer        56.600000
                Royalty        42.166667
2       female  FemaleChild     9.916667
                Miss           29.875000
                Mrs            33.547619
        male    Master          2.258889
                Mr             32.768293
                Royalty        42.000000
3       female  FemaleChild     6.500000
                Miss           21.590909
                Mrs            33.515152
        male    Master          5.350833
                Mr             28.724891
Name: Age, dtype: float64


In [28]:
print('\n', 'This so called lookup table is actually similar to a list: ', type(grp))


 This so called lookup table is actually similar to a list:  <class 'pandas.core.series.Series'>


In [29]:
grp = df_train.groupby(['Pclass','Sex','Title'])['Age'].mean().reset_index()[['Sex', 'Pclass', 'Title', 'Age']]

print('\n', 'We converted the series object to: ', type(grp))


 We converted the series object to:  <class 'pandas.core.frame.DataFrame'>


In [30]:
print('\n', 'Lookup works like a charm now but not quite: ', grp[(grp.Pclass==2) & (grp.Sex=='male') & (grp.Title=='Master')]['Age'])


 Lookup works like a charm now but not quite:  11    2.258889
Name: Age, dtype: float64


In [31]:
print('\n', 'Aah! Perfect: ', grp[(grp.Pclass==2) & (grp.Sex=='male') & (grp.Title=='Master')]['Age'].values[0])


 Aah! Perfect:  2.2588888888888885


In [32]:
def fill_age(x):
    """Return a replacement Age for row ``x`` from the ``grp`` lookup table.

    ``x`` is a DataFrame row exposing ``Pclass``, ``Sex`` and ``Title``. ``grp`` is the
    module-level table with columns Sex, Pclass, Title and Age (the mean training Age
    per group).

    Lookup order:
      1. the Age of the exact (Pclass, Sex, Title) group;
      2. if that group is absent (or its Age is NaN), the mean of the Age values of
         all groups in ``grp`` that share the row's Title;
      3. if the Title is absent too, the mean of every Age value in ``grp``.

    Steps 2 and 3 average the per-group means stored in ``grp`` (not individual
    passengers). They exist so that a combination missing from the training data does
    not abort the whole imputation with an IndexError.
    """
    exact_group = (grp.Pclass == x.Pclass) & (grp.Sex == x.Sex) & (grp.Title == x.Title)
    exact_ages = grp.loc[exact_group, 'Age'].dropna()
    if not exact_ages.empty:
        return exact_ages.values[0]

    same_title_ages = grp.loc[grp.Title == x.Title, 'Age'].dropna()
    if not same_title_ages.empty:
        return same_title_ages.mean()

    return grp['Age'].mean()
## Impute Age in the training and the test frame: a row whose Age is NaN receives the
## value returned by fill_age, every other row keeps its recorded Age.
for frame in (df_train, df_test):
    frame['Age'] = frame.apply(
        lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'],
        axis=1,
    )

## Rebuild the stacked train + test frame so that it reflects the imputed values
combined = pd.concat([df_train, df_test], axis=0)

In [33]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     1309 non-null   int64  
 1   Survived        891 non-null    float64
 2   Pclass          1309 non-null   int64  
 3   Name            1309 non-null   object 
 4   Sex             1309 non-null   object 
 5   Age             1309 non-null   float64
 6   SibSp           1309 non-null   int64  
 7   Parch           1309 non-null   int64  
 8   Ticket          1309 non-null   object 
 9   Fare            1309 non-null   float64
 10  Cabin           295 non-null    object 
 11  Embarked        1309 non-null   object 
 12  Title           1309 non-null   object 
 13  PeopleInTicket  1309 non-null   int64  
 14  FarePerPerson   1309 non-null   float64
dtypes: float64(4), int64(5), object(6)
memory usage: 163.6+ KB


In [34]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Sex             891 non-null    object 
 5   Age             891 non-null    float64
 6   SibSp           891 non-null    int64  
 7   Parch           891 non-null    int64  
 8   Ticket          891 non-null    object 
 9   Fare            891 non-null    float64
 10  Cabin           204 non-null    object 
 11  Embarked        891 non-null    object 
 12  Title           891 non-null    object 
 13  PeopleInTicket  891 non-null    int64  
 14  FarePerPerson   891 non-null    float64
dtypes: float64(3), int64(6), object(6)
memory usage: 104.5+ KB


In [35]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     418 non-null    int64  
 1   Pclass          418 non-null    int64  
 2   Name            418 non-null    object 
 3   Sex             418 non-null    object 
 4   Age             418 non-null    float64
 5   SibSp           418 non-null    int64  
 6   Parch           418 non-null    int64  
 7   Ticket          418 non-null    object 
 8   Fare            418 non-null    float64
 9   Cabin           91 non-null     object 
 10  Embarked        418 non-null    object 
 11  Title           418 non-null    object 
 12  PeopleInTicket  418 non-null    int64  
 13  FarePerPerson   418 non-null    float64
dtypes: float64(3), int64(5), object(6)
memory usage: 45.8+ KB


---

In [36]:
input_dim = 4
hidden_dim = 6
output_dim = 1

In [37]:
EPOCHS = 10

In [38]:
X_data = []
X_test_data = []
Y_data = []

In [39]:
# Prepare the tensors used for training and inference
def data():
    """Encode ``Sex`` numerically and publish the model tensors as module globals.

    Reads the module-level ``df_train`` and ``df_test`` frames and, in both of them,
    rewrites 'male' to 0 and 'female' to 1 in the ``Sex`` column before converting the
    column with ``pd.to_numeric``. The frames are modified in place.

    Sets the globals:
      X_data      -- float tensor of the Pclass, Sex, SibSp, Parch columns of df_train
      X_test_data -- float tensor of the same columns of df_test
      Y_data      -- float tensor of df_train["Survived"]
    """
    global X_data, X_test_data, Y_data

    target = df_train["Survived"]

    sex_codes = (('male', 0), ('female', 1))
    for frame in (df_train, df_test):
        for label, code in sex_codes:
            frame.loc[frame['Sex'] == label, 'Sex'] = code
        frame['Sex'] = pd.to_numeric(frame['Sex'])

    features = ['Pclass', 'Sex', 'SibSp', 'Parch']
    train_matrix = pd.get_dummies(df_train[features]).to_numpy()
    test_matrix = pd.get_dummies(df_test[features]).to_numpy()

    X_data = torch.from_numpy(train_matrix).float()
    X_test_data = torch.from_numpy(test_matrix).float()
    Y_data = torch.from_numpy(target.to_numpy()).float()

In [40]:
class TitanicModel(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Layer widths come from the module-level ``input_dim``, ``hidden_dim`` and
    ``output_dim`` values, which are read when the model is instantiated.
    The forward pass returns the raw output of the second linear layer.
    """

    def __init__(self):
        super().__init__()

        # Creation order matters for reproducible initialisation: first the
        # input->hidden layer, then the hidden->output layer.
        self.linear1 = torch.nn.Linear(input_dim, hidden_dim)
        self.ReLU = torch.nn.ReLU()
        self.linear3 = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply linear1, the ReLU activation and linear3 to ``x``, in that order."""
        hidden = self.ReLU(self.linear1(x))
        return self.linear3(hidden)

In [41]:
def train_one_epoch():
    """Run one pass of per-sample SGD over ``X_data`` / ``Y_data``.

    Uses the module-level ``model``, ``loss_fn`` and ``optimizer``. Each sample is fed
    to the model on its own and followed by one optimizer step.

    Returns:
        float: the mean of the per-sample losses observed during the epoch, or
        ``nan`` when ``X_data`` is empty (no sample, so no loss to average).
    """
    global X_data, Y_data

    num_samples = len(X_data)
    if num_samples == 0:
        return float('nan')

    running_loss = 0.0
    for i in range(num_samples):
        inputs = X_data[i]
        labels = Y_data[i]

        optimizer.zero_grad()

        outputs = model(inputs)

        # A label taken from a 1-D target tensor is 0-dimensional while the model
        # output for a single sample has one element per output unit. Give the label
        # the output's shape so the loss compares like with like instead of relying
        # on implicit broadcasting.
        if labels.shape != outputs.shape and labels.numel() == outputs.numel():
            labels = labels.reshape(outputs.shape)

        loss = loss_fn(outputs, labels)

        loss.backward()

        optimizer.step()

        running_loss += loss.item()

    return running_loss / num_samples

In [42]:
# Seed PyTorch's random number generator before the model is created so that the
# random weight initialisation, and therefore the results reported further down,
# are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

data()
model = TitanicModel()
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [43]:
X_data.type()

'torch.FloatTensor'

In [44]:
for epoch in range(EPOCHS):
    model.train(True)
    avg_loss = train_one_epoch()

In [45]:
sum_correct = 0
total_num = len(X_data)

In [46]:
# Accuracy on the training samples (the same rows the model was fitted on).
# The counter is reset here so that re-running this cell on its own does not keep
# adding to a count left over from a previous run.
sum_correct = 0
model.eval()
with torch.no_grad():  # inference only: no gradient bookkeeping needed
    for i in range(total_num):
        # The single model output is thresholded at 0.5 to obtain the class label.
        predicted = 1 if model(X_data[i]).item() > 0.5 else 0

        if predicted == Y_data[i].item():
            sum_correct += 1
print("Accuracy :", sum_correct/total_num)

Accuracy : 0.8058361391694725


In [47]:
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = df_test['PassengerId']

In [48]:
# 예측 결과를 저장할 리스트 생성
survived_results = []

In [49]:
# Predict a 0/1 label for every test row by thresholding the model output at 0.5.
# The list is built from scratch inside this cell, so re-running the cell cannot
# append a second copy of the predictions and break the column assignment below.
with torch.no_grad():  # inference only: no gradient bookkeeping needed
    survived_results = [1 if model(sample).item() > 0.5 else 0 for sample in X_test_data]

# Store the predictions in the 'Survived' column
submission_df['Survived'] = survived_results

In [50]:
submission_df.to_csv('submissions.csv', header=True, index=False)
print("Result saved")

Result saved


In [51]:
submission_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
