In [1]:
!pip install pandas>=1.3

In [2]:
import pandas as pd
# Load the passenger data; the relative path is resolved against the
# notebook's current working directory.
titatnic_df = pd.read_csv('titanic3.csv')

# 타이타닉 데이터셋 불러오기

In [3]:
# Preview the first rows of the freshly loaded frame.
titatnic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# 전처리: 결측값 처리, 불필요 및 중복성 컬럼 제거, 파생 변수 생성, encoding

## 불필요한 컬럼 삭제

- body는 시신 번호, boat는 생존자가 탑승했던 구명보트 번호로, 종속변수(survived)의 정보를 담고 있기 때문에 제거
- ticket 는 티켓 고유번호이기 때문에 제거
- cabin과 home.dest는 Null값이 너무 많아 제거

In [4]:
# Remove the columns that will not be used as model features, then inspect
# the last 50 rows of the reduced frame.
dropped_columns = ['body', 'boat', 'cabin', 'ticket', 'home.dest']
titatnic_df = titatnic_df.drop(columns=dropped_columns)
titatnic_df.tail(50)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,embarked
1259,3,0,"Turcin, Mr. Stjepan",male,36.0,0,0,7.8958,S
1260,3,1,"Turja, Miss. Anna Sofia",female,18.0,0,0,9.8417,S
1261,3,1,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,9.5875,S
1262,3,0,"van Billiard, Master. James William",male,,1,1,14.5,S
1263,3,0,"van Billiard, Master. Walter John",male,11.5,1,1,14.5,S
1264,3,0,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,14.5,S
1265,3,0,"Van Impe, Miss. Catharina",female,10.0,0,2,24.15,S
1266,3,0,"Van Impe, Mr. Jean Baptiste",male,36.0,1,1,24.15,S
1267,3,0,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,24.15,S
1268,3,0,"van Melkebeke, Mr. Philemon",male,,0,0,9.5,S


## Null값 교체

### embarked 

In [5]:
# Frequency of each embarkation port (value_counts excludes missing values by default).
titatnic_df['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [6]:
# Fill missing embarkation ports with 'S', the most frequent value in this data.
# The result is assigned back explicitly: calling fillna(inplace=True) on an
# attribute-accessed column mutates an intermediate object and is not
# guaranteed to update the parent frame (it is a no-op under pandas
# Copy-on-Write).
titatnic_df['embarked'] = titatnic_df['embarked'].fillna('S')

### null값을 등급별 평균 나이/ 요금으로 대체

pclass(객실 등급)별로 그룹을 나누고, 각 그룹의 평균값으로 age와 fare의 결측값을 채운다.

In [7]:
import warnings
# NOTE(review): this silences every warning for the rest of the notebook
# (pandas and scikit-learn included). It is kept so later cells print what
# they printed before; consider removing it or narrowing it to one category.
warnings.filterwarnings('ignore')

# Replace missing age / fare values with the mean of the passenger's own class.
# transform('mean') returns a frame aligned row-for-row with titatnic_df, so
# fillna can use it directly. This avoids filling columns of get_group()
# results in place, which acts on derived frames and relies on chained
# in-place mutation that pandas does not guarantee to propagate. Row order and
# index of titatnic_df are left untouched.
class_means = titatnic_df.groupby('pclass')[['age', 'fare']].transform('mean')
titatnic_df[['age', 'fare']] = titatnic_df[['age', 'fare']].fillna(class_means)

# The per-class frames stay available under their original names.
grouped = titatnic_df.groupby('pclass')
class1 = grouped.get_group(1)
class2 = grouped.get_group(2)
class3 = grouped.get_group(3)

titatnic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.3375,S
1,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,151.55,S
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,S


## 나이

In [8]:
def age_class(age):
    """Map a numeric age to an age-band label.

    Bands (upper bounds inclusive): 'Baby' up to 5, 'Child' up to 12,
    'Teenager' up to 19, 'YoungAdult' up to 25, 'Adult' up to 60 and
    'Elderly' above 60. A value for which every comparison is false
    (for example NaN) yields the placeholder label 'cat'.
    """
    upper_bounds = (
        (5, 'Baby'),
        (12, 'Child'),
        (19, 'Teenager'),
        (25, 'YoungAdult'),
        (60, 'Adult'),
    )
    # The first band whose upper bound is not exceeded wins.
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Elderly' if age > 60 else 'cat'

# Overwrite the numeric 'age' column with its age-band label.
# Gotcha: this cell is not idempotent. Re-running it feeds the label strings
# back into age_class, whose numeric comparisons then fail on str values.
titatnic_df['age'] = titatnic_df.age.apply(age_class)

In [9]:
# Count passengers per age band, split by survival flag, and draw grouped bars.
# NOTE(review): the stored output of this cell is a TypeError about NumPy
# dtypes, presumably an old pandas against a newer NumPy. Confirm it
# disappears once the intended pandas version is installed.
age_survival_counts = titatnic_df.pivot_table(
    index="age",
    columns="survived",
    values="sex",
    aggfunc="count",
)
age_survival_counts.plot.bar()

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

# 이름에서 타이틀 추출  

In [None]:
import re 

# Matches ", <title>." where the title may contain several words
# ("the Countess") but no comma or period.
_TITLE_PATTERN = re.compile(r',\s*([^.,]+)\.')


def title(x):
    """Extract the honorific from a name written as 'Surname, Title. Given names'.

    Parameters
    ----------
    x : str
        Full passenger name, e.g. 'Allen, Miss. Elisabeth Walton'.

    Returns
    -------
    str
        The text between the comma and the following period ('Miss', 'Mr',
        'the Countess', ...). 'Unknown' is returned when ``x`` is not a string
        (e.g. a missing value) or contains no such pattern, instead of failing
        on ``None.group``.
    """
    if not isinstance(x, str):
        return 'Unknown'
    match = _TITLE_PATTERN.search(x)
    if match is None:
        return 'Unknown'
    return match.group(1).strip()

# Derive the honorific for every passenger from the 'name' column.
titatnic_df["title"] = titatnic_df["name"].apply(title)

In [None]:
# List the distinct honorifics that were extracted.
titatnic_df.title.unique()

In [None]:
# Group passengers by honorific; the GroupBy object is reused by the plotting cell.
grouped_by_title = titatnic_df.groupby("title")
# Materialise the groups as a list of (title, sub-frame) pairs.
group_list = list(grouped_by_title)

In [None]:
# Distinct honorifics, in order of first appearance, as a plain Python list.
title_list = titatnic_df["title"].unique().tolist()

In [None]:
# The conventional `plt` alias refers to matplotlib.pyplot, not the top-level package.
import matplotlib.pyplot as plt

# One bar chart per honorific: passenger counts by class, split by survival.
# The loop variable is deliberately not called `title`: rebinding that global
# would replace the title() extraction function defined earlier and break any
# re-run of the cell that applies it.
for title_name in title_list:
    group_df = grouped_by_title.get_group(title_name)
    title_counts = group_df.pivot_table(
        index="pclass",
        columns="survived",
        values="name",
        aggfunc="count",
    )

    title_counts.plot.bar(title=title_name)

인사이트: 
- 여성 닥터는 같이 탑승했던 가족이 없고 적은 가격을 냈었어도 생존할 수 있었다.
- Master이고 1등급과 2등급에 있었던 모든 사람들은 생존했다. 3등급은 생존 여부가 매우 다양하다.

범주화를 이렇게 해보기:
- Miss : 연령으로만 타이타닉 데이터셋을 분석해봐도 여자가 생존율이 높다는 것을 알지만, 1과 2등급에서는 3등급에 비해 생존율이 훨씬 더 높다.
- Master: Master이고 1등급과 2등급에 있었던 모든 사람들은 생존했다. 3등급은 생존율이 훨씬 낮다.
- Mr: 생존율이 모든 등급에서 매우 낮다. 
- Mrs: 1등급에서는 대부분 생존했다. 등급이 낮아질수록 생존율이 약간씩 낮아졌다.
나머지 타이틀은 인원이 너무 적어 뚜렷한 패턴이 보이지 않으므로 하나로 묶는다.
- Other

In [None]:
# The raw name is no longer needed once the honorific has been extracted.
# Gotcha: re-running this cell raises KeyError because 'name' is already gone.
titatnic_df = titatnic_df.drop(columns=['name'])

In [None]:
def title_class(title):
    """Collapse every honorific other than Miss / Master / Mr / Mrs into 'Other'.

    The four common honorifics are returned unchanged; any other value is
    mapped to the string 'Other'.
    """
    common_titles = ('Miss', 'Master', 'Mr', 'Mrs')
    return title if title in common_titles else 'Other'

# Replace each honorific with its coarse category.
titatnic_df['title'] = titatnic_df['title'].apply(title_class)


In [None]:
# Check the frame after the title column has been categorised.
titatnic_df.head()

In [None]:
# Combine the 'sibsp' and 'parch' counts into a single 'family' column and
# drop the two source columns.
family_size = titatnic_df['sibsp'] + titatnic_df['parch']
titatnic_df = (
    titatnic_df
    .assign(family=family_size)
    .drop(columns=['sibsp', 'parch'])
)

In [None]:
# Display the frame after the family feature has been added.
# NOTE(review): this renders the whole frame; .head() or .sample() keeps the output short.
titatnic_df

In [None]:
# One-hot encode the categorical features. Every column is cast to str first
# so that get_dummies expands it into indicator columns (pclass is stored as
# a number in the raw data).
categorical_columns = ['pclass', 'sex', 'embarked', 'age', 'title']
dff = titatnic_df.astype({column: 'str' for column in categorical_columns})

df2 = pd.get_dummies(dff[categorical_columns])

In [None]:
# Feature matrix: the numeric 'family' column followed by the indicator
# columns; concat along axis=1 aligns the two pieces on the row index.
df3 = pd.concat([titatnic_df['family'],df2],axis=1)

In [None]:
# Column names (and order) of the feature matrix the models are trained on.
df3.columns

In [None]:
# Processed frame before encoding, for comparison with the encoded features.
titatnic_df.head()

In [None]:
# First rows of the encoded feature matrix.
df3.head()

In [None]:
# 전처리: 결측값 처리, 불필요 및 중복성 컬럼 제거, 파생 변수 생성, encoding
# boat 나  body는 종속변수의 특성을 가지고 있다 (종속변수성)

In [None]:
from sklearn.model_selection import train_test_split

# Features come from the encoded matrix, the target is the 'survived' column.
X_tdf = df3
y_tdf = titatnic_df['survived']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tdf,
    y_tdf,
    test_size=0.20,
    random_state=21323,
)
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest and a logistic regression on the same split and compare
# their accuracy on the held-out rows.

# A fixed random_state makes the forest (and therefore its reported accuracy)
# reproducible across "Restart & Run All".
rf_model = RandomForestClassifier(random_state=21323)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float; the `.round` method exists only on the former.
accuracy_rf = round(accuracy_score(y_test, rf_pred), 2)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
accuracy_lr = round(accuracy_score(y_test, lr_pred), 2)

print(f'rf 정확도:{accuracy_rf}, lr 정확도:{accuracy_lr}')

In [None]:
import pickle
import joblib

# Persist the fitted logistic-regression model to a file in the working directory.
filename = 'tcl_model.pkl'
joblib.dump(lr_model,filename)

In [None]:
# Reload the model saved above to confirm that it round-trips.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
mdl = joblib.load('tcl_model.pkl')
mdl

In [None]:
# Feed a new, hand-built passenger to the reloaded model and read the prediction.
# Only the non-zero features are spelled out; every other indicator is 0.
# This passenger: family of 3, 3rd class, female, embarked at C, adult, "Mrs".
active_features = {
    'family': 3,
    'pclass_3': 1,
    'sex_female': 1,
    'embarked_C': 1,
    'age_Adult': 1,
    'title_Mrs': 1,
}

# Catch misspelled feature names early; reindex would silently drop them.
unknown_features = set(active_features) - set(df3.columns)
assert not unknown_features, f'unknown feature names: {sorted(unknown_features)}'

# Take the column set and order from the training matrix instead of a
# hand-typed list: scikit-learn checks that feature names at predict time match
# the fit-time names in the same order, and a typed list can drift from what
# get_dummies produced (e.g. age_Teenager sorts before age_YoungAdult).
# Building the row in one step also gives numeric columns rather than the
# object dtype produced by assigning into an empty frame.
df = pd.DataFrame([active_features]).reindex(columns=df3.columns, fill_value=0)
y_pred = mdl.predict(df)
y_pred