### 4. 

### 1. 데이터 읽어오기

In [None]:
import pandas as pd

# Load the three input CSVs (training set, test set, submission template)
# from the current working directory, in that order.
raw_data_train, raw_data_test, raw_data_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

### 2. 데이터 확인

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance of the target: number of training rows per Survived value.
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=raw_data_train, x='Survived', ax=ax)
plt.show()

### 3. 데이터 전처리

In [None]:
# One figure per categorical feature: category counts on the left,
# seaborn's default bar estimate of Survived per category on the right.
columns = ['Pclass', 'Sex', 'Embarked']
for col_name in columns:
    fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
    count_ax, rate_ax = ax

    sns.countplot(x=raw_data_train[col_name], palette='Set2', ax=count_ax)
    count_ax.set_title(col_name + ' count plot')

    sns.barplot(data=raw_data_train, x=col_name, y="Survived", palette='Set2', ax=rate_ax)
    rate_ax.set_title(col_name + ' bar chart')

    plt.show()

In [None]:
# Box plots of each numeric feature: overall distribution (left) and the
# distribution split by outcome (right).
#
# The readable outcome label is attached to a plotting-only frame instead of
# being written into raw_data_train. The raw frame is copied into the
# modelling data by a later cell, and a column derived from the target must
# not travel along with the features (it would be a target leak for any model
# that uses all columns, and the test set has no such column).
plot_data = raw_data_train.assign(
    # Same rule as before: 0 -> death, anything else -> survive.
    Survived_str=raw_data_train['Survived'].eq(0).map({True: 'Death', False: 'Survive'})
)

columns = ['Age', 'SibSp', 'Parch', 'Fare']
for col_name in columns:
    fig, ax = plt.subplots(ncols=2, figsize=(13, 5))
    sns.boxplot(x=plot_data[col_name], ax=ax[0], palette='Set2').set(
        title=col_name + ' box plot'
    )
    sns.boxplot(
        data=plot_data, x=col_name, y="Survived_str", ax=ax[1], palette='Set2'
    ).set(title=col_name + ' by outcome')
    plt.show()

In [None]:
# Work on copies so the frames loaded from disk stay untouched.
train = raw_data_train.copy()
test = raw_data_test.copy()
submission = raw_data_submission.copy()

# Replace nulls in the 'Age' and 'Fare' features with the feature mean.
# Both means are computed on the training data only and reused for the
# test data, so no test-set statistics enter the preprocessing.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

fill_values = {'Age': mean_age, 'Fare': mean_fare}
train = train.fillna(value=fill_values)
test = test.fillna(value=fill_values)

In [None]:
# Keep only training rows with Parch <= 5 and Fare <= 300.
# NOTE(review): the cut-offs look like they were read off the box plots
# above - confirm before changing them.
keep_rows = (train['Parch'] <= 5) & (train['Fare'] <= 300)
train = train[keep_rows]

### 4. 데이터 학습

In [None]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Separate the target from the predictors, then hold out 20% of the rows
# for validation (fixed seed so the split is reproducible).
train_y = train['Survived']
train_x = train.drop(columns='Survived')
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=0
)

# The formula interface needs the response in the same frame as the
# predictors, so the training target is attached back (aligned on index).
train_dataset = train_x.assign(Survived=train_y)

# Logistic regression: categorical terms via C(...), numeric terms
# standardised via scale(...).
formula = """
Survived ~ C(Pclass)+ C(Sex) + scale(Age) + scale(SibSp) + scale(Parch) + scale(Fare) + C(Embarked)
"""
model = sm.Logit.from_formula(formula, data=train_dataset)
result = model.fit()

print(result.summary())

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Turn the model's predicted values for the validation rows into 0/1
# labels: values >= 0.5 become 1, everything else (including NaN) becomes 0.
val_scores = result.predict(val_x)
y_pred = (val_scores >= 0.5).astype('int64')

print(confusion_matrix(val_y, y_pred))

print(classification_report(val_y, y_pred))

### 5. CSV파일로 저장

In [None]:
# Score the test rows and apply the same 0.5 cut-off used for validation.
test_scores = result.predict(test)
y_pred = (test_scores >= 0.5).astype('int64')

# Assignment aligns on the row index shared by the predictions and the
# submission template.
submission = submission.assign(Survived=y_pred)
submission.head(15)

In [None]:
# Write the predictions to disk; index=False keeps the row index out of the CSV.
submission.to_csv(path_or_buf='submission.csv', index=False)