### 6.

### 1. 데이터 읽어오기

In [None]:
import pandas as pd

# Read the training data, the test data and the sample submission template
# (in that order) from CSV files in the current working directory.
train, test, submission = [
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
]

### 2. 데이터 확인

### 3. 데이터 전처리

In [None]:
# Feature columns kept for modelling. 'Survived' is the target and is only
# selected from the training frame.
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train = train[columns + ['Survived']]
test = test[columns]

# Missing-value handling: Age and Fare are filled with the *training* means,
# and the same values are reused for the test frame.
# NOTE(review): Embarked is not imputed here; with get_dummies(drop_first=True)
# a missing Embarked ends up as an all-zero dummy row - confirm that is intended.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

# fillna with a column->value mapping returns a new DataFrame instead of
# assigning into a column selection of the original frame, which avoids
# pandas' SettingWithCopyWarning.
fill_values = {'Age': mean_age, 'Fare': mean_fare}
train = train.fillna(fill_values)
test = test.fillna(fill_values)

# Outlier removal (training rows only): keep rows with Parch <= 5 and Fare <= 300.
# Both conditions are applied in one boolean mask, and .copy() makes the result
# an independent frame so later column assignments on `train` do not trigger
# pandas' SettingWithCopyWarning.
train = train[(train['Parch'] <= 5) & (train['Fare'] <= 300)].copy()

# Encode Sex as an integer flag: 0 for 'female', 1 for every other value.
train = train.assign(Sex=(train['Sex'] != 'female').astype('int64'))
test = test.assign(Sex=(test['Sex'] != 'female').astype('int64'))

# One-hot encode Embarked. The category list is fixed from the training data so
# that train and test always receive the same dummy columns, even if one frame
# happens not to contain every port. Calling get_dummies independently on each
# frame would otherwise derive (and drop) columns from whatever values are
# present in that frame. As before, the first category is dropped.
embarked_dtype = pd.CategoricalDtype(sorted(train['Embarked'].dropna().unique()))
train = pd.get_dummies(train.astype({'Embarked': embarked_dtype}),
                       columns=['Embarked'], drop_first=True)
test = pd.get_dummies(test.astype({'Embarked': embarked_dtype}),
                      columns=['Embarked'], drop_first=True)

from sklearn.model_selection import train_test_split

# Separate the feature columns from the 'Survived' target and hold out 20% of
# the rows for validation, using a fixed random_state.
train_x, val_x, train_y, val_y = train_test_split(
    train.drop('Survived', axis=1),
    train['Survived'],
    test_size=0.2,
    random_state=0,
)

from imblearn.over_sampling import SMOTE

# Model setup: SMOTE oversampler created with random_state=0.
smote = SMOTE(random_state=0)

# Feed the training split (features and labels) to SMOTE to get a resampled set.
X_resampled, y_resampled = smote.fit_resample(train_x,list(train_y))

# Attach the resampled labels as a 'Survived' column.
# NOTE: train_dataset is bound to the same DataFrame object as X_resampled, so
# from this point X_resampled also contains the 'Survived' label column.
X_resampled['Survived'] = y_resampled
train_dataset = X_resampled

### 4. 데이터 학습

In [None]:
import statsmodels.api as sm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Formula for the logistic regression: Pclass, Sex and the Embarked dummies are
# treated as categorical terms, the numeric columns are wrapped in scale().
formula = """
Survived ~ C(Pclass)+ C(Sex) + scale(Age) + scale(SibSp) + scale(Parch) + scale(Fare) + C(Embarked_Q)+ C(Embarked_S)
"""

# The left-hand side of the formula needs a 'Survived' column. train_x had that
# column dropped when the target was split off, so fitting on it cannot resolve
# 'Survived'. train_dataset is the resampled training split with 'Survived'
# attached, so it provides every name used in the formula.
model = sm.Logit.from_formula(formula, data=train_dataset)
result = model.fit()

# Predicted probabilities on the validation features, thresholded at 0.5.
y_pred = (result.predict(val_x) >= 0.5).astype(int)

print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Feature matrix of the resampled training data. It is derived from
# train_dataset (which keeps its 'Survived' column) rather than by dropping the
# column from X_resampled itself, so re-running this cell does not raise a
# KeyError on the second execution.
X_resampled = train_dataset.drop(columns='Survived')
model = DecisionTreeClassifier(max_depth=6, random_state=0)
model.fit(X_resampled, y_resampled)

# Evaluate on the held-out validation split.
y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
# Fit on the (resampled) training split only. Fitting on the full `train` frame
# would include the validation rows, so the scores printed below would be
# measured on data the model has already seen.
model.fit(train_dataset.drop(columns='Survived'), train_dataset['Survived'])

# Evaluate on the held-out validation split.
y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

In [None]:
from xgboost import XGBClassifier
model = XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=5, random_state = 0)
# Take features and labels explicitly from train_dataset. X_resampled only
# excludes the 'Survived' column if the decision-tree cell has been run first;
# otherwise the label would be passed to the model as a feature.
model.fit(train_dataset.drop(columns='Survived'), train_dataset['Survived'])
y_pred = model.predict(val_x)

# Evaluate on the held-out validation split.
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

In [None]:
# Model used for the submission: XGBoost fitted on the resampled training data.
my_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=5,
    random_state=0,
)
my_model.fit(train_dataset.drop('Survived', axis=1), train_dataset['Survived'])

# Predictions for the preprocessed test features.
XGB_pred = my_model.predict(test)

### 5. CSV 파일로 저장

In [None]:
# Predict on the test features and store the result in the submission frame as
# 0/1 integers (values >= 0.5 become 1, everything else 0).
y_pred = my_model.predict(test)
submission['Survived'] = [1 if label >= 0.5 else 0 for label in y_pred]
submission.head(10)

In [None]:
# Write the submission frame to 'submission.csv' without the index column.
submission.to_csv('submission.csv', index=False)