# In [None]:
import pandas as pd

# Load the competition files once and keep the untouched frames under the
# raw_data_* names: the later sections all start from `raw_data_*.copy()`,
# so these names have to exist before those sections run.
raw_data_train = pd.read_csv('train.csv')
raw_data_test = pd.read_csv('test.csv')
raw_data_submission = pd.read_csv('sample_submission.csv')

# Working copies, so that edits made below never touch the raw frames.
train = raw_data_train.copy()
test = raw_data_test.copy()
submission = raw_data_submission.copy()

# Write the unmodified sample submission as an initial output file.
submission.to_csv('submission.csv', index=False)

# Check the number of rows and the columns of train.
len(train)
train.columns

# Print the column summary, then compute descriptive statistics.
train.info()
train.describe()

# Baseline: generate the survival labels at random.
import numpy as np

# Fixed seed so the same labels are drawn on every run.
np.random.seed(0)
survived = np.random.randint(low=0, high=2, size=submission['Survived'].size)
print(survived)

# Fill the submission with the random labels, preview it, and save it.
submission['Survived'] = survived
submission.head(20)
submission.to_csv('submission.csv', index=False)

# Work on copies so the original data is neither lost nor modified.
train, test, submission = (
    frame.copy()
    for frame in (raw_data_train, raw_data_test, raw_data_submission)
)

# Replace missing values with the mean; both means come from train and are
# reused for test.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

train = train.fillna({'Age': mean_age, 'Fare': mean_fare})
test = test.fillna({'Age': mean_age, 'Fare': mean_fare})

# Split into training / validation data at 8:2.
from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    test_size=0.2,
    random_state=0,
)

# Define and fit the logistic regression model.
import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix

# The formula interface reads features and target from a single frame.
train_dataset = train_x.assign(Survived=train_y)

formula = """
Survived ~ Age + SibSp + Parch + Fare
"""

model = sm.Logit.from_formula(formula, data=train_dataset)
result = model.fit()

result.summary()

# Validation: turn predicted probabilities into 0/1 labels at the 0.5 cut-off.
y_pred = (result.predict(val_x) >= 0.5).astype(int)

print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

# Test set: same thresholding, then store the labels in the submission frame.
y_pred = (result.predict(test) >= 0.5).astype(int)

submission['Survived'] = y_pred
submission.head(15)

# Visualize the dependent variable: number of rows per Survived value.
import seaborn as sns
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=raw_data_train, x='Survived', ax=ax)
plt.show()

# Visualize the independent variables (categorical ones): for each column,
# a count plot on the left and a bar chart against Survived on the right.
columns = ['Pclass', 'Sex', 'Embarked']
for col_name in columns:
    fig, ax = plt.subplots(ncols=2, figsize=(10, 5))

    sns.countplot(data=raw_data_train, x=col_name, palette='Set2', ax=ax[0])
    ax[0].set(title=f'{col_name} count plot')

    sns.barplot(data=raw_data_train, x=col_name, y="Survived", palette='Set2', ax=ax[1])
    ax[1].set(title=f'{col_name} bar chart')

    plt.show()

# Numeric features: box plot of each feature, overall (left) and split by
# outcome (right).
# The outcome label column is added to a copy, so raw_data_train stays
# unmodified for the sections that copy it as their modelling input (a
# target-derived column must not end up among the features).
plot_data = raw_data_train.copy()
plot_data['Survived_str'] = plot_data['Survived'].apply(lambda x: 'Death' if x == 0 else 'Survive')
columns = ['Age', 'SibSp', 'Parch', 'Fare']
for col_name in columns:
    fig, ax = plt.subplots(ncols=2, figsize=(13,5))
    sns.boxplot(x=plot_data[col_name], ax=ax[0], palette='Set2')
    sns.boxplot(data=plot_data, x=col_name, y="Survived_str", ax=ax[1], palette='Set2')
    plt.show()
# Replace missing values, starting again from copies of the raw data.
train, test, submission = (
    frame.copy()
    for frame in (raw_data_train, raw_data_test, raw_data_submission)
)

mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

# Replace the nulls of the 'Age' and 'Fare' features with each feature's
# mean (computed on train, applied to both train and test).
train = train.fillna({'Age': mean_age, 'Fare': mean_fare})
test = test.fillna({'Age': mean_age, 'Fare': mean_fare})

# Remove outliers: keep only rows with Parch <= 5 and Fare <= 300.
train = train[(train['Parch'] <= 5) & (train['Fare'] <= 300)]

from sklearn.model_selection import train_test_split

train_x, val_x, train_y, val_y = train_test_split(
    train.drop(columns='Survived'),
    train['Survived'],
    test_size=0.2,
    random_state=0,
)

import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix

# Features plus target in one frame, as the formula interface expects.
train_dataset = train_x.assign(Survived=train_y)

# Categorical terms are wrapped in C(), numeric terms in scale().
formula = """
Survived ~ C(Pclass)+ C(Sex) + scale(Age) + scale(SibSp) + scale(Parch) + scale(Fare) + C(Embarked)
"""
model = sm.Logit.from_formula(formula, data=train_dataset)
result = model.fit()

print(result.summary())

# Validation: probabilities to 0/1 labels at the 0.5 cut-off.
y_pred = (result.predict(val_x) >= 0.5).astype(int)

print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

# Test set: same thresholding, then store the labels in the submission frame.
y_pred = (result.predict(test) >= 0.5).astype(int)

submission['Survived'] = y_pred
submission.head(15)

# Copy the original data.
train = raw_data_train.copy()
test = raw_data_test.copy()
submission = raw_data_submission.copy()

# Keep only the model features (plus the target for train). The explicit
# .copy() makes these independent frames, so the column assignments below
# do not write into a slice of another frame (SettingWithCopyWarning).
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train = train[columns + ['Survived']].copy()
test = test[columns].copy()

# Null handling: means are computed on train and reused for test.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

train['Age'] = train['Age'].fillna(mean_age)
test['Age'] = test['Age'].fillna(mean_age)
train['Fare'] = train['Fare'].fillna(mean_fare)
test['Fare'] = test['Fare'].fillna(mean_fare)

# Remove outliers (train only); copy again before further assignments.
train = train[(train['Parch'] <= 5) & (train['Fare'] <= 300)].copy()

# Encode Sex: 'female' -> 0, anything else -> 1.
train['Sex'] = (train['Sex'] != 'female').astype(int)
test['Sex'] = (test['Sex'] != 'female').astype(int)

# One-hot encode Embarked, dropping the first level.
train = pd.get_dummies(train, columns=['Embarked'], drop_first=True)
test = pd.get_dummies(test, columns=['Embarked'], drop_first=True)

train_x = train.drop(columns=['Survived'])
train_y = train[['Survived']]

# The dummies were built separately for train and test; give test exactly
# the training feature columns, in the same order, so a model fitted on
# train_x can always predict on test (absent dummy columns become 0).
test = test.reindex(columns=train_x.columns, fill_value=0)

from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y  = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

# Data augmentation to resolve class imbalance.
# Author's notes: SMOTE creates virtual samples between minority samples and
# existing data; it is worth using when the minority class carries a
# meaningful pattern and the imbalance is severe (about 1:10).
# It is meant for classification only and must be applied to train only.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X=train_x, y=train_y)

# train_dataset is the resampled feature frame itself, with the resampled
# target attached as the 'Survived' column.
train_dataset = X_resampled
train_dataset['Survived'] = y_resampled

# Decision tree classifier: a classification model that keeps splitting the
# data with questions.
# Author's rules of thumb:
#   max_depth: usually stable between 3 and 8, overfits easily;
#     3-5 (~1,000 rows), 4-6 (~10,000 rows), 6-8 (100,000+ rows).
#   min_samples_leaf: minimum number of samples one final rule must hold;
#     20-50 (~1,000 rows), 50-100 (~10,000 rows), 100-300 (100,000+ rows).
#   min_samples_split: minimum condition for allowing a question (always
#     greater than or equal to the leaf setting);
#     50-100 (~1,000 rows), 100-200 (~10,000 rows), 200-500 (100,000+ rows).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

model = DecisionTreeClassifier(max_depth=6, random_state=0)
model.fit(
    X=train_dataset.drop(columns='Survived'),
    y=train_dataset['Survived'],
)

# Evaluate on the validation split.
y_pred = model.predict(val_x)

print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

# Predict the test set, store the result, and apply the 0.5 cut-off.
y_pred = model.predict(test)
submission['Survived'] = y_pred
submission['Survived'] = (submission['Survived'] >= 0.5).astype(int)
submission.head(15)

#------------------Run everything in one go---------------
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Keep only the model features (plus the target for train). The explicit
# .copy() makes these independent frames, so the column assignments below
# do not write into a slice of another frame (SettingWithCopyWarning).
columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
train = train[columns + ['Survived']].copy()
test = test[columns].copy()

# Null handling: means are computed on train and reused for test.
mean_age = train['Age'].mean()
mean_fare = train['Fare'].mean()

train['Age'] = train['Age'].fillna(mean_age)
test['Age'] = test['Age'].fillna(mean_age)
train['Fare'] = train['Fare'].fillna(mean_fare)
test['Fare'] = test['Fare'].fillna(mean_fare)

# Remove outliers (train only); copy again before further assignments.
train = train[(train['Parch'] <= 5) & (train['Fare'] <= 300)].copy()

# Encode Sex: 'female' -> 0, anything else -> 1.
train['Sex'] = (train['Sex'] != 'female').astype(int)
test['Sex'] = (test['Sex'] != 'female').astype(int)

# One-hot encode Embarked, dropping the first level.
train = pd.get_dummies(train, columns=['Embarked'], drop_first=True)
test = pd.get_dummies(test, columns=['Embarked'], drop_first=True)

train_x = train.drop(columns=['Survived'])
train_y = train['Survived']

# The dummies were built separately for train and test; give test exactly
# the training feature columns, in the same order, so models fitted on
# train_x can always predict on test (absent dummy columns become 0).
test = test.reindex(columns=train_x.columns, fill_value=0)

from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y  = train_test_split(train_x, train_y, test_size=0.2, random_state=0)

from imblearn.over_sampling import SMOTE

# Set up the resampler.
smote = SMOTE(random_state=0)

# Resample the train split only.
X_resampled, y_resampled = smote.fit_resample(X=train_x, y=train_y)

# train_dataset is the resampled feature frame itself, with the resampled
# target attached as the 'Survived' column.
train_dataset = X_resampled
train_dataset['Survived'] = y_resampled

import statsmodels.api as sm
from sklearn.metrics import classification_report, confusion_matrix

# Logistic regression on the resampled training frame.
formula = """
Survived ~ C(Pclass)+ C(Sex) + scale(Age) + scale(SibSp) + scale(Parch) + scale(Fare) + C(Embarked_Q)+ C(Embarked_S)
"""

model = sm.Logit.from_formula(formula, data=train_dataset)
result = model.fit()

# Validation: probabilities to 0/1 labels at the 0.5 cut-off.
y_pred = (result.predict(val_x) >= 0.5).astype(int)

print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

from sklearn.tree import DecisionTreeClassifier

# Decision tree fitted on the training split, scored on the validation split.
model = DecisionTreeClassifier(max_depth=6, random_state=0)
model.fit(X=train_x, y=train_y)

y_pred = model.predict(val_x)
for report in (confusion_matrix, classification_report):
    print(report(val_y, y_pred))

# Random forest: uses several decision trees, which is very good at
# preventing overfitting but costs speed / memory.
# Author's rules of thumb:
#   n_estimators: number of trees; 100 (fast baseline), 200-300 (stable),
#     500+ (little gain, only takes longer).
#   max_depth: 6-10; with many trees there is no need for extreme settings.
#   min_samples_leaf: 20-100; increase it as the data grows.
#   max_features: the core of the randomness; "sqrt" is almost always fine,
#     try values such as 0.7 and compare the results.
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
# Fit on the training split only. `train` still contains the validation
# rows, so fitting on it would leak them into the model and inflate the
# validation scores printed below.
model.fit(train_x, train_y)

y_pred = model.predict(val_x)
print(confusion_matrix(val_y, y_pred))
print(classification_report(val_y, y_pred))

from xgboost import XGBClassifier

# XGBoost fitted on the training split, scored on the validation split.
model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=5,
    random_state=0,
)
model.fit(X=train_x, y=train_y)
y_pred = model.predict(val_x)

for report in (confusion_matrix, classification_report):
    print(report(val_y, y_pred))

# Final model: XGBoost fitted on train_dataset (features plus 'Survived').
my_model = XGBClassifier(n_estimators=200, learning_rate=0.01, max_depth=5, random_state=0)
my_model.fit(train_dataset.drop(columns='Survived'), train_dataset['Survived'])

# Predict the test set once and reuse the result under both names, instead
# of running the same prediction twice.
XGB_pred = my_model.predict(test)
y_pred = XGB_pred

# Store the predictions, preview them, and write the submission file.
submission['Survived'] = y_pred
submission.head(10)

submission.to_csv('submission.csv', index=False)