In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

df = pd.read_csv('data/train.csv')
df.info()

df_copy = df.copy()
cond1 = df_copy['Sex'] == 'female'
cond2 = df_copy['Sex'] == 'male'
df_copy.loc[cond1, 'Age'] = df_copy.loc[cond1, 'Age'].fillna(df_copy.loc[cond1, 'Age'].mean())
df_copy.loc[cond2, 'Age'] = df_copy.loc[cond2, 'Age'].fillna(df_copy.loc[cond2, 'Age'].mean())

df = df_copy
df = pd.get_dummies(df, columns=['Sex'], dtype=int)

data = df[['PassengerId', 'Pclass', 'Sex_female', 'Age', 'SibSp', 'Parch', 'Fare']]
target = df[['Survived']]

train_input, test_input, train_target, test_target = train_test_split(data, target)

# 모델 훈련 - KNN
knr = KNeighborsRegressor()
knr.fit(train_input, train_target)

# 모델 훈련 - Linear Regression
lr = LinearRegression()
lr.fit(train_input, train_target)
print("Linear Regression Test Score:", lr.score(test_input, test_target))

# 다항 특성 생성
poly = PolynomialFeatures(degree=2, include_bias=False)
train_poly = poly.fit_transform(train_input)
test_poly = poly.transform(test_input)

# 모델 훈련 - Linear Regression (다항 특성)
lr = LinearRegression()
lr.fit(train_poly, train_target)

# 데이터 스케일링
ss = StandardScaler()
train_scaled = ss.fit_transform(train_poly)
test_scaled = ss.transform(test_poly)

# 테스트 데이터 로드
sub_df = pd.read_csv('data/test.csv')
sub_df_copy = sub_df.copy()

# 테스트 데이터의 결측치 처리
cond1 = sub_df_copy['Sex'] == 'female'
cond2 = sub_df_copy['Sex'] == 'male'
sub_df_copy.loc[cond1, 'Age'] = sub_df_copy.loc[cond1, 'Age'].fillna(sub_df_copy.loc[cond1, 'Age'].mean())
sub_df_copy.loc[cond2, 'Age'] = sub_df_copy.loc[cond2, 'Age'].fillna(sub_df_copy.loc[cond2, 'Age'].mean())

sub_df_copy.loc[:, 'Age'] = sub_df_copy['Age'].fillna(sub_df_copy['Age'].mean())
sub_df_copy.loc[:, 'Fare'] = sub_df_copy['Fare'].fillna(sub_df_copy['Fare'].mean())

sub_df = pd.get_dummies(sub_df_copy, columns=['Sex'], dtype=int)

sub_data = sub_df[['PassengerId', 'Pclass', 'Sex_female', 'Age', 'SibSp', 'Parch', 'Fare']]

# 다항 특성 및 스케일링
sub_poly = poly.transform(sub_data)
sub_scaled = ss.transform(sub_poly)

# 랜덤 포레스트 모델 훈련 및 예측
rf = RandomForestClassifier()
rf.fit(train_scaled, train_target.values.ravel())
sub_predictions = rf.predict(sub_scaled)

# 결과 저장
sub_df_copy.insert(1, 'Survived', sub_predictions)
sub_df_copy[['PassengerId', 'Survived']].to_csv('data/submission.csv', index=False)
print("예측 결과가 'submission.csv'에 저장되었습니다.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Linear Regression Test Score: 0.4340583450704011
예측 결과가 'submission.csv'에 저장되었습니다.
