# Kaggle Playground Titanic

## **목적** 
- 프로그래머스 데이터분석 데브코스 과정에서 encoder 종류와 사용처, imputer 사용한 결측치 처리 학습한 내용 복습해보기 위해

## **사용한 기술**
- XGBoost, scikit-learn

## **성과**
- 다른 프로젝트에서 encoder 사용으로 코드 가독성 개선, 적절한 enocoder 사용
- 상위 50% 달성

In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/titanic'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [100]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
sub = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

In [101]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [55]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
train_df["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [102]:
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [103]:
train_df["Age"] = mean_imputer.fit_transform(train_df[["Age"]]).ravel()
test_df["Age"] = mean_imputer.fit_transform(test_df[["Age"]]).ravel()
train_df["Embarked"] = mode_imputer.fit_transform(train_df[["Embarked"]]).ravel()
test_df["Embarked"] = mode_imputer.fit_transform(test_df[["Embarked"]]).ravel()



In [104]:
enc = OneHotEncoder(sparse_output=False)
enc.fit(train_df[["Sex", "Embarked"]])


train_encoded = enc.transform(train_df[["Sex", "Embarked"]])

columns = enc.get_feature_names_out(["Sex", "Embarked"])
encoded_df = pd.DataFrame(train_encoded, columns=columns)

train_df = train_df.drop(["Sex", "Embarked"], axis=1)
train_df = pd.concat([train_df, encoded_df], axis=1)


In [105]:
enc.fit(test_df[["Sex", "Embarked"]])

test_encoded = enc.transform(test_df[["Sex", "Embarked"]])

columns = enc.get_feature_names_out(["Sex", "Embarked"])
encoded_df = pd.DataFrame(test_encoded, columns=columns)

test_df = test_df.drop(["Sex", "Embarked"], axis=1)
test_df = pd.concat([test_df, encoded_df], axis=1)

In [106]:
train_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.000000,1,0,A/5 21171,7.2500,,0.0,1.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.000000,1,0,PC 17599,71.2833,C85,1.0,0.0,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",26.000000,0,0,STON/O2. 3101282,7.9250,,1.0,0.0,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.000000,1,0,113803,53.1000,C123,1.0,0.0,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",35.000000,0,0,373450,8.0500,,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.000000,0,0,211536,13.0000,,0.0,1.0,0.0,0.0,1.0
887,888,1,1,"Graham, Miss. Margaret Edith",19.000000,0,0,112053,30.0000,B42,1.0,0.0,0.0,0.0,1.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",29.699118,1,2,W./C. 6607,23.4500,,1.0,0.0,0.0,0.0,1.0
889,890,1,1,"Behr, Mr. Karl Howell",26.000000,0,0,111369,30.0000,C148,0.0,1.0,1.0,0.0,0.0


In [107]:
ed_train_df = train_df[["Survived", "Pclass", "Age","Fare", 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
ed_test_df = test_df[["Pclass", "Age", "Fare", 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]

In [108]:
ed_train_df = ed_train_df.astype(int)
ed_test_df = ed_test_df.astype(int)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [112]:
ed_train_df.isnull().sum()

Survived      0
Pclass        0
Age           0
Fare          0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [113]:
x = ed_train_df.drop("Survived", axis =1)
y = ed_train_df["Survived"]

In [114]:
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size = 0.2)

In [None]:
"""dtrain = xgb.DMatrix(x, label=y)"""

In [None]:
"""params = {
    'max_depth': 6,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}
num_round = 100
bst = xgb.train(params, dtrain, num_round)"""

In [119]:
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.06, max_depth=16)
model.fit(X_train, y_train)

In [120]:
predictions = model.predict(X_test)
classificationReport = classification_report(y_test, predictions)
print(classificationReport)

              precision    recall  f1-score   support

           0       0.84      0.86      0.85       105
           1       0.79      0.77      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



In [60]:
#dtest = xgb.DMatrix(ed_test_df)
predictions = model.predict(ed_test_df)

In [None]:
"""cv_results = xgb.cv(params, dtrain, num_boost_round=100, nfold=3, metrics='error', early_stopping_rounds=10)
print(cv_results)"""

In [None]:
sub

In [None]:
predictions

In [None]:
sub_id = sub["PassengerId"]
n_predictions = (predictions > 0.5).astype(bool)
n_predictions = n_predictions.astype(int)
output = pd.DataFrame({'PassengerId': sub_id,
                       'Survived': n_predictions.squeeze()})

In [None]:
sub['Survived'] = output["Survived"]
sub.to_csv('submission.csv', index=False)
sub.head()

In [None]:
"""classificationReport = classification_report(y_test, predictions)
print(classificationReport)"""