In [48]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split

df = sns.load_dataset('titanic')
x_train, x_test, y_train, y_test = train_test_split(df, df['survived'], test_size = 0.2, random_state = 42, stratify = df['survived'])
x_train.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
692,1,3,male,,0,0,56.4958,S,Third,man,True,,Southampton,yes,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
527,0,1,male,,0,0,221.7792,S,First,man,True,C,Southampton,no,True


In [49]:
x_train = x_train.drop(['survived','alive'], axis = 1)
x_test = x_test.drop(['survived','alive'], axis = 1)

In [50]:
# 결측치

print(x_train.isnull().sum())

pclass           0
sex              0
age            137
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           553
embark_town      2
alone            0
dtype: int64


In [51]:
print(x_test.isnull().sum())

pclass           0
sex              0
age             40
sibsp            0
parch            0
fare             0
embarked         0
class            0
who              0
adult_male       0
deck           135
embark_town      0
alone            0
dtype: int64


In [52]:
print('deck: ', x_train.deck.value_counts())
print('age: ', x_train.age.value_counts())
print('embarked: ', x_train.embarked.value_counts())
print('embark_town: ', x_train.embark_town.value_counts())

deck:  C    41
B    34
E    29
D    26
A    14
F    11
G     4
Name: deck, dtype: int64
age:  22.00    26
18.00    23
24.00    22
21.00    21
19.00    21
         ..
12.00     1
36.50     1
0.67      1
0.42      1
70.50     1
Name: age, Length: 85, dtype: int64
embarked:  S    516
C    139
Q     55
Name: embarked, dtype: int64
embark_town:  Southampton    516
Cherbourg      139
Queenstown      55
Name: embark_town, dtype: int64


In [53]:
missing = ['age']
for i in missing:
    x_train[i] = x_train[i].fillna(x_train[i].mean())
    x_test[i] = x_test[i].fillna(x_test[i].mean())
    
x_train['deck'] = x_train['deck'].fillna('C')    
x_test['deck'] = x_test['deck'].fillna('C')    

x_train['embarked'] = x_train['embarked'].fillna('S')
x_test['embarked'] = x_test['embarked'].fillna('S')

x_train['embark_town'] = x_train['embark_town'].fillna('Southampton')
x_test['embark_town'] = x_test['embark_town'].fillna('Southampton')

In [54]:
print(x_train.isnull().sum())

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alone          0
dtype: int64


In [55]:
# 라벨인코딩

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [56]:
x_train.dtypes

pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alone              bool
dtype: object

In [57]:
x_train.head(3)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
692,3,male,29.807687,0,0,56.4958,S,Third,man,True,C,Southampton,True
481,2,male,29.807687,0,0,0.0,S,Second,man,True,C,Southampton,True
527,1,male,29.807687,0,0,221.7792,S,First,man,True,C,Southampton,True


In [58]:
x_train.pclass = x_train.pclass.astype('category')
x_test.pclass = x_test.pclass.astype('category')

In [60]:
label = ['pclass','sex','embarked','class','who','adult_male','deck','embark_town','alone']
x_train[label] = x_train[label].apply(le.fit_transform)
x_test[label] = x_test[label].apply(le.fit_transform)

In [61]:
x_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
692,2,1,29.807687,0,0,56.4958,2,2,1,1,2,2,1
481,1,1,29.807687,0,0,0.0000,2,1,1,1,2,2,1
527,0,1,29.807687,0,0,221.7792,2,0,1,1,2,2,1
855,2,0,18.000000,0,1,9.3500,2,2,2,0,2,2,0
801,1,0,31.000000,1,1,26.2500,2,1,2,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,2,0,29.807687,0,0,7.8792,1,2,2,0,2,1,1
258,0,0,35.000000,0,0,512.3292,0,0,2,0,2,0,1
736,2,0,48.000000,1,3,34.3750,2,2,2,0,2,2,0
462,0,1,47.000000,0,0,38.5000,2,0,1,1,4,2,1


In [62]:
# 데이터 타입변환
x_train.dtypes

pclass           int64
sex              int64
age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int64
class            int64
who              int64
adult_male       int64
deck             int64
embark_town      int64
alone            int64
dtype: object

In [63]:
category = ['pclass','sex','class']
for i in x_train[category]:
    x_train[i] = x_train[i].astype('category')
    x_test[i] = x_test[i].astype('category')

In [64]:
x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)
x_train.head(3)

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_0,pclass_1,pclass_2,sex_0,sex_1,class_0,class_1,class_2
692,29.807687,0,0,56.4958,2,1,1,2,2,1,0,0,1,0,1,0,0,1
481,29.807687,0,0,0.0,2,1,1,2,2,1,0,1,0,0,1,0,1,0
527,29.807687,0,0,221.7792,2,1,1,2,2,1,1,0,0,0,1,1,0,0


In [65]:
# 파생변수
x_train['age_qcut'] = pd.qcut(x_train['age'], 5, labels = False)
x_test['age_qcut'] = pd.qcut(x_test['age'], 5, labels = False)

In [66]:
x_train['ages'] = x_train['age'] // 10 * 10
x_test['ages'] = x_test['age'] // 10 * 10

In [67]:
x_train.dtypes

age            float64
sibsp            int64
parch            int64
fare           float64
embarked         int64
who              int64
adult_male       int64
deck             int64
embark_town      int64
alone            int64
pclass_0         uint8
pclass_1         uint8
pclass_2         uint8
sex_0            uint8
sex_1            uint8
class_0          uint8
class_1          uint8
class_2          uint8
age_qcut         int64
ages           float64
dtype: object

In [68]:
# 5.스케일링
from sklearn.preprocessing import MinMaxScaler
scale = ['age','fare']
mm = MinMaxScaler()

In [69]:
mm.fit(x_train[scale])
x_train[scale] = mm.transform(x_train[scale])
x_test[scale] = mm.transform(x_test[scale])

In [70]:
x_train.head(3)

Unnamed: 0,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alone,pclass_0,pclass_1,pclass_2,sex_0,sex_1,class_0,class_1,class_2,age_qcut,ages
692,0.369285,0,0,0.110272,2,1,1,2,2,1,0,0,1,0,1,0,0,1,2,20.0
481,0.369285,0,0,0.0,2,1,1,2,2,1,0,1,0,0,1,0,1,0,2,20.0
527,0.369285,0,0,0.432884,2,1,1,2,2,1,1,0,0,0,1,1,0,0,2,20.0


In [71]:
# 6.데이터 분리
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state= 42, stratify = y_train)

In [72]:
x_tr.shape, x_val.shape

((569, 20), (143, 20))

In [80]:
# 모형 학습
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

model1 = LogisticRegression()
model1.fit(x_tr, y_tr)
pred1 = pd.DataFrame(model1.predict_proba(x_val))

model2 = RandomForestClassifier()
model2.fit(x_tr, y_tr)
pred2 = pd.DataFrame(model2.predict_proba(x_val))

model3 = VotingClassifier(estimators = [('logistic', model1), ('randomF', model2)], voting = 'soft')
model3.fit(x_tr, y_tr)
pred3 = pd.DataFrame(model3.predict_proba(x_val))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [79]:
pred1

Unnamed: 0,0,1
0,0.061208,0.938792
1,0.634535,0.365465
2,0.213157,0.786843
3,0.915647,0.084353
4,0.833964,0.166036
...,...,...
138,0.821550,0.178450
139,0.857820,0.142180
140,0.816195,0.183805
141,0.054683,0.945317


In [81]:
# 모형평가

from sklearn.metrics import roc_auc_score
print('로지스틱: ',roc_auc_score(y_val, pred1.iloc[:,1]))
print('랜포: ',roc_auc_score(y_val, pred2.iloc[:,1]))
print('보팅: ',roc_auc_score(y_val, pred3.iloc[:,1]))

로지스틱:  0.8631198347107438
랜포:  0.8525826446280991
보팅:  0.8589876033057852


In [84]:
# 하이퍼파라미터 튜닝

from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators': [50, 100], 'max_depth': [4,6]}
model5 = RandomForestClassifier()
gscv = GridSearchCV(estimator = model5, param_grid = parameters, cv = 3)
gscv.fit(x_tr, y_tr)
print("최적 파라미터: ", gscv.best_params_)

최적 파라미터:  {'max_depth': 4, 'n_estimators': 50}


In [87]:
# 파일 저장

result = pd.DataFrame(model3.predict_proba(x_test))
result = result.iloc[:, 1]
pd.DataFrame({'id': x_test.index, 'result': result}).to_csv("00000.csv", index = False)

In [88]:
pd.read_csv('00000.csv').head(3)

Unnamed: 0,id,result
0,565,0.074286
1,160,0.082565
2,553,0.064032
