# Logistic Regression Model

## 변수 처리 방법
- Passenger ID : 제거
- Pclass : Ordinal 변수이므로 그대로 채용.
- Sex : Label-Encode
- Age : 그대로 (Mean으로 결측값 Impute)
- Sibsp, Parch : 그대로
- Ticket : 제거
- Fare : 그대로, 단 Ticket변수와 Sibsp+parch 사용하여 그룹멤버들만큼 나눠서 1인당 요금으로 계산
- Cabin : Binary로 변환
- Embarked : one-hot encoding with NaN imputed as 'C'. (NaN승객들과 같은 Pclass 승객들의 Fare mean으로 C인것을 유추)

## 생성변수
- Group : Ticket 변수가 unique하지 않거나 Sibsp+parch > 0 일때 1, 아닐때 0을 가지는 단체승객여부 변수.

In [202]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [185]:
# Import Dataset
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [186]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


- 변수 전처리

In [187]:
#결측치 처리 (Age, Embarked)
df_train['Age'].fillna(df_train['Age'].mean(),inplace=True)
df_test['Age'].fillna(df_train['Age'].mean(),inplace=True)
df_train['Embarked'].fillna('C',inplace=True)

#Cabin 결측치 처리 겸 변수 변환
df_train['Cabin'] = df_train.Cabin.apply(lambda x: 1 if pd.notnull(x) else 0)
df_test['Cabin'] = df_test.Cabin.apply(lambda x: 1 if pd.notnull(x) else 0)

In [188]:
#Group 변수 생성
#Family size랑 dup_count 중 큰걸로 fare 나누기.
df_train['Family_size'] = df_train['SibSp']+df_train['Parch']+1
df_train['dup_count'] = df_train.groupby(['Ticket'])['Ticket'].transform('count')
df_train['Group'] = ~((df_train['Family_size'] == 1) & (df_train['dup_count'] == 1))
df_train['Group_count'] = df_train[['Family_size','dup_count']].max(axis=1)
df_train['Fare_ind'] = df_train['Fare']/df_train['Group_count']

In [189]:
#Fare 결측치 처리 in test
df_test['Fare'].fillna(df_test['Fare'].mean(),inplace=True)

df_test['Family_size'] = df_test['SibSp']+df_test['Parch']+1
df_test['dup_count'] = df_test.groupby(['Ticket'])['Ticket'].transform('count')
df_test['Group'] = ~((df_test['Family_size'] == 1) & (df_test['dup_count'] == 1))
df_test['Group_count'] = df_test[['Family_size','dup_count']].max(axis=1)
df_test['Fare_ind'] = df_test['Fare']/df_test['Group_count']

In [190]:
# Categorical Feature Encoding
df_train['Sex'] = df_train['Sex'].apply(lambda x: 1 if x is 'male' else 0)
df_test['Sex'] = df_test['Sex'].apply(lambda x: 1 if x is 'male' else 0)
df_train = pd.get_dummies(df_train, columns = ['Embarked'],drop_first=True,prefix='', prefix_sep='')
df_test = pd.get_dummies(df_test, columns = ['Embarked'],drop_first=True,prefix='', prefix_sep='')

### 모델링

In [200]:
#모델에 사용하지 않을 변수 제거
train_X = df_train.drop(['PassengerId','Survived','Name','Ticket','Fare','Cabin','Family_size','dup_count','Group_count'],axis=1)
test_X = df_test.drop(['PassengerId','Name','Ticket','Fare','Cabin','Family_size','dup_count','Group_count'],axis=1)
train_y = df_train.Survived

In [213]:
lr = LogisticRegression(random_state=0,max_iter = 500)
parameters = {'solver':('liblinear','lbfgs'),'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
clf = GridSearchCV(lr, parameters, cv=5)
clf.fit(train_X, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=500, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'solver': ('liblinear', 'lbfgs')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [205]:
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_solver',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [216]:
print(clf.best_score_)
print(clf.best_params_)

0.7115600448933782
{'C': 0.1, 'solver': 'lbfgs'}
