In [493]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [478]:
# Load the three input CSV files from the current working directory.
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

## Model: Logistic Regression

### 변수 처리 방법
- PassengerId : 제거
- Name: 모델링에서 사용 X
- SibSp : integer 로 처리
- Parch: integer 로 처리
- Ticket : 해당 컬럼 자체는 사용하지 않고 추후 group 컬럼으로 unique count 해서 따로 만듦
- Pclass : integer 처리(1,2,3 그대로 냅둠)
- Age : 연속형 변수. null 값은 mean 으로 처리
- Cabin : 앞 알파벳만 따와서 categorical 로 사용. null 값은 "N" 으로 채움
- Fare : 단체 티켓 가격 합쳐진 것은 1인당 가격으로 수정해서 사용. 연속형 변수
- Embarked : one-hot encoding / null 값은 drop

### 생성 변수
- Group : 동일한 Ticket 번호를 가진 사람이 2명 이상이거나 동승 가족(SibSp + Parch)이 1명 이상이면 True / 아니면 False


### scaling
- Fare_ind(1인당 요금) 변수는 log1p 로 변환(데이터 분포가 너무 한쪽으로 편향되어 있으므로)
- 이후 모든 변수에 대해서 z-score 로 scaling 진행. 

In [479]:
# Drop the rows whose Embarked value is missing. The explicit .copy() makes
# `train` an independent frame, so the column assignment further down does
# not act on a view of the raw data (no SettingWithCopyWarning).
train = train[train['Embarked'].notna()].copy()

# Set the target variable aside before it is removed from the features.
target = train["Survived"]

# Remove the unneeded identifier/name columns and the target column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts.
train = train.drop(columns=['PassengerId', 'Name', "Survived"])

# Fill missing ages with the mean age. The result is assigned back instead of
# calling fillna(inplace=True) on the selected column, because that chained
# in-place form does not update `train` under pandas copy-on-write.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [480]:
# Create the group-related features.
#
# Fare_ind is the fare divided by the larger of the family size and the
# number of rows sharing the same ticket.
train = train.assign(
    # The passenger plus siblings/spouses and parents/children.
    Family_size=lambda df: df['SibSp'] + df['Parch'] + 1,
    # Number of rows in this frame that share the passenger's ticket.
    dup_count=lambda df: df.groupby('Ticket')['Ticket'].transform('count'),
    # True unless both the family size and the ticket count equal 1.
    Group=lambda df: (df['Family_size'] != 1) | (df['dup_count'] != 1),
    Group_count=lambda df: df[['Family_size', 'dup_count']].max(axis=1),
    Fare_ind=lambda df: df['Fare'] / df['Group_count'],
)

In [481]:
# Cabin_alpha is the first character of each non-null Cabin value; rows with
# no cabin are absent from the mapped Series and therefore end up as NaN.
train['Cabin_alpha'] = train['Cabin'].dropna().map(lambda cabin: cabin[0])

In [482]:
# Non-null counts of every column for each (Pclass, Cabin_alpha) combination.
train.groupby(by=['Pclass', 'Cabin_alpha']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family_size,dup_count,Group,Group_count,Fare_ind
Pclass,Cabin_alpha,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,A,15,15,15,15,15,15,15,15,15,15,15,15,15
1,B,45,45,45,45,45,45,45,45,45,45,45,45,45
1,C,59,59,59,59,59,59,59,59,59,59,59,59,59
1,D,29,29,29,29,29,29,29,29,29,29,29,29,29
1,E,25,25,25,25,25,25,25,25,25,25,25,25,25
1,T,1,1,1,1,1,1,1,1,1,1,1,1,1
2,D,4,4,4,4,4,4,4,4,4,4,4,4,4
2,E,4,4,4,4,4,4,4,4,4,4,4,4,4
2,F,8,8,8,8,8,8,8,8,8,8,8,8,8
3,E,3,3,3,3,3,3,3,3,3,3,3,3,3


In [483]:
# Drop the intermediate helper columns and the raw columns they replace.
# `columns=` is used because pandas 2.0 no longer accepts the axis as a
# positional argument.
train = train.drop(
    columns=['Family_size', 'dup_count', 'Group_count', "Fare", 'Ticket', 'Cabin']
)

In [484]:
# Mark passengers without a cabin with the placeholder category "N".
# Assigning the result back (instead of a chained fillna(inplace=True)) keeps
# the update effective under pandas copy-on-write.
train['Cabin_alpha'] = train['Cabin_alpha'].fillna("N")

In [485]:
# Preview only the first rows instead of rendering the whole frame.
train.head(10)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Group,Fare_ind,Cabin_alpha
0,3,male,22.000000,1,0,S,True,3.625000,N
1,1,female,38.000000,1,0,C,True,35.641650,C
2,3,female,26.000000,0,0,S,False,7.925000,N
3,1,female,35.000000,1,0,S,True,26.550000,C
4,3,male,35.000000,0,0,S,False,8.050000,N
5,3,male,29.642093,0,0,Q,False,8.458300,N
6,1,male,54.000000,0,0,S,False,51.862500,E
7,3,male,2.000000,3,1,S,True,4.215000,N
8,3,female,27.000000,0,2,S,True,3.711100,N
9,2,female,14.000000,1,0,C,True,15.035400,N


In [486]:
# One-hot encode the categorical columns. Sex and Group are binary, so one
# dummy of each pair (Sex_female, Group_False) is dropped.
# `columns=` is used in drop() because pandas 2.0 no longer accepts the axis
# as a positional argument.
train = pd.get_dummies(
    train, columns=['Sex', 'Embarked', 'Group', 'Cabin_alpha']
).drop(columns=['Sex_female', 'Group_False'])

# log(1 + x) transform of the per-person fare.
# NOTE: re-running this cell would apply the transform a second time.
train['Fare_ind'] = np.log1p(train["Fare_ind"])

# Renumber the rows 0..n-1; drop=True discards the old index directly instead
# of materialising it as an 'index' column and dropping it afterwards.
train = train.reset_index(drop=True)

In [489]:
# Fit a z-score scaler on every column of the prepared feature frame.
scaler = StandardScaler().fit(train)
scaler

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [490]:
# Logistic regression tuned over the regularisation strength C and the
# penalty type with 5-fold cross-validation.
#
# The solver is pinned to 'liblinear' because the grid contains the 'l1'
# penalty. The captured output shows solver='warn', i.e. the run relied on a
# default that was about to change; the newer default ('lbfgs') does not
# support 'l1', so the L1 candidates would fail to fit without this.
lr_model = LogisticRegression(solver='liblinear', random_state=0)
params = {
    'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 1000],
    "penalty": ['l1', 'l2'],
}
gs = GridSearchCV(lr_model, param_grid=params, cv=5)

# NOTE(review): the scaler is fitted on the full training frame before the
# CV split, so each validation fold influences the scaling statistics.
gs.fit(scaler.fit_transform(train), target)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)






GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 1000], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [491]:
# Best score found by the grid search (scoring=None in the repr above, so the
# estimator's default scorer is used).
gs.best_score_

0.7952755905511811

In [492]:
# Hyper-parameter combination (C, penalty) that achieved the best score.
gs.best_params_

{'C': 0.5, 'penalty': 'l1'}