In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('data/train.csv')
print(df_train.shape)
df_train.head()

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df_test = pd.read_csv('data/test.csv')
print(df_test.shape)
df_test.head()

(418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 1. 特征处理

可能有用的特征：

+ Pclass  ticket class 可以处理成 1-of-K 的形式，或者直接使用
+ Sex. 可以处理成0-1
+ Age  直接使用，需要处理空值
+ SibSp. 是否有兄弟姐妹或配偶在船上。
+ Parch。是否有父母或孩子在船上，
+ Fare。似乎可以直接使用
+ cabin
+ Embarked。在哪里上的船 可以处理成 1-of-K的形式

In [4]:
feature = 'Pclass'
print(set(df_train[feature]))
print(set(df_test[feature]))

{1, 2, 3}
{1, 2, 3}


In [8]:
df_train['Pclass-1'] = df_train['Pclass'].map(lambda x: 1 if x==1 else 0)
df_train['Pclass-2'] = df_train['Pclass'].map(lambda x: 1 if x==2 else 0)
df_train['Pclass-3'] = df_train['Pclass'].map(lambda x: 1 if x==3 else 0)

df_test['Pclass-1'] = df_test['Pclass'].map(lambda x: 1 if x==1 else 0)
df_test['Pclass-2'] = df_test['Pclass'].map(lambda x: 1 if x==2 else 0)
df_test['Pclass-3'] = df_test['Pclass'].map(lambda x: 1 if x==3 else 0)

print(df_train.shape)
# df_train.head()

(891, 16)


In [6]:
feature = 'Sex'
print(set(df_train[feature]))
print(set(df_test[feature]))

{'male', 'female'}
{'male', 'female'}


In [9]:
df_train['is_male'] = df_train['Sex'].map(lambda x: 1 if x=='male' else 0)
df_test['is_male'] = df_test['Sex'].map(lambda x: 1 if x=='male' else 0)

print(df_train.shape)
# df_train.head()

(891, 16)


In [14]:
print(sum(df_train['Age'].isna()))
df_train['Age'].describe()
print(sum(df_test['Age'].isna()))
df_test['Age'].describe()

# Age 主要要处理空值
df_train['Age'] = df_train['Age'].fillna(0)
df_test['Age'] = df_train['Age'].fillna(0)

177
86


In [17]:
feature = 'SibSp'
print(set(df_train[feature]))
print(set(df_test[feature]))

feature = 'Parch'
print(set(df_train[feature]))
print(set(df_test[feature]))

{0, 1, 2, 3, 4, 5, 8}
{0, 1, 2, 3, 4, 5, 8}
{0, 1, 2, 3, 4, 5, 6}
{0, 1, 2, 3, 4, 5, 6, 9}


In [18]:
sibsp_df_train = pd.get_dummies(df_train['SibSp'], prefix='sibsp_')
sibsp_df_test = pd.get_dummies(df_test['SibSp'], prefix='sibsp_')

df_train = pd.concat([df_train, sibsp_df_train], axis=1)
df_test = pd.concat([df_test, sibsp_df_test], axis=1)

print(df_train.shape)

(891, 23)


In [19]:
parch_train = pd.get_dummies(df_train['Parch'], prefix='parch_')
parch_test = pd.get_dummies(df_test['Parch'], prefix='parch_')

df_train = pd.concat([df_train, parch_train], axis=1)
df_test = pd.concat([df_test, parch_test], axis=1)

print(df_train.shape)

(891, 30)


In [20]:
print(df_test.shape)
# 测试集里多一个取值可能，但是训练集里面没有，所以这个特征起不了什么作用

(418, 30)


In [23]:
print(sum(df_train['Fare'].isna()))
print(sum(df_test['Fare'].isna()))
df_test['Fare'].fillna(0)

0
1


In [25]:
# 上船的地点
print(set(df_train['Embarked']))
print(set(df_test['Embarked']))

df_train['Embarked'].fillna('N')
df_embark_train = pd.get_dummies(df_train['Embarked'], prefix='embarked_')
df_embark_test = pd.get_dummies(df_test['Embarked'], prefix='embarked_')

df_train = pd.concat([df_train, df_embark_train], axis=1)
df_test = pd.concat([df_test, df_embark_test], axis=1)
df_test['embarked_N'] = 0

print(df_test.shape)
df_test.head()

{nan, 'Q', 'S', 'C'}
{'Q', 'S', 'C'}
(418, 34)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,...,parch__2,parch__3,parch__4,parch__5,parch__6,parch__9,embarked__C,embarked__Q,embarked__S,embarked_N
0,892,3,"Kelly, Mr. James",male,22.0,0,0,330911,7.8292,,...,0,0,0,0,0,0,0,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,38.0,1,0,363272,7.0,,...,0,0,0,0,0,0,0,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",male,26.0,0,0,240276,9.6875,,...,0,0,0,0,0,0,0,1,0,0
3,895,3,"Wirz, Mr. Albert",male,35.0,0,0,315154,8.6625,,...,0,0,0,0,0,0,0,0,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,35.0,1,1,3101298,12.2875,,...,0,0,0,0,0,0,0,0,1,0


In [26]:
print(df_train.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Pclass-1', 'Pclass-2',
       'Pclass-3', 'is_male', 'sibsp__0', 'sibsp__1', 'sibsp__2', 'sibsp__3',
       'sibsp__4', 'sibsp__5', 'sibsp__8', 'parch__0', 'parch__1', 'parch__2',
       'parch__3', 'parch__4', 'parch__5', 'parch__6', 'embarked__C',
       'embarked__Q', 'embarked__S'],
      dtype='object')


In [27]:
sel_cols = ['Age', 'Pclass-1', 'Pclass-2', 'Pclass-3', 'is_male', 'sibsp__0', 'sibsp__1', 
            'sibsp__2', 'sibsp__3',
       'sibsp__4', 'sibsp__5', 'sibsp__8', 'parch__0', 'parch__1', 'parch__2',
       'parch__3', 'parch__4', 'parch__5', 'parch__6', 'embarked__C',
       'embarked__Q', 'embarked__S']

## 2.训练模型

### 2.1 LR模型

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

data_train_total = df_train[sel_cols].values
label_train_total = df_train['Survived'].values
data_test = df_test[sel_cols].values

data_train, data_vaild, label_train, label_vaild = train_test_split(data_train_total, label_train_total, test_size=0.3, random_state=1)

print(data_train.shape)
print(label_train.shape)
print(data_vaild.shape)
print(label_vaild.shape)

(623, 22)
(623,)
(268, 22)
(268,)


In [32]:
scaler = StandardScaler()
scaler.fit(data_train)

data_train = scaler.transform(data_train)
data_vaild = scaler.transform(data_vaild)
data_test = scaler.transform(data_test)

In [38]:
clf = LogisticRegression(penalty='l2', class_weight={1:1, 0:1}, solver='liblinear', tol=1e-4, C=1.0)
clf.fit(data_train, label_train)

y_train_pred = clf.predict(data_train)
y_vaild_pred = clf.predict(data_vaild)
y_test_pred = clf.predict(data_test)

In [39]:
print(y_train_pred)

[1 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 0
 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 1 0 0
 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 0 1 0 1 0 0 0 1 1 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0
 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0
 1 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0
 1 1 1 1 0 1 1 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0 1 0 0
 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0
 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1
 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0
 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 1 1 0 0 0 0 1 1 

In [40]:
print(sum(y_train_pred))

213


In [41]:
# 评测效果
def evalute(y_label, y_pred):
    n = len(y_label)
    correct = 0
    for i in range(0, len(y_label)):
        if y_label[i]==y_pred[i]:
            correct = correct + 1
    return correct/n*100


acc_train = evalute(y_train_pred, label_train)
acc_vaild = evalute(y_vaild_pred, label_vaild)

print('training accuracy: %f%%' % (acc_train))
print('vaild accuracy: %f%%' % (acc_vaild))

training accuracy: 82.343499%
vaild accuracy: 76.492537%


In [42]:
# 生成提交文件
df_res = pd.DataFrame()
df_res['PassengerId'] = df_test['PassengerId']
df_res['Survived'] = y_test_pred

print(df_res.shape)
df_res.head()

(418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [43]:
df_res.to_csv('data/lr_result.csv', index=False)
print('finished')

finished
