In [1]:
import pandas as pd
train = pd.read_csv('./Titanic/train.csv')
test = pd.read_csv('./Titanic/test.csv')

### 查看数据基本信息

In [2]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [3]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


### 有些特征数据为空，需要对NaN数据做处理

In [4]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
#选取features
features = ['Pclass','Sex','Age','Embarked','SibSp','Parch','Fare']
x_train = train[features]
x_test = test[features]
y_train = train['Survived']
y_test = test

In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
Embarked    889 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [8]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         332 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [9]:
#x_train上面的Embarked属性有缺失值，打印看一下
print(x_train['Embarked'].value_counts())

S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [10]:
#对于像Embarked类别类型的数据，使用出现频率最高的来填充
x_train['Embarked'].fillna('S',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [11]:
x_train['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
#x_test中的Fare使用均值填充，age也使用均值填充
x_train['Age'].fillna(x_train['Age'].mean(),inplace=True)
x_test['Age'].fillna(x_test['Age'].mean(),inplace=True)
x_test['Fare'].fillna(x_test['Fare'].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [13]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Embarked    891 non-null object
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [14]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null object
Age         418 non-null float64
Embarked    418 non-null object
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


### 使用DictVecterizer对类别数据向量化处理

In [15]:
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(x_train.to_dict(orient='record'))
X_test = dict_vec.fit_transform(x_test.to_dict(orient='record'))
#打印转换后的features
dict_vec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

### 使用随机森林和XGBoost算法

In [16]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rfc  = RandomForestClassifier()
xgbc = XGBClassifier()

#采用5折交叉验证的方法，在默认的随机森林和XGBoost上进行性能测试
from sklearn.cross_validation import cross_val_score
print("随机森林平均准确率：%s"%(cross_val_score(rfc,X_train,y_train,cv=5).mean()))
print("XGBoost平均准确率：%s"%(cross_val_score(xgbc,X_train,y_train,cv=5).mean()))



随机森林平均准确率：0.802477101994
XGBoost平均准确率：0.818245597983


### 使用默认配置的算法进行预测

In [17]:
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
#够着DataFrame，结果保存文件
rfc_dataframe = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':rfc_predict})
rfc_dataframe.to_csv('./Titanic/rfc_predict.csv')

In [18]:
xgbc.fit(X_train,y_train)
xgbc_predict = xgbc.predict(X_test)
xgbc_dataframe = pd.DataFrame({'PassengerId':test['PassengerId'],'Survived':xgbc_predict})
xgbc_dataframe.to_csv('./Titanic/xgbc_predict.csv')

### XGBoost性能更高，使用网格搜索性能更好的超参数组合

In [19]:
from sklearn.grid_search import GridSearchCV
params = {'max_depth':list(range(2,7)),'n_estimators':list(range(100,1100,200)),'learning_rate':[0.05,0.1,0.25,0.5,1.0]}



In [20]:
xgbc_best = XGBClassifier()
gs = GridSearchCV(xgbc_best,params,n_jobs=-1,cv=5,verbose=1)
gs.fit(X_train,y_train)

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:  1.7min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_depth': [2, 3, 4, 5, 6], 'n_estimators': [100, 300, 500, 700, 900], 'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=1)

In [21]:
print(gs.best_score_)
print(gs.best_params_)

0.835016835016835
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}


In [22]:
xgbc_best_predict = gs.predict(X_test)
xgbc_best_dataframe = pd.DataFrame({"PassengerId":test['PassengerId'],"Survived":xgbc_best_predict})

In [23]:
xgbc_best_dataframe.to_csv('./Titanic/xgbc_best.csv')