In [54]:
import pandas as pd

# Training set: PassengerId is only an identifier, so it becomes the index and
# stays out of the feature matrix.
data = pd.read_csv('csv/train.csv').set_index('PassengerId')
# Test set: PassengerId stays a regular column because the submission file
# built later needs it.
test = pd.read_csv('csv/test.csv')

In [55]:
# Apply the same feature engineering to the training and the test frame.
for frame in (data, test):
    # Encode Sex as two 0/1 integer indicator columns.
    sex = frame['Sex']
    frame['is_male'] = sex.eq('male').astype(int)
    frame['is_female'] = sex.eq('female').astype(int)

    # Missing ages are replaced with the sentinel value 0.
    frame['Age'] = frame['Age'].fillna(0)

In [56]:
# Preview the first five rows of the test frame with the new indicator columns.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_male,is_female
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0,1


In [57]:
# Columns that are not used as model features; Sex is already encoded as
# is_male / is_female.
unused_columns = ['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex']
data, test = (frame.drop(columns=unused_columns) for frame in (data, test))

In [58]:
# Preview the first five rows of the training frame after dropping unused columns.
data.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,is_male,is_female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,22.0,1,0,7.25,1,0
2,1,1,38.0,1,0,71.2833,0,1
3,1,3,26.0,0,0,7.925,0,1
4,1,1,35.0,1,0,53.1,0,1
5,0,3,35.0,0,0,8.05,1,0


In [59]:
# Separate the target vector from the feature matrix.
target_column = 'Survived'
y = data[target_column]
X = data.drop(columns=[target_column])

In [60]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the split reproducible; test_size=0.3 holds out
# 30% of the rows for validation and trains on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Unconstrained decision tree used as a baseline.
# random_state is fixed because scikit-learn randomly permutes the features at
# each split; without a seed the fitted tree (and therefore the scores) can
# differ between runs, even though the train/test split itself is seeded.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
y_predicted_train = tree.predict(X_train)

# Accuracy on the rows the tree was fitted on.
print('Train:', accuracy_score(y_train, y_predicted_train))

Train: 0.9807383627608347


In [62]:
# Hold-out accuracy of the baseline tree (no regularization).
y_predicted_test = tree.predict(X_test)
test_accuracy = accuracy_score(y_test, y_predicted_test)
print('Test:', test_accuracy)

Test: 0.746268656716418


In [63]:
# StringIO comes from the standard library: sklearn.externals.six is no longer
# shipped with scikit-learn, so importing it from there fails on current versions.
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# Draw the fitted tree with Graphviz. pydotplus only wraps the Graphviz
# binaries: they must be installed separately and be on PATH, otherwise
# create_png() raises "GraphViz's executables not found".
dot_data = StringIO()
export_graphviz(
    tree,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    impurity=False,
    # Take the names from the frame the tree was fitted on so the labels stay
    # in sync with the features instead of relying on a hand-written list.
    feature_names=list(X_train.columns),
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

InvocationException: GraphViz's executables not found

In [None]:
# Regularized tree: capping the depth and requiring at least 10 samples per
# leaf is meant to curb the overfitting of the unconstrained baseline
# (train accuracy far above test accuracy).
# random_state is fixed so the fitted tree and both scores are reproducible.
tree = DecisionTreeClassifier(min_samples_leaf=10, max_depth=3, random_state=42)
tree.fit(X_train, y_train)
y_predicted_train = tree.predict(X_train)
y_predicted_test = tree.predict(X_test)

print("Train:", accuracy_score(y_train, y_predicted_train))
print("Test:", accuracy_score(y_test, y_predicted_test))

In [None]:
# random forest (I don't know why, but it's worse than a tree)
from sklearn.ensemble import RandomForestClassifier

# Sweep the number of trees and report train / hold-out accuracy for each
# forest size, in the order listed.
for n_trees in (10, 25, 50, 100, 200, 500):
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=3, n_jobs=-1)
    forest.fit(X_train, y_train)

    y_predicted_train = forest.predict(X_train)
    y_predicted_test = forest.predict(X_test)
    forest_train_accuracy = accuracy_score(y_train, y_predicted_train)
    forest_test_accuracy = accuracy_score(y_test, y_predicted_test)

    # Separator line between the results of consecutive forest sizes.
    print('-- ' * 10)
    print("Train:", forest_train_accuracy)
    print("Test:", forest_test_accuracy)

In [64]:
# Replace missing values in the remaining numeric test columns with 0 so the
# frame handed to the model contains no NaNs in these columns.
for column in ('Pclass', 'SibSp', 'Parch', 'Fare'):
    test[column] = test[column].fillna(0)

test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,is_male,is_female
0,892,3,34.5,0,0,7.8292,1,0
1,893,3,47.0,1,0,7.0,0,1
2,894,2,62.0,0,0,9.6875,1,0
3,895,3,27.0,0,0,8.6625,1,0
4,896,3,22.0,1,1,12.2875,0,1


In [68]:
from sklearn.ensemble import RandomForestClassifier

# Final model for the submission. It is fitted on every labelled row (X, y)
# rather than only the 70% training split: the hold-out rows were needed for
# validation, but leaving them out of the final fit just discards data.
forest = RandomForestClassifier(random_state=3, n_estimators=25, n_jobs=-1)
forest.fit(X, y)

# Select the feature columns by name, in the order used for fitting, instead of
# relying on "everything except PassengerId" happening to line up.
y_predicted_test = forest.predict(test[X.columns])

# Two-column frame (PassengerId, Survived) that is written out as the submission.
submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': y_predicted_test})
submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,1


In [69]:
# Write the predictions to CSV; index=False leaves the DataFrame index out of
# the file so it contains only the PassengerId and Survived columns.
filename = 'forest-titanic-predictions.csv'
submission.to_csv(filename, index=False)
print(f'Saved file: {filename}')

Saved file: forest-titanic-predictions.csv
