# Setting up this notebook

In [1]:
# to support python 2 and python 3
from __future__ import division, print_function, unicode_literals

import os
import numpy as np
# to make this notebook's output stable across runs
np.random.seed(42)

# to make plots
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

HOUSING_PATH = "datasets/titanic"
# getting a dataframe object from the csv file
import pandas as pd
def load_housing_data(path,housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, path)
    return pd.read_csv(csv_path)
titanic = load_housing_data(path = "train.csv")
titanic_test = load_housing_data(path = "test.csv")

In [2]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
titanic = titanic.drop("Name",axis = 1)
titanic = titanic.drop("Ticket",axis = 1)
titanic = titanic.drop("Cabin",axis = 1)
titanic = titanic.drop("PassengerId",axis = 1)
titanic_test = titanic_test.drop("Name",axis = 1)
titanic_test = titanic_test.drop("Ticket",axis = 1)
titanic_test = titanic_test.drop("Cabin",axis = 1)
titanic_test = titanic_test.drop("PassengerId",axis = 1)

In [4]:
codes, uniques = pd.factorize(pd.concat([titanic['Sex'], titanic_test['Sex']]))
titanic['Sex'] = codes[:len(titanic)]
titanic_test['Sex'] = codes[len(titanic):]
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,S
1,1,1,1,38.0,1,0,71.2833,C
2,1,3,1,26.0,0,0,7.9250,S
3,1,1,1,35.0,1,0,53.1000,S
4,0,3,0,35.0,0,0,8.0500,S
5,0,3,0,,0,0,8.4583,Q
6,0,1,0,54.0,0,0,51.8625,S
7,0,3,0,2.0,3,1,21.0750,S
8,1,3,1,27.0,0,2,11.1333,S
9,1,2,1,14.0,1,0,30.0708,C


In [5]:
codes, uniques = pd.factorize(pd.concat([titanic['Embarked'], titanic_test['Embarked']]))
titanic['Embarked'] = codes[:len(titanic)]
titanic_test['Embarked'] = codes[len(titanic):]
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.2500,0
1,1,1,1,38.0,1,0,71.2833,1
2,1,3,1,26.0,0,0,7.9250,0
3,1,1,1,35.0,1,0,53.1000,0
4,0,3,0,35.0,0,0,8.0500,0
5,0,3,0,,0,0,8.4583,2
6,0,1,0,54.0,0,0,51.8625,0
7,0,3,0,2.0,3,1,21.0750,0
8,1,3,1,27.0,0,2,11.1333,0
9,1,2,1,14.0,1,0,30.0708,1


In [6]:
titanic.mean()
titanic["Age"].fillna(titanic.mean()["Age"],inplace = True)
titanic_test["Age"].fillna(titanic.mean()["Age"],inplace = True)
titanic


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.000000,1,0,7.2500,0
1,1,1,1,38.000000,1,0,71.2833,1
2,1,3,1,26.000000,0,0,7.9250,0
3,1,1,1,35.000000,1,0,53.1000,0
4,0,3,0,35.000000,0,0,8.0500,0
5,0,3,0,29.699118,0,0,8.4583,2
6,0,1,0,54.000000,0,0,51.8625,0
7,0,3,0,2.000000,3,1,21.0750,0
8,1,3,1,27.000000,0,2,11.1333,0
9,1,2,1,14.000000,1,0,30.0708,1


In [7]:
X,Y =titanic.drop("Survived",axis=1) ,titanic['Survived']

In [8]:
Y.shape

(891,)

In [9]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(titanic, test_size = 0.2 , random_state = 42)
print(len(train_set), len(test_set))

712 179


In [10]:
X_train,y_train = train_set.drop("Survived",axis = 1),train_set['Survived']
X_test, y_test =  test_set.drop("Survived",axis = 1),test_set['Survived']

In [11]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,0,45.500000,0,0,28.5000,0
733,2,0,23.000000,0,0,13.0000,0
382,3,0,32.000000,0,0,7.9250,0
704,3,0,26.000000,1,0,7.8542,0
813,3,1,6.000000,4,2,31.2750,0
118,1,0,24.000000,0,1,247.5208,1
536,1,0,45.000000,0,0,26.5500,0
361,2,0,29.000000,1,0,27.7208,1
29,3,0,29.699118,0,0,7.8958,0
55,1,0,29.699118,0,0,35.5000,0


In [12]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, verbose=0,
       warm_start=False)

In [13]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

array([ 0.62605042,  0.67510549,  0.62447257])

In [14]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
cross_val_score(forest_clf, X_train, y_train, cv=3,
                                    scoring = "accuracy")

array([ 0.79831933,  0.78481013,  0.8185654 ])

In [15]:
forest_clf.fit(X_train,y_train)
predictions = forest_clf.predict(X_test)

In [16]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.78770949720670391

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=8)
cross_val_score(knn_clf, X_train, y_train, cv=3,
                                    scoring = "accuracy")


array([ 0.70168067,  0.71729958,  0.74261603])

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_neighbors': [2, 4, 6, 8,10]},
  ]
grid_search = GridSearchCV(knn_clf, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=8, p=2,
           weights=u'distance'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{u'n_neighbors': [2, 4, 6, 8, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=u'neg_mean_squared_error', verbose=0)

In [19]:
grid_search.best_params_

{u'n_neighbors': 8}

In [20]:
knn_clf.fit(X_train,y_train)
predictions =knn_clf.predict(X_test)

In [21]:
accuracy_score(y_test,predictions)

0.73743016759776536

In [22]:
forest_clf.fit(X,Y)
pred_for =forest_clf.predict(X_test)
accuracy_score(y_test,pred_for)

0.97206703910614523

In [23]:
np.where(pd.isnull(titanic_test))

(array([152]), array([5]))

In [25]:
titanic_test["Fare"].fillna(titanic.mean()["Fare"],inplace = True)

In [26]:
predict = forest_clf.predict(titanic_test)

In [27]:
submission = pd.DataFrame({
        "PassengerId": load_housing_data(path = "test.csv")["PassengerId"],
        "Survived": predict
    })

In [None]:
csv_path = os.path.join(HOUSING_PATH, "submission.csv")
submission.to_csv(csv_path, index=False)


In [37]:
import tensorflow as tf
config = tf.contrib.learn.RunConfig(tf_random_seed = 42)

feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300,300,300], n_classes = 10,
                                        feature_columns = feature_cols)
dnn_clf = tf.contrib.learn.SKCompat(dnn_clf)
dnn_clf.fit(X_train,y_train,batch_size = 50 , steps = 40000)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 0, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f0d70c40fd0>, '_model_dir': '/tmp/tmphfUNpl', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': None, '_tf_random_seed': None, '_save_summary_steps': 100, '_environment': 'local', '_num_worker_replicas': 0, '_task_id': 0, '_log_step_count_steps': 100, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_evaluation_master': '', '_master': ''}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmphfUNpl/model.ckpt.
INFO:tensorflow:loss = 3.32333, step = 1
INFO:tensorflow:global_step/sec: 245.402
INFO:tensorflow:loss = 0.634354, step = 101 (0.410 sec)
INFO:tensorflow:global_step/sec: 222.719
INFO:tensorflow:loss = 0.70

INFO:tensorflow:global_step/sec: 246.328
INFO:tensorflow:loss = 0.252832, step = 7401 (0.405 sec)
INFO:tensorflow:global_step/sec: 255.682
INFO:tensorflow:loss = 0.419448, step = 7501 (0.391 sec)
INFO:tensorflow:global_step/sec: 255.683
INFO:tensorflow:loss = 0.294777, step = 7601 (0.391 sec)
INFO:tensorflow:global_step/sec: 254.556
INFO:tensorflow:loss = 0.197757, step = 7701 (0.393 sec)
INFO:tensorflow:global_step/sec: 255.62
INFO:tensorflow:loss = 0.254186, step = 7801 (0.391 sec)
INFO:tensorflow:global_step/sec: 249.293
INFO:tensorflow:loss = 0.408204, step = 7901 (0.401 sec)
INFO:tensorflow:global_step/sec: 246.077
INFO:tensorflow:loss = 0.367325, step = 8001 (0.407 sec)
INFO:tensorflow:global_step/sec: 243.313
INFO:tensorflow:loss = 0.23909, step = 8101 (0.411 sec)
INFO:tensorflow:global_step/sec: 248.182
INFO:tensorflow:loss = 0.327814, step = 8201 (0.403 sec)
INFO:tensorflow:global_step/sec: 254.717
INFO:tensorflow:loss = 0.267023, step = 8301 (0.393 sec)
INFO:tensorflow:global

INFO:tensorflow:loss = 0.173682, step = 15701 (0.567 sec)
INFO:tensorflow:global_step/sec: 182.884
INFO:tensorflow:loss = 0.171692, step = 15801 (0.547 sec)
INFO:tensorflow:global_step/sec: 240.963
INFO:tensorflow:loss = 0.260209, step = 15901 (0.415 sec)
INFO:tensorflow:global_step/sec: 192.726
INFO:tensorflow:loss = 0.166079, step = 16001 (0.519 sec)
INFO:tensorflow:global_step/sec: 243.03
INFO:tensorflow:loss = 0.113388, step = 16101 (0.411 sec)
INFO:tensorflow:global_step/sec: 248.985
INFO:tensorflow:loss = 0.216497, step = 16201 (0.401 sec)
INFO:tensorflow:global_step/sec: 242.076
INFO:tensorflow:loss = 0.248984, step = 16301 (0.413 sec)
INFO:tensorflow:global_step/sec: 195.955
INFO:tensorflow:loss = 0.152577, step = 16401 (0.510 sec)
INFO:tensorflow:global_step/sec: 180.988
INFO:tensorflow:loss = 0.176617, step = 16501 (0.555 sec)
INFO:tensorflow:global_step/sec: 164.207
INFO:tensorflow:loss = 0.254844, step = 16601 (0.608 sec)
INFO:tensorflow:global_step/sec: 167.232
INFO:tensor

INFO:tensorflow:loss = 0.164285, step = 24001 (0.479 sec)
INFO:tensorflow:global_step/sec: 235.14
INFO:tensorflow:loss = 0.133573, step = 24101 (0.425 sec)
INFO:tensorflow:global_step/sec: 253.129
INFO:tensorflow:loss = 0.072638, step = 24201 (0.395 sec)
INFO:tensorflow:global_step/sec: 235.364
INFO:tensorflow:loss = 0.0754976, step = 24301 (0.426 sec)
INFO:tensorflow:global_step/sec: 249.922
INFO:tensorflow:loss = 0.166121, step = 24401 (0.399 sec)
INFO:tensorflow:global_step/sec: 247.421
INFO:tensorflow:loss = 0.182792, step = 24501 (0.405 sec)
INFO:tensorflow:global_step/sec: 246.215
INFO:tensorflow:loss = 0.0783741, step = 24601 (0.405 sec)
INFO:tensorflow:global_step/sec: 248.497
INFO:tensorflow:loss = 0.161554, step = 24701 (0.403 sec)
INFO:tensorflow:global_step/sec: 187.105
INFO:tensorflow:loss = 0.0818821, step = 24801 (0.534 sec)
INFO:tensorflow:global_step/sec: 230.427
INFO:tensorflow:loss = 0.162737, step = 24901 (0.435 sec)
INFO:tensorflow:global_step/sec: 210.484
INFO:ten

INFO:tensorflow:loss = 0.0639502, step = 32301 (0.410 sec)
INFO:tensorflow:global_step/sec: 241.899
INFO:tensorflow:loss = 0.119813, step = 32401 (0.412 sec)
INFO:tensorflow:global_step/sec: 247.898
INFO:tensorflow:loss = 0.11623, step = 32501 (0.405 sec)
INFO:tensorflow:global_step/sec: 247.962
INFO:tensorflow:loss = 0.120416, step = 32601 (0.402 sec)
INFO:tensorflow:global_step/sec: 239.706
INFO:tensorflow:loss = 0.251687, step = 32701 (0.417 sec)
INFO:tensorflow:global_step/sec: 244.37
INFO:tensorflow:loss = 0.245373, step = 32801 (0.409 sec)
INFO:tensorflow:global_step/sec: 239.366
INFO:tensorflow:loss = 0.137756, step = 32901 (0.418 sec)
INFO:tensorflow:global_step/sec: 254.065
INFO:tensorflow:loss = 0.107716, step = 33001 (0.395 sec)
INFO:tensorflow:global_step/sec: 247.945
INFO:tensorflow:loss = 0.150147, step = 33101 (0.402 sec)
INFO:tensorflow:global_step/sec: 253.097
INFO:tensorflow:loss = 0.18908, step = 33201 (0.395 sec)
INFO:tensorflow:global_step/sec: 251.705
INFO:tensorf

SKCompat()

In [38]:
pred_for =dnn_clf.predict(X_test)
accuracy_score(y_test,pred_for['classes'])

INFO:tensorflow:Restoring parameters from /tmp/tmphfUNpl/model.ckpt-40000


0.78212290502793291

In [39]:
predict = dnn_clf.predict(titanic_test)
submission = pd.DataFrame({
        "PassengerId": load_housing_data(path = "test.csv")["PassengerId"],
        "Survived": predict['classes']
    })
csv_path = os.path.join(HOUSING_PATH, "submission_dnn.csv")
submission.to_csv(csv_path, index=False)

INFO:tensorflow:Restoring parameters from /tmp/tmphfUNpl/model.ckpt-40000


In [51]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
poly_scaler = Pipeline([
        ("poly_features", PolynomialFeatures(degree=3, include_bias=False)),
        ("std_scaler", StandardScaler()),
    ])

X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_test)

from sklearn.base import clone

sgd_reg = SGDRegressor(n_iter = 1,penalty = None,learning_rate = "constant",eta0 = 0.0005,warm_start = True)

min_val_error = float("inf")
best_epoch = None
best_model = None
print('begin')
for epoch in range(1000):
    print(epoch ,":epoch")
    sgd_reg.fit(X_train_poly_scaled,y_train)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val_predict,y_test)
    if val_error < min_val_error:
        min_val_error = val_error
        best_epoch = epoch
        print(best_epoch,"error: ",min_val_error)
        best_model = clone(sgd_reg)
best_epoch,best_model


begin
0 :epoch
0 error:  0.219244137078
1 :epoch
1 error:  0.177013236137
2 :epoch
2 error:  0.162284039595
3 :epoch
3 error:  0.152602073455
4 :epoch
4 error:  0.147741998657
5 :epoch
5 error:  0.141745581506
6 :epoch
7 :epoch
8 :epoch
8 error:  0.139288834164
9 :epoch
10 :epoch
10 error:  0.138549131505
11 :epoch
12 :epoch
13 :epoch
14 :epoch
14 error:  0.137965963899
15 :epoch
16 :epoch
17 :epoch
18 :epoch
19 :epoch
20 :epoch
21 :epoch
21 error:  0.137365787881
22 :epoch
23 :epoch
24 :epoch
24 error:  0.137060022227
25 :epoch
26 :epoch
27 :epoch
28 :epoch
29 :epoch
30 :epoch
31 :epoch
32 :epoch
33 :epoch
34 :epoch
35 :epoch
36 :epoch
37 :epoch
38 :epoch
39 :epoch
40 :epoch
41 :epoch
42 :epoch
43 :epoch
44 :epoch
44 error:  0.136644225158
45 :epoch
46 :epoch
47 :epoch
48 :epoch
48 error:  0.136148832591
49 :epoch
50 :epoch
51 :epoch
52 :epoch
53 :epoch
54 :epoch
55 :epoch
56 :epoch
57 :epoch
58 :epoch
59 :epoch
60 :epoch
60 error:  0.136052433153
61 :epoch
62 :epoch
63 :epoch
64 :epo

761 :epoch
762 :epoch
763 :epoch
764 :epoch
765 :epoch
766 :epoch
767 :epoch
768 :epoch
769 :epoch
770 :epoch
771 :epoch
772 :epoch
773 :epoch
774 :epoch
775 :epoch
776 :epoch
777 :epoch
778 :epoch
779 :epoch
780 :epoch
781 :epoch
782 :epoch
783 :epoch
784 :epoch
785 :epoch
786 :epoch
787 :epoch
788 :epoch
789 :epoch
790 :epoch
791 :epoch
792 :epoch
793 :epoch
794 :epoch
795 :epoch
796 :epoch
797 :epoch
798 :epoch
799 :epoch
800 :epoch
801 :epoch
802 :epoch
803 :epoch
804 :epoch
805 :epoch
806 :epoch
807 :epoch
808 :epoch
809 :epoch
810 :epoch
811 :epoch
812 :epoch
813 :epoch
814 :epoch
815 :epoch
816 :epoch
817 :epoch
818 :epoch
819 :epoch
820 :epoch
821 :epoch
822 :epoch
823 :epoch
824 :epoch
825 :epoch
826 :epoch
827 :epoch
828 :epoch
829 :epoch
830 :epoch
831 :epoch
832 :epoch
833 :epoch
834 :epoch
835 :epoch
836 :epoch
837 :epoch
838 :epoch
839 :epoch
840 :epoch
841 :epoch
842 :epoch
843 :epoch
844 :epoch
845 :epoch
846 :epoch
847 :epoch
848 :epoch
849 :epoch
850 :epoch
851 :epoch

(74, SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.0005,
        fit_intercept=True, l1_ratio=0.15, learning_rate=u'constant',
        loss='squared_loss', n_iter=1, penalty=None, power_t=0.25,
        random_state=None, shuffle=True, verbose=0, warm_start=True))

In [55]:
best_model.fit(X_train,y_train)
pred_for =best_model.predict(X_test)
accuracy_score(y_test,pred_for)

ValueError: Can't handle mix of binary and continuous