In [522]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
combined = train.append(test)
combined = combined.reset_index().drop(['index'], axis=1)
train_original = train.copy()
test_original = test.copy()
combined.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [523]:
%matplotlib inline
#import matplotlib.pyplot as plt
#train.hist(bins=50, figsize=(20,15))
#plt.show()

In [524]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

pipeline_sex1 = ColumnTransformer([
        ("sex",  OrdinalEncoder(), ['Sex'])
    ])

combined['Sex'] = pipeline_sex1.fit_transform(combined)

In [525]:
def cabin_data(df):
    c = df['Cabin']
    c = c.str.extract(r'([a-zA-z]\d{0,3})', expand=False).fillna(value="U").str.split('(\d+)', expand=True)
    c = c.drop(2, axis=1)
    c[1] = c[1].fillna(value="0").astype(int).replace(0,np.nan)
    c[1] = c[1].replace(np.nan, 0)#c[1].max()/2)
    c = c.rename({0:'Deck', 1:'Room'}, axis=1)
    df = df.drop('Cabin', axis=1)
    df = pd.concat([df, c], axis=1)
    return df

def ticket_data(df):
    t = df['Ticket'].str.split(pat=r'(\d+$)', expand=True).sort_values(by=0, ascending=False)
    t[0] = t[0].str.replace(r"[^a-zA-Z\d]+", "").str.upper()
    t = t.replace({'CASOTON':'CA','SCAHBASLE':'SC', 'SCPARIS':'SC', 'SOTONO2':'SOTONOQ', 'STONO2':'SOTONOQ', 'SCA4':'A4', 'STONOQ':'SOTONOQ', 'SCAH':'SC', 'SCOW':'SC', 'SCA3':'SC', 'SOP':'SOPP'}).drop(2, axis=1)
    small_count = t[0].value_counts()[t[0].value_counts() < 20].keys()
    other_dict = {k:"Other code" for k in small_count}
    t = t.replace(other_dict)
    t = t.rename({0:'Ticket code', 1:'Ticket number'}, axis=1)
    df = df.drop('Ticket', axis=1)
    df = pd.concat([df, t], axis=1)
    df['Ticket number'] = df['Ticket number'].fillna(0).astype(int)
    df['Ticket code'] = df['Ticket code'].replace("", "Other code")
    return df

def name_data(df):
    df = df.join(df['Name'].str.split(pat=",", expand=True))
    df0 = df[1].str.split(pat=r'([a-zA-z]+\.)', expand=True, n=1).drop(0, axis=1)
    df = df.drop(1, axis=1).join(df0)
    df = df.rename(columns={0: 'Surname', 1: 'Title'})
    df1 = df[2].str.split(pat=r'(\([a-zA-z ]+\))', expand=True).drop(2, axis=1)
    df = df.drop(2, axis=1).join(df1)
    df = df.rename(columns={0: 'First name', 1: 'First name2'}).drop('Name', axis=1)
    df['First name'] = df['First name'].str.strip()
    df['First name2'] = df['First name2'].fillna("None")
    return df

def married(df):
    women = df[df['First name2'] != "None"][df['Sex'] ==0][df['SibSp'] > 0]
    men = df[df['Sex'] ==1][df['SibSp'] > 0][(df['Age'] >=16) | (df['Age'].isna())]
    men_women =pd.concat([women,men])
    married = (men_women.groupby(['Surname','First name']).size() == 2)
    df = df.set_index(['Surname', 'First name'])
    df['Spouse'] = married.astype('int')
    df['Spouse'] = df['Spouse'].fillna(0)
    df['Sibling'] = df['SibSp'] - df['Spouse']
    df = df.reset_index()
    age_fill = df[df['Spouse'] == 1].groupby(['Surname','First name'])['Age'].apply(lambda x: x.ffill().bfill())
    df.loc[age_fill.index]['Age']  = age_fill.values
    df['Age'].update(age_fill)
    df = df.drop('SibSp', axis=1)
    #df = df.drop(['Spouse', 'Sibling'], axis=1)
    return df

def parents(df):
    children = df[df['Age']<16]
    children['Parents'] = children['Parch']
    children['Parch'] = 0
    children = children[['Parch', 'Parents']]

    df['Parents'] = 0
    df['Parch'].update(children['Parch'])
    df['Parents'].update(children['Parents'])
    return df

def ticket_range(df):
    train_tickets = df[df['Ticket number'].notnull()]['Ticket number'].astype(int)
    bins = np.arange(0,train_tickets.max() + 10, 10)
    train_tickets = pd.cut(train_tickets, bins)
    df['Ticket range'] = train_tickets.astype(str)
    return df

def family(df):
    family_count = df.groupby(['Surname', 'Ticket range']).size()
    df = df.reset_index()
    df = df.sort_values(['Surname']).set_index(['Surname', 'Ticket range', 'PassengerId'])
    df['Family count'] = family_count-1
    tindex= df[df['Family count'] >0]['Family count'].index
    count_update = df.loc[tindex.unique()]['Family count']
    df['Family count'].update(count_update)
    df['Family count2'] = df['Parch'] + df['Spouse'] + df['Parents'] + df['Sibling']
    df['Family count'] = df[['Family count', 'Family count2']].max(axis=1)
    #df['Family count'] = df['Family count'] -  df['Parch'] - df['Spouse'] - df['Parents'] - df['Sibling']
    df = df.drop('Family count2', axis=1)
    return df

def calculate_fare(df):
    df['Ticket number'] = df['Ticket number'].astype(int)
    tt = df.groupby('Ticket number').size()
    df = df.sort_values('Ticket number')
    df = df.set_index('Ticket number')
    df['Ticket count'] = tt
    df['Fare'] = df['Fare']/df['Ticket count']
    df = df.reset_index()
    df['Fare'] = df['Fare'].replace(0, np.nan)
    grouped = df.groupby('Pclass').Fare
    df['Fare'] = grouped.apply(lambda x: x.fillna(x.mean()))
    return df

def friends(df):
    df['Ticket count'] = df['Ticket count'] - df['Family count'] - 1
    df['Ticket count'] = df['Ticket count'].clip(lower=0)
    return df

In [526]:
combined = cabin_data(combined)
combined = ticket_data(combined)
combined = name_data(combined)
combined = married(combined)
combined = calculate_fare(combined)

#combined['Title'] = combined['Title'].replace({'Col.':'Military','Major.':'Military', 'Capt.':'Military', 'Countess.':'Nobility', 'Sir.':'Nobility', 'Mlle.':'Miss.', 'Ms.':'Miss.', 'Mme.':'Mrs.', 'Don.':'Mr.', 'Dona.':'Mrs.', 'Lady.':'Nobility', 'Jonkheer.':'Nobility'})
combined['Title'] = combined['Title'].replace({'Col.':'Other title','Major.':'Other title', 'Capt.':'Other title', 'Countess.':'Other title', 'Sir.':'Other title', 'Mlle.':'Miss.', 'Ms.':'Miss.', 'Mme.':'Mrs.', 'Don.':'Mr.', 'Dona.':'Mrs.', 'Lady.':'Other title', 'Jonkheer.':'Other title', 'Rev.':'Other title', 'Dr.':'Other title'})

grouped = combined.groupby(['Sex','Pclass', 'Title'])  
#grouped.Age.median()
combined.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))

combined = parents(combined)
combined = ticket_range(combined)
combined = family(combined)
combined = friends(combined)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [527]:
combined = combined.sort_values(by = 'PassengerId')
combined = combined.reset_index()

In [528]:
combined[combined['Ticket count'] > 0]

Unnamed: 0,Surname,Ticket range,PassengerId,index,Ticket number,First name,Age,Embarked,Fare,Parch,...,Deck,Room,Ticket code,Title,First name2,Spouse,Sibling,Ticket count,Parents,Family count
6,McCarthy,"(17460.0, 17470.0]",7,367,17463,Timothy J,54.0,S,25.931250,0,...,E,46.0,Other code,Mr.,,0.0,0.0,1.0,0,0.0
20,Fynney,"(239860.0, 239870.0]",21,812,239865,Joseph J,35.0,S,13.000000,0,...,U,0.0,Other code,Mr.,,0.0,0.0,1.0,0,0.0
31,Spencer,"(17560.0, 17570.0]",32,395,17569,William Augustus,57.0,C,48.840267,0,...,B,78.0,PC,Mrs.,(Marie Eugenie),1.0,0.0,1.0,0,1.0
50,Panula,"(3101290.0, 3101300.0]",51,1290,3101295,Juha Niilo,7.0,S,5.669643,0,...,U,0.0,Other code,Master.,,0.0,4.0,1.0,1,5.0
52,Harper,"(17570.0, 17580.0]",53,397,17572,Henry Sleeper,49.0,C,25.576400,0,...,D,33.0,PC,Mrs.,(Myna Haxtun),1.0,0.0,1.0,0,1.0
61,Icard,"(113570.0, 113580.0]",62,702,113572,Amelie,38.0,,40.000000,0,...,B,28.0,Other code,Miss.,,0.0,0.0,1.0,0,0.0
72,Hood,"(14870.0, 14880.0]",73,346,14879,Ambrose Jr,21.0,S,10.500000,0,...,U,0.0,Other code,Mr.,,0.0,0.0,6.0,0,0.0
74,Bing,"(1600.0, 1610.0]",75,27,1601,Lee,32.0,S,7.061975,0,...,U,0.0,Other code,Mr.,,0.0,0.0,7.0,0,0.0
79,Dowdell,"(364510.0, 364520.0]",80,1151,364516,Elizabeth,30.0,S,6.237500,0,...,U,0.0,Other code,Miss.,,0.0,0.0,1.0,0,0.0
109,Moran,"(371100.0, 371110.0]",110,1203,371110,Bertha,18.0,Q,8.050000,0,...,U,0.0,Other code,Miss.,,0.0,1.0,1.0,0,1.0


In [529]:

train_full_y = combined['Survived'][0:891]
test_passenger = combined['PassengerId'][891:]


combined = combined.drop(['Survived','Surname', 'Ticket range', 'PassengerId', 'First name', 'Ticket number', 'First name2', 'index', 'Ticket code',  'Spouse'], axis = 1)

In [530]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

cat_attribs = ['Title', 'Deck', 'Embarked']
num_attribs = ["Pclass", 'Sex', 'Age', 'Room', 'Family count', 'Fare', 'Sibling', 'Parents', 'Parch', 'Ticket count']

cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ("encode", OneHotEncoder()),
    ])

num_pipeline = Pipeline([
        #('std_scaler', StandardScaler()),
        ('minmax_scaler', StandardScaler()),
    ])

pipeline = ColumnTransformer([
        ("cat", cat_pipeline, cat_attribs),
        ("num", num_pipeline, num_attribs),

    ])

combined_prepared = pipeline.fit_transform(combined)

In [531]:
train_full_prepared = combined_prepared[0:891]
train_prepared = train_full_prepared[0:800]
val_prepared = train_full_prepared[800:891]
test_prepared = combined_prepared[891:]

train_y = train_full_y[0:800]
val_y = train_full_y[800:891]


In [514]:
from sklearn.ensemble import RandomForestClassifier

forest_reg = RandomForestClassifier(n_estimators=100, random_state=42)
forest_reg.fit(train_prepared, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [143]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30, 100], 'max_features': [2, 4, 5]},
    # then try 6 (2×3) combinations with bootstrap set as False
    #{'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestClassifier(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='accuracy',
                           return_train_score=True)
grid_search.fit(train_prepared, train_y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{

In [144]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 10}

In [145]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=4, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [146]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.7875 {'max_features': 2, 'n_estimators': 3}
0.80375 {'max_features': 2, 'n_estimators': 10}
0.80375 {'max_features': 2, 'n_estimators': 30}
0.79875 {'max_features': 2, 'n_estimators': 100}
0.81 {'max_features': 4, 'n_estimators': 3}
0.81875 {'max_features': 4, 'n_estimators': 10}
0.8125 {'max_features': 4, 'n_estimators': 30}
0.80875 {'max_features': 4, 'n_estimators': 100}
0.805 {'max_features': 5, 'n_estimators': 3}
0.8175 {'max_features': 5, 'n_estimators': 10}
0.81125 {'max_features': 5, 'n_estimators': 30}
0.8125 {'max_features': 5, 'n_estimators': 100}


In [147]:
combined.shape

(1309, 15)

In [532]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=1000),
        'max_features': randint(low=1, high=11),
        "max_depth": [2, 3, 4, 5, None],
        'min_samples_split': randint(low=2, high=11),
        'min_samples_leaf': randint(low=1, high=11),
        "bootstrap": [True, False],
    }

rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=300, cv=5, scoring='accuracy', random_state=42)
rnd_search.fit(train_prepared, train_y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [533]:
best_params = rnd_search.best_params_
print(best_params)

{'bootstrap': False, 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 7, 'min_samples_split': 2, 'n_estimators': 387}


In [534]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.8325 {'bootstrap': True, 'max_depth': 5, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 122}
0.8275 {'bootstrap': True, 'max_depth': 4, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 872}
0.8275 {'bootstrap': False, 'max_depth': 4, 'max_features': 6, 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 344}
0.8225 {'bootstrap': False, 'max_depth': 3, 'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 253}
0.705 {'bootstrap': False, 'max_depth': 2, 'max_features': 1, 'min_samples_leaf': 10, 'min_samples_split': 4, 'n_estimators': 876}
0.8375 {'bootstrap': True, 'max_depth': 5, 'max_features': 9, 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 819}
0.8275 {'bootstrap': True, 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 7, 'min_samples_split': 3, 'n_estimators': 388}
0.80625 {'bootstrap': True, 'max_depth': 5, 'max_features': 2, 'min_samples_leaf': 1

In [535]:
feature_importances = rnd_search.best_estimator_.feature_importances_

In [536]:
cat_encoder = pipeline.named_transformers_["cat"]
attributes = list(cat_encoder.named_steps["encode"].categories_[0]) + num_attribs + list(cat_encoder.named_steps["encode"].categories_[1]) + list(cat_encoder.named_steps["encode"].categories_[2])
#num_attribs + cat_encoder.named_steps["encode"].categories_[0]
#pipeline
#cat_pipeline.named_steps["encode"] # old solution
#cat_one_hot_attribs = list(cat_encoder.categories_[0])
#attributes = num_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(0.2428517804308791, 'Mr.'),
 (0.18820580662615258, 'D'),
 (0.09168647117331129, 'T'),
 (0.07502716823924176, 'C'),
 (0.06796624982623711, 'E'),
 (0.0637664788577669, 'Miss.'),
 (0.05512176640705266, 'G'),
 (0.0514885891613714, 'F'),
 (0.04266283173683037, 'Mrs.'),
 (0.031002586094844124, 'U'),
 (0.022576243294612015, 'Parch'),
 (0.01591653592916742, 'S'),
 (0.008819915153618192, 'Other title'),
 (0.008610613788224718, 'Master.'),
 (0.007640954529707073, 'B'),
 (0.004955552980938397, 'Q'),
 (0.004804446039812118, 'A'),
 (0.004749519687252143, 'C'),
 (0.003977555990889725, 'Ticket count'),
 (0.0035166618462716707, 'Family count'),
 (0.0023885516318013296, 'Age'),
 (0.0013026829820727405, 'Room'),
 (0.0007505860870270558, 'Sex'),
 (0.00013663766006647985, 'Pclass'),
 (7.381384485167169e-05, 'Fare'),
 (0.0, 'Sibling'),
 (0.0, 'Parents')]

In [537]:
val_predict = rnd_search.predict(val_prepared)

In [538]:
rnd_search.score(val_prepared, val_y)

0.8461538461538461

In [501]:
forest_reg_full = RandomForestClassifier(max_depth=best_params['max_depth'], max_features=best_params['max_features'], n_estimators=best_params['n_estimators'], random_state=42)
forest_reg_full.fit(train_prepared, train_y)
forest_reg_full.score(train_prepared, train_y)

0.87375

In [502]:
param_distribs = {
        'n_estimators': randint(low=1, high=1000),
        'max_features': randint(low=1, high=11),
        "max_depth": [2, 3, 4, 5, None],
        'min_samples_split': randint(low=2, high=11),
        'min_samples_leaf': randint(low=1, high=11),
        "bootstrap": [True, False],
    }

rnd_search_full = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=100, cv=5, scoring='accuracy')
rnd_search_full.fit(train_full_prepared, train_full_y)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=100,
                                                    n_jobs=None,
 

In [503]:
cvres = rnd_search_full.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

0.8316498316498316 {'bootstrap': False, 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 327}
0.8271604938271605 {'bootstrap': True, 'max_depth': 4, 'max_features': 4, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 264}
0.7171717171717171 {'bootstrap': False, 'max_depth': 2, 'max_features': 1, 'min_samples_leaf': 9, 'min_samples_split': 3, 'n_estimators': 545}
0.8204264870931538 {'bootstrap': False, 'max_depth': 3, 'max_features': 8, 'min_samples_leaf': 5, 'min_samples_split': 7, 'n_estimators': 210}
0.8249158249158249 {'bootstrap': False, 'max_depth': 3, 'max_features': 7, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 662}
0.8013468013468014 {'bootstrap': False, 'max_depth': 2, 'max_features': 6, 'min_samples_leaf': 4, 'min_samples_split': 7, 'n_estimators': 102}
0.8338945005611672 {'bootstrap': True, 'max_depth': None, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators':

In [327]:
test_predict = rnd_search_full.predict(test_prepared).astype(int)

In [328]:
test_submission = pd.DataFrame({'PassengerId':test_passenger.values, 'Survived':test_predict})

test_submission.to_csv('test_submission12.csv', index=False)

In [187]:
tt = pd.read_csv("test_submission8.csv")
tt9 = pd.read_csv("test_submission10.csv")

In [9]:
aa = pd.read_csv("gender_submission.csv")

In [16]:
tt9['Survived'] = tt9['Survived'].astype(int)

In [18]:
tt9.to_csv('test_submission9.csv', index=False)

In [19]:
tt9 = pd.read_csv("test_submission9.csv")

In [20]:
tt9

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0
