In [1]:
#import the necessary packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
#load and read the associated training and test CSVs
X = pd.read_csv("train.csv", index_col='PassengerId')
X_test_full = pd.read_csv("test2.csv", index_col='PassengerId')

In [3]:
X_test_full.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,male,34.5,0,0,7.8292,Q
893,3,female,47.0,1,0,7.0,S
894,2,male,62.0,0,0,9.6875,Q
895,3,male,27.0,0,0,8.6625,S
896,3,female,22.0,1,1,12.2875,S


In [4]:
# remove the target column and add it to Y, a label column. So that the feature columns are inputs and target column is label
X.dropna(axis=0, subset=['Survived'], inplace=True)
y = X.Survived          
X.drop(['Survived'], axis=1, inplace=True)

In [5]:
# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)


# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality which means that the number of unique entries in non-int/float type columns is less than 10
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 1000000 and 
                        X_train_full[cname].dtype == "object"]

# Select numeric columns
numeric_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode the data
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

In [6]:
X_valid.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
496,3,29.0,0,0,14.4583,0,1,1,0,0
649,3,29.0,0,0,7.55,0,1,0,0,1
279,3,7.0,4,1,29.125,0,1,0,1,0
32,1,29.0,1,0,146.5208,1,0,1,0,0
256,3,29.0,0,2,15.2458,1,0,1,0,0


In [7]:
X_test.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,34.5,0,0,7.8292,0,1,0,1,0
893,3,47.0,1,0,7.0,1,0,0,0,1
894,2,62.0,0,0,9.6875,0,1,0,1,0
895,3,27.0,0,0,8.6625,0,1,0,0,1
896,3,22.0,1,1,12.2875,1,0,0,0,1


In [8]:
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
141,3,29.0,0,2,15.2458,1,0,1,0,0
440,2,31.0,0,0,10.5,0,1,0,0,1
818,2,31.0,1,1,37.0042,0,1,1,0,0
379,3,20.0,0,0,4.0125,0,1,1,0,0
492,3,21.0,0,0,7.25,0,1,0,0,1


In [9]:

X_train.shape

(712, 10)

In [10]:
#this is not necessary for this ipynb as i am using regressor not a classifier but keep these imported so that you an build
#sequential neural networks as well to improve the accuracy.
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy

In [11]:
#i am using the XGBOOST regressor
from xgboost import XGBRegressor
#from sklearn.metrics import mean_absolute_error

model = XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=10) # Your code here

# Fit the model
model.fit(X_train, y_train, early_stopping_rounds=100, eval_set=[(X_valid, y_valid)],verbose=False) # Your code here



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=10, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [12]:
preds=model.predict(X_valid)

In [13]:
preds2=preds.round()

In [14]:
y_valid

PassengerId
496    0
649    0
279    0
32     1
256    1
      ..
781    1
838    0
216    1
834    0
373    0
Name: Survived, Length: 179, dtype: int64

In [15]:
preds3=[]
for i in preds2:
  k = int(i)
  preds3.append(k)


In [16]:
trepid = pd.DataFrame(preds3)

In [17]:
trepid

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1
...,...
174,1
175,0
176,1
177,0


In [18]:
X_test[:-5]

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,34.5,0,0,7.8292,0,1,0,1,0
893,3,47.0,1,0,7.0000,1,0,0,0,1
894,2,62.0,0,0,9.6875,0,1,0,1,0
895,3,27.0,0,0,8.6625,0,1,0,0,1
896,3,22.0,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1300,3,30.0,0,0,7.7208,1,0,0,1,0
1301,3,3.0,1,1,13.7750,1,0,0,0,1
1302,3,30.0,0,0,7.7500,1,0,0,1,0
1303,1,37.0,1,0,90.0000,1,0,0,1,0


In [19]:
preds5 = model.predict(X_test)

In [27]:
preds5  #these values have been predicted in the range 0-1 where 0 is died and 1 is survived so we can round them and convert them to int values to get a decent prediction.

array([ 0.06134295,  0.57152385, -0.02415103,  0.14863509,  0.4931194 ,
        0.15839934,  0.66837835,  0.19143239,  0.827856  ,  0.0793213 ,
        0.12081459,  0.24798226,  0.9349012 , -0.00308806,  0.89440864,
        0.89871323,  0.11625677,  0.14668095,  0.49520585,  0.63210154,
        0.25416583,  0.5434928 ,  0.9184323 ,  0.36316642,  0.9634561 ,
        0.12339714,  0.96318054,  0.14668095,  0.34706977,  0.09584266,
        0.12387514,  0.13810077,  0.46437386,  0.28258425,  0.41043818,
        0.14575508,  0.4763823 ,  0.48777625,  0.14863509,  0.28933442,
        0.12246686,  0.4302482 ,  0.08606431,  0.85011065,  0.9709488 ,
        0.18594718,  0.27685267,  0.10269812,  0.8916595 ,  0.46702194,
        0.341278  ,  0.16867194,  0.8149877 ,  0.89939284,  0.12708691,
        0.08473948,  0.08606431,  0.12541637,  0.13776678,  0.9467578 ,
        0.12913656,  0.16926223,  0.12913656,  0.7668577 ,  0.6008092 ,
        0.8580219 ,  0.7668577 ,  0.25277606,  0.324457  ,  0.90

In [28]:
preds6 = preds5.round() #rounded values

In [29]:
preds7=[]   #appending the int values of the rounded into a list preds7
for i in preds6:
  k = int(i)
  preds7.append(k)

In [30]:
krepid = pd.DataFrame(preds7) #creating a table column out of the predicted values

In [31]:
krepid

Unnamed: 0,0
0,0
1,1
2,0
3,0
4,0
...,...
413,0
414,1
415,0
416,0


In [32]:
krepid.to_csv('titan_sub2.csv')  #step to convert this dataset to a csv file and later on you can copy paste the ids to make your sample submission

In [33]:
X_test[:]  # just something to compare side by side.

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,34.5,0,0,7.8292,0,1,0,1,0
893,3,47.0,1,0,7.0000,1,0,0,0,1
894,2,62.0,0,0,9.6875,0,1,0,1,0
895,3,27.0,0,0,8.6625,0,1,0,0,1
896,3,22.0,1,1,12.2875,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1305,3,30.0,0,0,8.0500,0,1,0,0,1
1306,1,39.0,0,0,108.9000,1,0,1,0,0
1307,3,38.5,0,0,7.2500,0,1,0,0,1
1308,3,30.0,0,0,8.0500,0,1,0,0,1
