## In this notebook we will work on titanic dataset, without using pipeline 

In [1]:
#@ Importing the required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer             # for missing data
from sklearn.preprocessing import OneHotEncoder      # for categorical data
from sklearn.preprocessing import MinMaxScaler       # for scaling 
from sklearn.tree import DecisionTreeClassifier      # for prediction

In [2]:
#@ Loading and reading the dataset
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# removing some unnecessary columns
data.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Our aim is that using all data, we have to find that the passenger is survived or not........

In [4]:
#@ doing train/test/split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns = ['Survived']),
                                                   data['Survived'],
                                                   test_size = 0.3,
                                                   random_state = 42)

In [5]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
445,1,male,4.0,0,2,81.8583,S
650,3,male,,0,0,7.8958,S
172,3,female,1.0,1,1,11.1333,S
450,2,male,36.0,1,2,27.75,S
314,2,male,43.0,1,1,26.25,S


In [6]:
y_train.head()

445    1
650    0
172    1
450    0
314    0
Name: Survived, dtype: int64

In [7]:
# checking the missing values
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Here, in our dataset there is some missing values so applying imputation we fill those missing values.

for age columns we fill it by finding the mean value
for embarked column we fill it by finding the most frequent value

In [8]:
#@ Applying Imputation

si_age = SimpleImputer()                                # creating an object for age column
si_embarked = SimpleImputer(strategy = 'most_frequent') # creating an object for an embarked column

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

In [9]:
# So, it fill the mean values where missing values contains

X_train_age

array([[ 4.        ],
       [29.25635271],
       [ 1.        ],
       [36.        ],
       [43.        ],
       [38.        ],
       [31.        ],
       [29.        ],
       [18.        ],
       [29.25635271],
       [39.        ],
       [39.        ],
       [26.        ],
       [20.        ],
       [29.25635271],
       [49.        ],
       [23.        ],
       [ 1.        ],
       [29.25635271],
       [29.25635271],
       [ 3.        ],
       [ 4.        ],
       [19.        ],
       [40.5       ],
       [ 1.        ],
       [29.25635271],
       [21.        ],
       [54.        ],
       [29.25635271],
       [25.        ],
       [26.        ],
       [22.        ],
       [31.        ],
       [49.        ],
       [19.        ],
       [24.        ],
       [16.        ],
       [25.        ],
       [39.        ],
       [47.        ],
       [60.        ],
       [24.        ],
       [29.25635271],
       [27.        ],
       [29.25635271],
       [38

Here, some of the columns are categorical i.e.
    
    - Sex
    - Embarked

So, we will use one-hot encoding

In [10]:
# one hot encoding 

ohe_sex = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
ohe_embarked = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)

X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [11]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
445,1,male,4.0,0,2,81.8583,S
650,3,male,,0,0,7.8958,S
172,3,female,1.0,1,1,11.1333,S
450,2,male,36.0,1,2,27.75,S
314,2,male,43.0,1,1,26.25,S


In [12]:
X_train_sex

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [0., 1.]])

In [13]:
X_train_embarked

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [14]:
#@ droping the sex, age, embarked and store remaining columns into the new dataframe
X_train_remain = X_train.drop(columns = ['Sex', 'Age', 'Embarked'])
X_test_remain = X_test.drop(columns = ['Sex', 'Age', 'Embarked'])

In [15]:
# Now, concatenate

X_train_transformed = np.concatenate((X_train_remain, X_train_age, X_train_sex, X_train_embarked), axis = 1)
X_test_transformed = np.concatenate((X_test_remain, X_test_age, X_test_sex, X_test_embarked), axis = 1)

In [16]:
X_train_transformed

array([[1., 0., 2., ..., 0., 0., 1.],
       [3., 0., 0., ..., 0., 0., 1.],
       [3., 1., 1., ..., 0., 0., 1.],
       ...,
       [3., 2., 0., ..., 0., 0., 1.],
       [1., 1., 2., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.]])

In [17]:
X_train_transformed.shape

(623, 10)

In [18]:
clf = DecisionTreeClassifier()            # creating an object for decision tree
clf.fit(X_train_transformed, y_train)

#@ Here, our model is trained and ready to predict 

DecisionTreeClassifier()

In [19]:
#@ Predicting the data
y_predict = clf.predict(X_test_transformed)         # storing the value inside the 'y_pred'
y_predict

array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0], dtype=int64)

In [20]:
#@ calculating the accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.7611940298507462

Now, if we want to predict the data from the given input by the users.

If we want to export the model for deploying we wil use 'pickle' library

In [22]:
import pickle
pickle.dump(ohe_sex, open('models/ohe_sex.pkl', 'wb'))
pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl', 'wb'))
pickle.dump(clf, open('models/clf.pkl', 'wb'))