# Processing without pipeline 


In [109]:
# Processing without pipeline 

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier

import pickle

def read_csv(file_name):
    """Load a CSV file and discard the columns that are not used as features.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the ``PassengerId``, ``Name``, ``Ticket``
        and ``Cabin`` columns.

    Raises
    ------
    KeyError
        If any of the four dropped columns is absent from the file.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    frame = pd.read_csv(file_name)
    return frame.drop(columns=unused_columns)

def split_data(data):
    """Split the table into train/test features and ``Survived`` labels.

    30% of the rows are held out for testing and ``random_state=1`` keeps
    the split reproducible. The first two training rows are printed as a
    quick sanity check.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains a ``Survived`` column plus the feature columns.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    from sklearn.model_selection import train_test_split

    target = data['Survived']
    features = data.drop(columns=['Survived'])

    pieces = train_test_split(features, target, test_size=0.3, random_state=1)
    X_train, X_test, Y_train, Y_test = pieces

    print(X_train.head(2))
    return X_train, X_test, Y_train, Y_test


# Applying SimpleImputer to the Age and Embarked columns
def simple_imputer(X_train,X_test):
    """Fill missing ``Age`` and ``Embarked`` values.

    ``Age`` is imputed with the mean (the ``SimpleImputer`` default) and
    ``Embarked`` with the most frequent value. Both imputers are fitted on
    the training split only and then applied unchanged to the test split.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables that contain ``Age`` and ``Embarked`` columns.

    Returns
    -------
    tuple
        ``(si_Age, si_Embarked, X_train_Age, X_train_Embarked,
        X_test_Age, X_test_Embarked)`` - the two fitted imputers followed
        by the imputed single-column arrays.
    """
    si_Age = SimpleImputer()  # default strategy is 'mean'
    si_Embarked = SimpleImputer(strategy='most_frequent')

    X_train_Age = si_Age.fit_transform(X_train[['Age']])
    X_train_Embarked = si_Embarked.fit_transform(X_train[['Embarked']])

    # Only transform the test split: calling fit_transform here would
    # overwrite the training statistics with test-set ones (data leakage),
    # and the returned imputers are the ones that get pickled for later
    # predictions.
    X_test_Age = si_Age.transform(X_test[['Age']])
    X_test_Embarked = si_Embarked.transform(X_test[['Embarked']])
    return si_Age,si_Embarked,X_train_Age,X_train_Embarked,X_test_Age,X_test_Embarked
    
    
# Applying One hot encoding on Sex and Embarked columns
def _dense_one_hot_encoder():
    """Build a dense-output ``OneHotEncoder`` that raises on unknown categories."""
    try:
        # Newer scikit-learn spells the dense-output switch ``sparse_output``.
        return OneHotEncoder(sparse_output=False, handle_unknown='error')
    except TypeError:
        # Older scikit-learn only knows the original ``sparse`` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='error')


def one_hot_encoder(X_train,X_test,X_train_Embarked,X_test_Embarked):
    """One-hot encode the ``Sex`` and (already imputed) ``Embarked`` columns.

    Both encoders are fitted on the training data only and then applied to
    the test data, so train and test share the same category-to-column
    mapping.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables that contain a ``Sex`` column.
    X_train_Embarked, X_test_Embarked : array-like
        Single-column ``Embarked`` values with missing entries filled in.

    Returns
    -------
    tuple
        ``(ohe_Sex, ohe_Embarked, X_train_Sex, X_train_Embarked,
        X_test_Sex, X_test_Embarked)`` - the two fitted encoders followed
        by the encoded arrays.

    Raises
    ------
    ValueError
        If the test data holds a category that was not seen during
        fitting (``handle_unknown='error'``).
    """
    ohe_Sex = _dense_one_hot_encoder()
    ohe_Embarked = _dense_one_hot_encoder()

    X_train_Sex = ohe_Sex.fit_transform(X_train[['Sex']])
    X_train_Embarked = ohe_Embarked.fit_transform(X_train_Embarked)

    # Only transform the test split: re-fitting here would rebuild the
    # category mapping from test data, which can change the number or
    # order of output columns and replaces the encoders that get pickled.
    X_test_Sex = ohe_Sex.transform(X_test[['Sex']])
    X_test_Embarked = ohe_Embarked.transform(X_test_Embarked)
    return ohe_Sex,ohe_Embarked,X_train_Sex,X_train_Embarked,X_test_Sex,X_test_Embarked
    
# Extract the remaining features, then concatenate them with the processed Age, Sex and Embarked columns
def _frame_from_module_scope(name):
    """Return the module-level variable ``name`` or raise ``NameError``."""
    try:
        return globals()[name]
    except KeyError:
        raise NameError(f"name '{name}' is not defined") from None


def extract_concat(X_train_Age,X_train_Sex,X_train_Embarked,X_test_Age,X_test_Sex,X_test_Embarked,
                   X_train=None,X_test=None):
    """Assemble the final feature matrices for training and testing.

    The untouched columns (everything except ``Age``, ``Embarked`` and
    ``Sex``) come first, followed by the processed ``Age``, ``Sex`` and
    ``Embarked`` arrays, in that order.

    Parameters
    ----------
    X_train_Age, X_train_Sex, X_train_Embarked : array-like
        Processed training columns.
    X_test_Age, X_test_Sex, X_test_Embarked : array-like
        Processed test columns.
    X_train, X_test : pandas.DataFrame, optional
        Original feature tables. When omitted, the module-level
        ``X_train`` / ``X_test`` variables are used, which keeps the
        original six-argument call working.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_new, X_test_new)``.

    Raises
    ------
    NameError
        If a table is omitted and no module-level variable of that name
        exists.
    """
    # Prefer explicit arguments; fall back to the globals only for
    # callers that still rely on the old implicit behaviour.
    if X_train is None:
        X_train = _frame_from_module_scope('X_train')
    if X_test is None:
        X_test = _frame_from_module_scope('X_test')

    processed_columns = ['Age', 'Embarked', 'Sex']
    X_train_rem = X_train.drop(columns=processed_columns)
    X_test_rem = X_test.drop(columns=processed_columns)

    # Column order must stay identical for train and test.
    X_train_new = np.concatenate((X_train_rem,X_train_Age,X_train_Sex,X_train_Embarked), axis=1)
    X_test_new = np.concatenate((X_test_rem,X_test_Age,X_test_Sex,X_test_Embarked), axis=1)
    return X_train_new, X_test_new

def decision_model(Y_train,Y_test,X_train_new,X_test_new):
    """Train a decision tree and print its accuracy on the test split.

    Parameters
    ----------
    Y_train, Y_test : array-like
        Labels for the training and test rows.
    X_train_new, X_test_new : array-like
        Feature matrices for the training and test rows.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    from sklearn.metrics import accuracy_score

    dtc = DecisionTreeClassifier().fit(X_train_new, Y_train)
    test_accuracy = accuracy_score(Y_test, dtc.predict(X_test_new))
    print("Accuracy is : ", test_accuracy)
    return dtc
    

def saving_models(si_Age,si_Embarked,ohe_Sex,ohe_Embarked,dtc):
    """Pickle the fitted preprocessing objects and the model under ``models/``.

    Parameters
    ----------
    si_Age, si_Embarked : object
        Imputers, written to ``models/si_Age.pkl`` and
        ``models/si_Embarked.pkl``.
    ohe_Sex, ohe_Embarked : object
        Encoders, written to ``models/ohe_Sex.pkl`` and
        ``models/ohe_Embarked.pkl``.
    dtc : object
        Classifier, written to ``models/dtc.pkl``.
    """
    import os

    # Create the output directory on first use instead of failing.
    os.makedirs('models', exist_ok=True)

    artifacts = (
        ('models/si_Age.pkl', si_Age),
        ('models/si_Embarked.pkl', si_Embarked),
        ('models/ohe_Sex.pkl', ohe_Sex),
        ('models/ohe_Embarked.pkl', ohe_Embarked),
        ('models/dtc.pkl', dtc),
    )
    for path, obj in artifacts:
        # The context manager flushes and closes each file even on error.
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle)
    print("Successfully saved")


    
# Load the raw data and hold out a test split.
# X_train / X_test must stay module-level names: extract_concat reads them
# from module scope when called with only the processed columns.
data = read_csv('train.csv')
X_train, X_test, Y_train, Y_test = split_data(data)

# Fill in missing Age / Embarked values.
(
    si_Age,
    si_Embarked,
    X_train_Age,
    X_train_Embarked,
    X_test_Age,
    X_test_Embarked,
) = simple_imputer(X_train, X_test)

# One-hot encode Sex and the imputed Embarked column.
(
    ohe_Sex,
    ohe_Embarked,
    X_train_Sex,
    X_train_Embarked,
    X_test_Sex,
    X_test_Embarked,
) = one_hot_encoder(X_train, X_test, X_train_Embarked, X_test_Embarked)

# Build the final feature matrices.
X_train_new, X_test_new = extract_concat(
    X_train_Age,
    X_train_Sex,
    X_train_Embarked,
    X_test_Age,
    X_test_Sex,
    X_test_Embarked,
)

# Train, report accuracy, and persist every fitted object.
dtc = decision_model(Y_train, Y_test, X_train_new, X_test_new)
saving_models(si_Age, si_Embarked, ohe_Sex, ohe_Embarked, dtc)

     Pclass     Sex   Age  SibSp  Parch     Fare Embarked
114       3  female  17.0      0      0  14.4583        C
874       2  female  28.0      1      0  24.0000        C
Accuracy is :  0.7350746268656716
Successfully saved


# predict without pipeline

In [110]:
# predicting without pipeline
import pickle
import numpy as np

def load_pickle_and_check_result(test_input):
    """Load the saved preprocessing objects and model, then print predictions.

    The same steps used for training are replayed: impute ``Age`` and
    ``Embarked``, one-hot encode ``Sex`` and ``Embarked``, then join the
    columns in the training order (Pclass, SibSp, Parch, Fare, Age, Sex,
    Embarked).

    Parameters
    ----------
    test_input : numpy.ndarray
        2-D object array with one row per passenger and the columns
        Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, in that order.
        Missing ``Age`` / ``Embarked`` entries are expected as ``np.nan``.

    Notes
    -----
    Unpickling can execute arbitrary code. Only load ``models/*.pkl``
    files that were written by this project.
    """
    def _load(path):
        # Context manager so the file handle is closed after reading.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    si_Age = _load('models/si_Age.pkl')
    si_Embarked = _load('models/si_Embarked.pkl')
    ohe_Sex = _load('models/ohe_Sex.pkl')
    ohe_Embarked = _load('models/ohe_Embarked.pkl')
    dtc = _load('models/dtc.pkl')

    # Apply the imputers that were loaded; previously they were unused, so
    # a missing Age or Embarked reached the encoder/model unfilled.
    # ``[:, [i]]`` keeps the column 2-D for any number of rows.
    test_input_Age = si_Age.transform(test_input[:, [2]].astype(float))
    test_input_Embarked = si_Embarked.transform(test_input[:, [-1]])

    test_input_Sex = ohe_Sex.transform(test_input[:, [1]])
    test_input_Embarked = ohe_Embarked.transform(test_input_Embarked)

    # Pclass, SibSp, Parch and Fare pass through unchanged.
    test_input_rem = test_input[:, [0, 3, 4, 5]].astype(float)
    test_input_transformed = np.concatenate(
        (test_input_rem, test_input_Age, test_input_Sex, test_input_Embarked),
        axis=1,
    )

    print("1 : means survive, 0 : means not survive")
    print(dtc.predict(test_input_transformed))
    
    
# Example passenger supplied by the user, with the columns in this order:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
test_input = np.array([[1, 'male', 31.0, 1, 0, 20.5, 'S']], dtype=object)
load_pickle_and_check_result(test_input)



1 : means survive, 0 : means not survive
[1]
