In [1]:
from joblib import load
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

os.chdir("../lib")
import titanic_functions as tfunc



In [2]:
os.getcwd()

'D:\\DS Projects\\titanic_api\\lib'

In [2]:
model = load("../modelling/outcomes/voting_classifier_titanic.joblib")

In [2]:
df = pd.read_csv("../dataset/titanic_data.csv")
df2 = tfunc.clean_df(df)

In [5]:
# Separate the feature columns from the target and standardise them.
# `sc` is fitted on those features as part of this step, so it can be
# reused on new observations afterwards.
sc = StandardScaler()
X = sc.fit_transform(df2.drop(columns="survived"))


In [6]:
myobs = [[2, "Barrue, Mr. Raul", "male", 33.0, 0, 0, 1111,60, "", "C"]]

In [7]:
# dummy categories: "sex", "embarked", "title", "deck"
# These lists are used below as the categories of pd.CategoricalDtype, so a
# single observation still carries every possible level for each feature.
# NOTE(review): the dummy columns shown further down follow the order of these
# lists (male/female, S/C/Q, Mr/Mrs/Miss, ...). The persisted scaler used later
# in this notebook expects 27 features — confirm that this order, and the
# absence of "Master" (which the title regex below can match), agree with the
# column layout the scaler/model were fitted on.
sex = ["male", "female"]
embarked = ["S", "C", "Q"]
title = ["Mr", "Mrs", "Miss"]
deck = ["A", "B", "C", "D", "E", "F", "G", "T", "Unknown"]

In [8]:
columns = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']

In [9]:
myobs = pd.DataFrame(myobs, columns = columns)
myobs.dtypes

pclass        int64
name         object
sex          object
age         float64
sibsp         int64
parch         int64
ticket        int64
fare          int64
cabin        object
embarked     object
dtype: object

In [11]:
# Re-wrap so the frame carries exactly the raw input columns, in this order.
myobs = pd.DataFrame(myobs, columns = columns)

# Extract titles.
# A whole-word match is tried first: the unanchored pattern returns the
# left-most substring hit, so surnames such as "Collins", "Drew" or "Donnelly"
# would produce "Col", "Dr" or "Don" instead of the passenger's actual title.
# The unanchored pattern is kept only as a fallback, so every name that
# produced a title before still produces one.
title_alternatives = r'Mrs|Mr|Master|Miss|Major|Rev|Dr|Ms|Mlle|Col|Capt|Mme|Countess|Don|Jonkheer'
whole_word_title = myobs["name"].str.extract(r'\b(' + title_alternatives + r')\b', expand=False)
substring_title = myobs["name"].str.extract('(' + title_alternatives + ')', expand=False)
myobs["title"] = whole_word_title.fillna(substring_title)
# Row-wise title normalisation implemented in titanic_functions (not shown here).
myobs["title"] = myobs.apply(tfunc.replace_titles, axis=1)
myobs = myobs.drop(columns="name")

# Extracting Deck: first character of the cabin string.
# Done element-wise because `.str` raises on an all-NaN (float) cabin column,
# which is what a single observation with a missing cabin produces.
# Empty strings and missing values both become "Unknown".
myobs["deck"] = myobs["cabin"].map(
    lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else "Unknown"
)
myobs = myobs.drop(columns="cabin")

# Fix the category levels so that one-hot encoding a single row still yields
# one column per possible level.
myobs["sex"] = myobs["sex"].astype(pd.CategoricalDtype(sex))
myobs["embarked"] = myobs["embarked"].astype(pd.CategoricalDtype(embarked))
myobs["title"] = myobs["title"].astype(pd.CategoricalDtype(title))
myobs["deck"] = myobs["deck"].astype(pd.CategoricalDtype(deck))

In [13]:
my_obs_clean = tfunc.clean_df(myobs, deck = False, title = False)

Any of these features are not in the dataframe: boat, body, home.dest, ticket
No NULL values in the dataframe


In [14]:
my_obs_clean.dtypes

pclass               int64
age                float64
sibsp                int64
parch                int64
ticket               int64
fare                 int64
family_size          int64
fare_per_person    float64
alone                int64
age*class          float64
dummy_male           uint8
dummy_female         uint8
dummy_S              uint8
dummy_C              uint8
dummy_Q              uint8
dummy_Mr             uint8
dummy_Mrs            uint8
dummy_Miss           uint8
dummy_A              uint8
dummy_B              uint8
dummy_C              uint8
dummy_D              uint8
dummy_E              uint8
dummy_F              uint8
dummy_G              uint8
dummy_T              uint8
dummy_Unknown        uint8
dtype: object

In [15]:
X_myobs = my_obs_clean

In [16]:
X_myobs

Unnamed: 0,pclass,age,sibsp,parch,ticket,fare,family_size,fare_per_person,alone,age*class,...,dummy_Miss,dummy_A,dummy_B,dummy_C,dummy_D,dummy_E,dummy_F,dummy_G,dummy_T,dummy_Unknown
0,2,33.0,0,0,1111,60,1,60.0,1,66.0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
# NOTE(review): X_myobs is passed to the model unscaled, although `sc` is
# fitted on the training features earlier in this notebook and a later cell
# applies scaler.transform before predicting — confirm whether the saved model
# expects standardised features.
model.predict_proba(X_myobs)

array([[0.25433506, 0.74566494]])

In [21]:
model.predict(X_myobs)

array([1], dtype=int64)

# Wrap up

In [2]:
def process_input(input_data, model=False):
    """Build the feature frame for one or more raw passenger observations.

    Parameters
    ----------
    input_data : list of rows or DataFrame
        One entry per passenger holding ``pclass, name, sex, age, sibsp, parch,
        ticket, fare, cabin, embarked`` (in that order when plain lists are
        passed).
    model : optional
        Currently unused: prediction is not performed here. The parameter is
        kept so existing calls keep working.

    Returns
    -------
    The object returned by ``tfunc.clean_df(X, deck=False, title=False)`` for
    the pre-processed frame.

    Notes
    -----
    NOTE(review): earlier in this notebook ``clean_df`` reported that it could
    not drop ``boat, body, home.dest, ticket`` for a frame built this way, and
    ``ticket`` was still present in the result. Confirm that the returned
    columns, and the order of the category lists below, match the layout the
    scaler/model were fitted on.
    """
    # Category levels for the one-hot encoded features. Their order becomes the
    # order of the dummy columns.
    # NOTE(review): "Master" can be matched by the title pattern below but is
    # not a level here — verify against tfunc.replace_titles / training data.
    sex = ["male", "female"]
    embarked = ["S", "C", "Q"]
    title = ["Mr", "Mrs", "Miss"]
    deck = ["A", "B", "C", "D", "E", "F", "G", "T", "Unknown"]

    # Raw input columns, in the order expected for list-style input.
    columns = ['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked']

    X = pd.DataFrame(input_data, columns = columns)

    ### Data Processing
    # Extract titles.
    # A whole-word match is tried first: the unanchored pattern returns the
    # left-most substring hit, so surnames such as "Collins", "Drew" or
    # "Donnelly" would produce "Col", "Dr" or "Don" instead of the real title.
    # The unanchored pattern is kept only as a fallback, so every name that
    # produced a title before still produces one.
    title_alternatives = r'Mrs|Mr|Master|Miss|Major|Rev|Dr|Ms|Mlle|Col|Capt|Mme|Countess|Don|Jonkheer'
    whole_word_title = X["name"].str.extract(r'\b(' + title_alternatives + r')\b', expand=False)
    substring_title = X["name"].str.extract('(' + title_alternatives + ')', expand=False)
    X["title"] = whole_word_title.fillna(substring_title)
    # Row-wise title normalisation implemented in titanic_functions.
    X["title"] = X.apply(tfunc.replace_titles, axis=1)
    X = X.drop(columns="name")

    # Extracting Deck: first character of the cabin string.
    # Done element-wise because `.str` raises on an all-NaN (float) cabin
    # column, which is what a single observation with a missing cabin gives.
    # Empty strings and missing values both become "Unknown".
    X["deck"] = X["cabin"].map(
        lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else "Unknown"
    )
    X = X.drop(columns="cabin")

    # Fix the category levels. Otherwise, when the input is a single
    # observation, one-hot encoding would only create columns for the levels
    # present in that row.
    X["sex"] = X["sex"].astype(pd.CategoricalDtype(sex))
    X["embarked"] = X["embarked"].astype(pd.CategoricalDtype(embarked))
    X["title"] = X["title"].astype(pd.CategoricalDtype(title))
    X["deck"] = X["deck"].astype(pd.CategoricalDtype(deck))

    # Create the remaining features with the same function used for training.
    # TODO: a dedicated single-observation variant would be preferable, since
    # statistics such as the mean age are not available at prediction time.
    X = tfunc.clean_df(X, deck = False, title = False)

    return X

##### Test func

In [3]:
from joblib import load
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

os.chdir("../lib")
import titanic_functions as tfunc

scaler = load("../modelling/outcomes/scaler.joblib")
model = load("../modelling/outcomes/voting_classifier_titanic.joblib")

In [14]:
df2 = pd.read_csv("../dataset/titanic_data.csv")

In [31]:
#obs = [[2, "Barrue, Mr. Raul", "male", 33, 0, 0, 1111,60, "", "C"]] # Me
#obs = [[1, "Allen, Miss. Elisabeth Walton","female", 29, 0, 0, 24160, 211.3375, "B5", "S"]] # 1
#obs = [[1, "Allison, Master. Hudson Trevor", "male", 0.92, 1, 2, 113781, 151.55, "C22 C26", "S"]] # 0
#obs = [[1, "Allison, Miss. Helen Loraine", "female", 2, 1, 2, 113781, 151.55, "C22 C26", "S"]]# 1
#obs = [[3, "Betros, Mr. Tannous", "male",20, 0, 0, 2648, 4.0125 ,"","C"]] # 0
#obs = [[3, "Bing, Mr. Lee", "male",32, 0, 0, 1601, 56.4958 ,"","C"]] # 0
#obs = pd.read_csv("../dataset/titanic_data.csv").drop(["boat", "body", "home.dest"], axis = 1)
obs = pd.DataFrame([df2.drop(["boat", "body", "home.dest", "ticket"], axis=1).iloc[8]])

In [32]:
obs

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,cabin,embarked
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,51.4792,C101,S


In [33]:
x = obs.drop(["survived"], axis = 1)
pred = obs["survived"]

In [34]:
print(x.values)
print()
print(pred.values)

[[1 'Appleton, Mrs. Edward Dale (Charlotte Lamson)' 'female' 53.0 2 0
  51.4792 'C101' 'S']]

[1]


In [59]:
X = tfunc.clean_df(obs)

KeyError: 'name'

In [58]:
X.shape

(1, 13)

In [48]:
# Clean the observation, standardise it with the persisted scaler, then ask
# the model for a class label and the class probabilities.
# (Alternative tried here: tfunc.clean_df_one_observation(obs).)
# The recorded run of this cell failed inside scaler.transform with a shape
# mismatch: the cleaned single-row frame had 13 columns, the scaler wanted 27.
X = tfunc.clean_df(obs)
X = scaler.transform(X.values)

y_pred = model.predict(X)
y_pred_proba = model.predict_proba(X)

print(
    "y_pred: {}".format(y_pred[0]),
    "y_pred_proba: {}".format(y_pred_proba),
    sep="\n",
)

ValueError: operands could not be broadcast together with shapes (1,13) (27,) (1,13) 

In [25]:
X

array([[-1.55011801,  0.73995717, -0.47953739, -0.44540733, -0.64199223,
        -0.55890642, -0.57207875,  0.81155988, -0.79146312,  1.34789107,
        -1.34789107,  1.95978079, -0.32231217, -1.52502451,  4.55887413,
        -0.49712916, -1.21080045, -0.43876901,  7.64258405, -0.22504019,
        -0.27837711, -0.19099472, -0.17995962, -0.12778773, -0.06196972,
        -0.02767123, -1.86030934]])

In [10]:
a = np.array([[-1.55011801,  0.73995717, -0.47953739, -0.44540733, -0.64199223,
        -0.55890642, -0.57207875,  0.81155988, -0.79146312, -0.74189971,
         0.74189971, -0.51026115, -0.32231217,  0.65572717, -0.2193524 ,
        -0.49712916,  0.82589992, -0.43876901,  7.64258405, -0.22504019,
        -0.27837711, -0.19099472, -0.17995962, -0.12778773, -0.06196972,
        -0.02767123, -1.86030934]])

In [11]:
model.predict(a)

array([0], dtype=int64)

In [12]:
a==X

array([[False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False]])

In [18]:
a[0][0] == X[0][0]

False

In [19]:
a[0][0]

-1.55011801

In [20]:
X[0][0]

-1.5501180117967508

In [23]:
np.around(X,8) == a

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False, False,  True, False, False,  True, False,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True]])