In [2]:
!pip install scikit-learn xgboost



In [3]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer , make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import warnings
warnings. filterwarnings('ignore')

In [4]:
# Load the labelled training data, then separate the target column from the features.
df = pd.read_csv('train.csv')
y = df['Survived']
X = df.drop(columns=['Survived'])

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
class PrepProcesor(BaseEstimator, TransformerMixin):
    """Feature-engineering step applied to the raw passenger DataFrame.

    ``fit`` learns the fill value for missing ``Age`` entries; ``transform``
    imputes ``Age``, derives ``CabinClass`` / ``CabinNumber`` from ``Cabin``,
    fills missing ``Embarked`` values and drops the identifying columns.
    """

    def fit(self, X, y=None):
        """Fit the age imputer on the ``Age`` column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least an ``Age`` column.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        PrepProcesor
            The fitted transformer (``self``).
        """
        # SimpleImputer is used with its default settings.
        self.ageImputer = SimpleImputer()
        self.ageImputer.fit(X[['Age']])
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``Age``, ``Cabin`` and ``Embarked`` columns.
            ``PassengerId``, ``Name`` and ``Ticket`` are dropped when present.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame with ``Age`` imputed, ``Embarked`` gaps filled with
            ``'M'``, the added columns ``CabinClass`` (letters of the cabin
            code, ``'M'`` when the cabin is missing) and ``CabinNumber``
            (digits of the cabin code as float, ``0`` when there are none),
            and without ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.
        """
        # Work on a copy: assigning columns on the caller's frame would silently
        # alter the raw data (e.g. X_train / test) and raise chained-assignment
        # warnings when that frame is a slice of another one.
        X = X.copy()

        # The imputer returns a 2-D result; flatten it to a single column.
        X['Age'] = np.asarray(self.ageImputer.transform(X[['Age']])).ravel()

        # 'M' marks a missing cabin. The regexes already remove spaces, so no
        # separate whitespace-stripping pass is needed.
        cabin = X['Cabin'].fillna('M').astype(str)
        X['CabinClass'] = cabin.str.replace(r'[^a-zA-Z]', '', regex=True)

        # Cabins without digits (including the 'M' placeholder) become 0. The
        # column is cast to float so it is not left as an object column mixing
        # digit strings with the integer 0.
        cabin_digits = cabin.str.replace(r'[^0-9]', '', regex=True)
        X['CabinNumber'] = cabin_digits.replace('', '0').astype('float64')

        X['Embarked'] = X['Embarked'].fillna('M')

        # The identifier columns are not used as features; tolerate their absence
        # so frames that never carried them can still be transformed.
        return X.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [7]:
# Standalone instance of the custom feature-engineering step.
preproc = PrepProcesor()

# Numeric columns are standardised; categorical columns are one-hot encoded
# with handle_unknown='ignore'.
numeric_pipeline = Pipeline(steps=[('Scaler', StandardScaler())])
categorical_pipeline = Pipeline(steps=[('OneHot', OneHotEncoder(handle_unknown='ignore'))])

# Route each group of columns to its own sub-pipeline.
transformer = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'CabinNumber']),
        ('cat', categorical_pipeline, ['Sex', 'Embarked', 'CabinClass']),
    ]
)

In [16]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

In [17]:
X_train

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
165,166,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9.0,0,2,363291,20.5250,,S
541,542,3,"Andersson, Miss. Ingeborg Constanzia",female,9.0,4,2,347082,31.2750,,S
625,626,1,"Sutton, Mr. Frederick",male,61.0,0,0,36963,32.3208,D50,S
388,389,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q
76,77,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S
...,...,...,...,...,...,...,...,...,...,...,...
106,107,3,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,343120,7.6500,,S
270,271,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0000,,S
860,861,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
435,436,1,"Carter, Miss. Lucile Polk",female,14.0,1,2,113760,120.0000,B96 B98,S


In [18]:
# End-to-end pipeline: custom feature engineering -> column-wise encoding/scaling -> classifier.
mlpipe = Pipeline(
    steps=[
        ('InitialPreproc', PrepProcesor()),
        ('Transformer', transformer),
        ('xgb', XGBClassifier()),
    ]
)

In [19]:
mlpipe.fit(X_train,y_train)

In [20]:
y_hat = mlpipe.predict(X_test)

In [21]:
y_hat.shape

(90,)

In [22]:
y_test

Unnamed: 0,Survived
709,1
439,0
840,0
720,1
39,1
...,...
493,0
215,1
309,1
822,0


In [23]:
precision_score(y_test, y_hat)

0.7380952380952381

In [24]:
import joblib

In [25]:
joblib.dump(mlpipe, 'xgbpipe.joblib')

['xgbpipe.joblib']

In [26]:
model = joblib.load('xgbpipe.joblib')

In [27]:
test = pd.read_csv('test.csv')

In [28]:
test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [29]:
yhat = model.predict(test)
yhat

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,