In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import label_binarize
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import MultiLabelBinarizer
import numpy.testing as test
import re
import warnings 
from sklearn.preprocessing import FunctionTransformer, StandardScaler, Normalizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Silence every FutureWarning for the rest of the session.
# NOTE(review): presumably meant to hide scikit-learn deprecation notices; this also
# hides warnings about API changes that affect the code below — confirm it is still wanted.
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that pulls column(s) out of ``X`` as a NumPy array.

    Parameters
    ----------
    columns : label or list of labels
        Used unchanged as the key in ``X[columns]``. When ``X`` is a pandas
        DataFrame, a single label yields a 1-D array and a list of labels
        yields a 2-D array.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the selector can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return ``X[self.columns].values``."""
        selected = X[self.columns]
        return selected.values

In [6]:

class NameEncoder(BaseEstimator, TransformerMixin):
    """Encode the title embedded in each passenger name as an integer code.

    The title is the text between the first and the second ``,`` / ``.``
    separator of a name, stripped of whitespace
    (``'Braund, Mr. Owen'`` -> ``'Mr'``).  Titles listed in ``TITLE_GROUPS``
    map to the code of their group; every other title, and every value from
    which no title can be extracted, maps to ``0``.
    """

    # Integer code -> titles that share that code.
    TITLE_GROUPS = {
        1: ['Mr'],
        2: ['Miss', 'Mlle'],
        3: ['Mrs', 'Mme'],
        4: ['Dr'],
        5: ['Col', 'Major', 'Capt'],
        6: ['Countess', 'Sir', 'Lady', 'Don'],
        7: ['Master', 'Jonkheer'],
    }

    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    _SEPARATORS = re.compile(r',|\.')

    def fit(self, X, y=None):
        """Learn nothing; return ``self``."""
        return self

    def transform(self, X):
        """Return an ``(n, 1)`` integer array of title codes.

        Parameters
        ----------
        X : array-like of str
            Either a 1-D sequence of names or a 2-D array whose first column
            holds the names (other columns are ignored).
        """
        names = np.atleast_1d(np.asarray(X, dtype=object))
        if names.ndim == 2:
            # Accept column vectors such as the output of a column selector.
            names = names[:, 0]
        elif names.ndim > 2:
            raise ValueError(
                "NameEncoder expects 1-D or 2-D input, got %d dimensions" % names.ndim)

        # Flatten code -> titles into title -> code for a single lookup per name.
        code_by_title = {
            title: code
            for code, titles in self.TITLE_GROUPS.items()
            for title in titles
        }
        codes = [code_by_title.get(self._extract_title(name), 0) for name in names]
        return np.array(codes, dtype=int).reshape(-1, 1)

    @classmethod
    def _extract_title(cls, name):
        """Return the second separator-delimited piece of ``name``, or ``''``."""
        if not isinstance(name, str):
            # e.g. NaN for a missing name: treated as "no title".
            return ''
        pieces = cls._SEPARATORS.split(name)
        return pieces[1].strip() if len(pieces) > 1 else ''

    @staticmethod
    def test():
        """Smoke test on a column vector of three names; prints the codes."""
        nameEncoder = NameEncoder()
        res = nameEncoder.fit_transform(
            np.array([['p, Mr Jimmy'], ['x, Mrs. Smith'], ['y, Mr. Smith']]))
        # 'Mr Jimmy' (no period after the title) is not a known title -> 0.
        np.testing.assert_array_equal(res, [[0], [3], [1]])
        print(res)
       
# ne = NameEncoder()
# x = DataFrameSelector('Name').fit_transform(df)
# f = ne.fit_transform(x)
# # print(f[0:20], f.shape)
# # print(df['Name'][0:20])

# mu = make_union(
#     make_pipeline(DataFrameSelector('Name'), NameEncoder()),
#     make_pipeline(DataFrameSelector(['Age']), StandardScaler()) ) # union

# x = mu.fit_transform(df)

# np.unique(x[:,0])


In [52]:
class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ages with the median age of rows sharing the same title code.

    Both ``fit`` and ``transform`` expect a 2-D numeric array whose first
    column holds the encoded title (codes 0-7) and whose second column holds
    the age, with ``NaN`` marking a missing age.
    """

    # Title codes for which a median is learned.
    TITLE_CODES = range(8)

    # Fill values that replace the learned median for specific title codes.
    # NOTE(review): in the pipeline defined in this notebook the age column is
    # passed through StandardScaler before it reaches this imputer, so this 9
    # is applied on the standardized scale — confirm that is intended.
    FIXED_FILL = {2: 9}

    def fit(self, X, y=None):
        """Compute the per-title median age (ignoring NaN) and store it in
        ``self.titleAgeDictionary``."""
        X = np.asarray(X)
        self.titleAgeDictionary = {}
        for key in self.TITLE_CODES:
            try:
                median_age = np.nanmedian(X[X[:, 0] == key, 1])
            except (IndexError, TypeError, ValueError) as err:
                # Malformed input (e.g. not 2-D / not numeric): keep going as
                # before, but say why instead of swallowing every exception.
                warnings.warn(
                    "AgeImputer could not compute the median for title code %r: %s"
                    % (key, err),
                    RuntimeWarning,
                )
                continue
            self.titleAgeDictionary[key] = self.FIXED_FILL.get(key, median_age)
        print(self.titleAgeDictionary)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with NaN ages replaced by the fitted fill values."""
        # Work on a copy so the caller's array is left untouched.
        X = np.array(X)
        missing = np.isnan(X[:, 1])
        for key, fill_value in self.titleAgeDictionary.items():
            X[(X[:, 0] == key) & missing, 1] = fill_value
        return X

    def test(self):
        """Smoke test: the missing age of title 1 becomes that title's median (12)."""
        X = np.array([[1, 12],
                      [2, 12],
                      [1, np.nan]])
        res = self.fit_transform(X)
        np.testing.assert_array_equal(res, [[1, 12], [2, 12], [1, 12]])
        # The input array must not have been modified.
        assert np.isnan(X[2, 1])


In [83]:
# Read the training data from 'train.csv' (path relative to the working directory).
# The statements below take the 'Survived' column from this frame as the target.
df = pd.read_csv('train.csv')

def hasCabin(x):
    """Flag each row of a 2-D array: 0 when its value is NaN, 1 otherwise.

    The NaN check relies on NaN being the only value that differs from
    itself. Returns a 1-D array with one flag per row of ``x``.
    """
    def _flag(row):
        # `row != row` is truthy only for NaN.
        if (row != row):
            return 0
        return 1

    return np.apply_along_axis(_flag, 1, x)

# Target vector, taken before the label column is removed from the features.
y = df['Survived'].values

# Drop the label so it cannot leak into the features (rebinding instead of
# mutating the frame in place).
df = df.drop(labels=['Survived'], axis=1)


def _log2_positive(values):
    """Return log2 of the positive entries and 0.0 for everything else.

    ``np.log2(..., where=mask)`` without ``out=`` leaves the masked-out
    positions uninitialized, so an explicit zero-filled output is supplied.
    """
    values = np.asarray(values, dtype=float)
    return np.log2(values, where=values > 0, out=np.zeros_like(values))


def _family_size(counts):
    """Sum of the two columns of ``counts`` as an (n, 1) column."""
    return (counts[:, 0] + counts[:, 1]).reshape(-1, 1)


def _travels_alone(counts):
    """Boolean (n, 1) column: True where both columns of ``counts`` sum to 0."""
    return (counts[:, 0] + counts[:, 1] == 0).reshape(-1, 1)


def _is_child(age):
    """Element-wise ``age < 14`` (NaN compares as False)."""
    return age < 14


# NOTE(review): the (columns, transformer) tuple order and `sparse=False` follow
# an older scikit-learn API; recent releases expect (transformer, columns) and
# `sparse_output=False` — confirm the target version before upgrading.
pipeline = make_column_transformer(
    # Title code + scaled age -> impute age per title -> one-hot the title
    # code (column 0) and pass the age through.
    (['Name', 'Age'], make_pipeline(
        make_union(
            make_pipeline(DataFrameSelector('Name'), NameEncoder()),
            make_pipeline(DataFrameSelector(['Age']), StandardScaler()),
        ),
        AgeImputer(),
        make_column_transformer(([0], OneHotEncoder(sparse=False)), remainder='passthrough'),
    )),
    (['Sex', 'Pclass'], OneHotEncoder(sparse=False)),
    # Missing embarkation ports are filled with 'S' before encoding.
    (['Embarked'], make_pipeline(SimpleImputer(strategy='constant', fill_value='S'),
                                 OneHotEncoder(sparse=False))),
    (['Fare'], make_pipeline(SimpleImputer(strategy='mean'),
                             FunctionTransformer(_log2_positive),
                             StandardScaler())),
#     (['Cabin'], make_pipeline( FunctionTransformer(hasCabin, validate=False), OneHotEncoder(sparse=False)) ),
    (['SibSp', 'Parch'], make_pipeline(DataFrameSelector(['SibSp', 'Parch']),
                                       FunctionTransformer(_family_size))),
    (['SibSp', 'Parch'], make_pipeline(DataFrameSelector(['SibSp', 'Parch']),
                                       FunctionTransformer(_travels_alone))),
    (['Age'], make_pipeline(FunctionTransformer(_is_child, validate=False),
                            OneHotEncoder(sparse=False))),
    (['SibSp', 'Parch'], 'passthrough'),
    (['PassengerId', 'Cabin'], 'drop'),
)
x = pipeline.fit_transform(df)

print(x.shape)
x

{0: 0.5373870054192047, 1: 0.020727176659648322, 2: 9, 3: 0.36516706249935255, 4: 1.1573787999306722, 5: 1.8118145830261103, 6: 1.2607107656825836, 7: -1.7703602297068137}
(891, 24)


array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 1., 2.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [82]:

# Logistic regression with default hyper-parameters.
# (A GridSearchCV over C and a cross_val_score run were tried here earlier and
# are no longer part of this cell.)
lr = LogisticRegression()

# Hold-out check: the first 80% of the rows train the model and the remaining
# 20% evaluate it. The split is positional; nothing is shuffled.
num_train = int(x.shape[0] * 0.8)
x_train, x_test = x[:num_train, :], x[num_train:, :]
y_train, y_test = y[:num_train], y[num_train:]

print('.... fitting')
lr.fit(x_train, y_train)
print('.... predicting')

y_predict = lr.predict(x_test)

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Score the hold-out predictions.
prec_score = precision_score(y_test, y_predict)
rec_score = recall_score(y_test, y_predict)
acc_score = accuracy_score(y_test, y_predict)
f1_sc = f1_score(y_test, y_predict)

for label, value in (
    ("Accuracy Score=", acc_score),
    ("Precision Score=", prec_score),
    ("Recall Score=", rec_score),
    ("F1 Score=", f1_sc),
):
    print(label, value)

.... fitting
.... predicting
Accuracy Score= 0.8715083798882681
Precision Score= 0.8253968253968254
Recall Score= 0.8125
F1 Score= 0.8188976377952756


In [80]:
# Score the Kaggle test set with the already-fitted preprocessing pipeline and
# model. Note that `df` and `y_predict` are rebound here.
df = pd.read_csv('test.csv')
x1 = pipeline.transform(df)
y_predict = lr.predict(x1)

# Submission frame: passenger id next to the predicted label.
kaggle = df[['PassengerId']].assign(Survived=y_predict)

# Write the submission file without the index column.
kaggle.to_csv('titanic_pred.csv', index=False)


In [None]:
import re
# Scratch: select the 'Name' column with a list key, which (for a DataFrame)
# gives a 2-D array with one column; the code below reads it via x1[:,0].
# NOTE(review): this rebinds `x1`, which the submission cell also uses.
d = DataFrameSelector(['Name'])
x1 = d.fit_transform(df)
# x1.reshape(-1,1)
# x1=np.array(['Ala', 'Bala'])

def f(p):
    """Return the title part of a name such as ``'Braund, Mr. Owen'`` -> ``'Mr'``.

    The name is split on every ``,`` and ``.``; the second piece, stripped of
    surrounding whitespace, is returned. Raises ``IndexError`` when ``p``
    contains neither separator.
    """
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    return re.split(r',|\.', p)[1].strip()
# Title of every name in the first column of x1, using the helper defined
# above instead of repeating its regex inline.
# NOTE(review): this rebinds `x`, which the modelling cell uses as the feature matrix.
x = np.array([f(p) for p in x1[:, 0]])
x
# x = df['Name'].str.extract(r',(.+)\.')
# x
# nameDict={'Mr' : ['Mr'], 
#           'Miss' : ['Miss', 'Mlle'],
#           'Mrs' : ['Mrs', 'Mme'],
#           'Dr' : ['Dr'],
#           'Col' : ['Col', 'Major', 'Capt'],
#           'Sir' : ['Countess', 'Sir', 'Lady', 'Don'],
#           'Master' : ['Master', 'Jonkheer']
#          }


# for f in nameDict:
#     print(f)


In [None]:
# Scratch: try LabelEncoder on a handful of title-like strings.
X= np.array(['Mr', 'Miss', 'Moss', 'Mr', 'Pula'])

# NOTE(review): imported mid-notebook; the other imports live in the first cell.
from sklearn.preprocessing import LabelEncoder

l = LabelEncoder()

# The cell's displayed value is the array of integer labels for X.
l.fit_transform(X)

In [None]:
# Scratch: median of 'Age' over the rows with Parch > 0.
# NOTE(review): the first assignment is dead — it is overwritten by the next line
# before being read. Both lines also rebind `x`, which the modelling cell uses
# as the feature matrix.
x = df[(df['Name'].str.contains('Master')) & (df['Parch']>0)]
x = df[(df['Parch']>0)]
x['Age'].median()