In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression as lr
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

Building my own transformer class for the Titanic dataset

In [3]:
class Transformer(BaseEstimator, ClassifierMixin):

    def fit(self,df):
        df.Age = df.Age.fillna(df.Age.mean())
        self.df = df
        return self

    def transform(self,df):
        """
        1. Extract a copy of data frame with just the "Pclass", "Sex", and "Age" columns
        2. Replace NaN values in the "Age" column (of the copied data frame) with the mean age.
            The mean age of the training data should be calculated in fit() and used in transform() 
            (compare this step to using sklearn.preprocessing.Imputer).
        3. Convert the "Pclass" column datatype to pandas categoricals (pd.CategoricalIndex).
        4. Use pd.get_dummies() to convert the categorical columns to multiple binary columns 
            (compare this step to using sklearn.preprocessing.OneHotEncoder).
        5. Cast the result as a NumPy array and return it. 
            Ensure that your transformer matches scikit-learn conventions 
            (it inherits from the correct base classes, fit() returns self, etc.).
        """
        #df = pd.read_csv("titanic.csv")
        #extract pclass, sex, and age
        self.df = df[["Pclass","Sex","Age"]]
        #fill the NaN values in Age column with the avg
        self.fit(self.df)
        #convert Pclass to pandas categoricals
        self.df.Pclass = self.df.Pclass.astype('category')
        #one-hot encode categorical columns
        self.df = pd.get_dummies(self.df,columns=["Pclass","Sex"],drop_first=True )
        #return as a numpy array
        return np.array(self.df)

In [9]:
df = pd.read_csv("titanic.csv")
df.dropna(subset=["Survived"],inplace=True)
survived = df.Survived
df = df.drop(["Survived"],axis=1)
X_train,X_test,trainY,testY = train_test_split(df,survived,test_size=0.33,random_state=42)
#return y_train
t = Transformer()
trainX = t.transform(X_train)
testX = t.transform(X_test)
clf = lr().fit(trainX,trainY)
logp = clf.predict(testX)
lrScore = clf.score(testX,testY)
clf = rfc().fit(trainX,trainY)
rfp = clf.predict(testX)
rfScore = clf.score(testX,testY)
print(lrScore,rfScore)

0.7986111111111112 0.7569444444444444


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
print(classification_report(testY,logp))
print(classification_report(testY,rfp))

             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       254
        1.0       0.81      0.66      0.73       178

avg / total       0.80      0.80      0.79       432

             precision    recall  f1-score   support

        0.0       0.75      0.87      0.81       254
        1.0       0.76      0.60      0.67       178

avg / total       0.76      0.76      0.75       432



Let's try some Grid Searches!

In [13]:
logGrid = {'solver':['newton-cg','sag','lbfgs'],'C':[0.1,0.5,1]}
logClass = lr()
logGs = GridSearchCV(logClass,logGrid,cv=3,verbose=0)
logGs.fit(trainX,trainY)
print(logGs.best_params_)
print(logGs.best_score_*100)



{'C': 0.1, 'solver': 'sag'}
77.19498289623718




In [16]:
rfGrid = {'n_estimators':[10,20,50,100,150],'max_depth':[None,1,2,10,50]}
rfClass = rfc()
rfGs = GridSearchCV(rfClass,rfGrid,cv=3,verbose=0)
rfGs.fit(trainX,trainY)
print(rfGs.best_params_)
print(rfGs.best_score_*100)

{'max_depth': 1, 'n_estimators': 50}
79.58950969213227


In [35]:
pipeRF = Pipeline([("mm",MinMaxScaler()), 
                 ("scaler",StandardScaler()),
                 ("rf", rfc())])
param_grid = [{"scaler__with_std": [True]}, 
              {"rf__n_estimators": [100,150], "rf__max_depth": [4,5]},
              {"mm__feature_range": [(0,1)]}]
pipe_gs = GridSearchCV(pipeRF, param_grid, cv=3, verbose=1,n_jobs=-1).fit(trainX,trainY)
print(pipe_gs.best_params_,pipe_gs.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Done   3 out of  18 | elapsed:    0.0s remaining:    0.2s


{'rf__max_depth': 5, 'rf__n_estimators': 100} 0.7879133409350056


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:    0.5s finished


In [36]:
pipeLR = Pipeline([("mm",MinMaxScaler()), 
                 ("scaler",StandardScaler()),
                 ("lr", lr())])
param_grid = [{"scaler__with_std": [True]}, 
              {"lr__penalty": ['l1','l2'], "lr__C": [1,4,5]},
              {"mm__feature_range": [(0,1)]}]
pipe_gs = GridSearchCV(pipeLR, param_grid, cv=3, verbose=1,n_jobs=-1).fit(trainX,trainY)
print(pipe_gs.best_params_,pipe_gs.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
{'lr__C': 1, 'lr__penalty': 'l1'} 0.7696693272519954


[Parallel(n_jobs=-1)]: Done   9 out of  24 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.1s finished
