In [52]:
import numpy as np
import sklearn
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import quantile_transform
pd.options.mode.chained_assignment = None  # default='warn'

# Problem 1.
Writing custom scikit-learn transformers is a convenient way to organize the data
cleaning process. Consider the data in titanic.csv, which contains information about passengers on the maiden voyage of the RMS Titanic in 1912. Write a custom transformer class to
clean this data, implementing the transform() method as follows:
1. Extract a copy of data frame with just the "Pclass", "Sex", and "Age" columns.
2. Replace NaN values in the "Age" column (of the copied data frame) with the mean age.
The mean age of the training data should be calculated in fit() and used in transform()
(compare this step to using sklearn.preprocessing.Imputer).
3. Convert the "Pclass" column datatype to pandas categoricals (pd.CategoricalIndex).
4. Use pd.get_dummies() to convert the categorical columns to multiple binary columns
(compare this step to using sklearn.preprocessing.OneHotEncoder).
5. Cast the result as a NumPy array and return it.
Ensure that your transformer matches scikit-learn conventions (it inherits from the correct base
classes, fit() returns self, etc.).

In [2]:
class TitanicTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, return_array=True):
        self.return_array = return_array
        
    def fit(self, X, y=None):
        self.data = X[["Pclass", "Sex", "Age"]]
        self.age_mean = X.Age.mean()
        return self
    
    def transform(self, X):
        X = X[["Pclass", "Sex", "Age"]]
        X.Age = X.Age.fillna(self.age_mean)
        X.Pclass = pd.CategoricalIndex(X.Pclass)
        X = pd.get_dummies(X, columns=["Sex", "Pclass"], drop_first=True)
        if self.return_array:
            return np.array(X)
        else:
            return X

In [3]:
transform = TitanicTransformer()
titanic = pd.read_csv("titanic.csv")
X_train = transform.fit_transform(titanic)
X_train[:5]

array([[29.    ,  0.    ,  0.    ,  0.    ],
       [ 0.9167,  1.    ,  0.    ,  0.    ],
       [ 2.    ,  0.    ,  0.    ,  0.    ],
       [30.    ,  1.    ,  0.    ,  0.    ],
       [25.    ,  0.    ,  0.    ,  0.    ]])

# Problem 2. 
Read the data from titanic.csv with pd.read_csv(). The "Survived" column
indicates which passengers survived, so the entries of the column are the labels that we would
like to predict. Drop any rows in the raw data that have NaN values in the "Survived" column,
then separate the column from the rest of the data. Split the data and labels into training and
testing sets. Use the training data to fit a transformer from Problem 1, then use that transformer
to clean the training set, then the testing set. Finally, train a LogisticRegressionClassifier
and a RandomForestClassifier on the cleaned training data, and score them using the cleaned
test set.

In [4]:
titanic = pd.read_csv("titanic.csv").dropna(subset=["Survived"])
titanic[:5]

Unnamed: 0,Pclass,Survived,Name,Sex,Age,Sibsp,Parch,Ticket,Fare,Cabin,Embarked,Boat,Body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
X = titanic.drop(columns=["Survived"])
y = titanic.Survived.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(981, 13) (981,)
(328, 13) (328,)


In [6]:
tt = TitanicTransformer()
Z_train = tt.fit_transform(X_train)
Z_test = tt.transform(X_test)
Z_train.shape, Z_test.shape

((981, 4), (328, 4))

In [7]:
log_reg = sklearn.linear_model.LogisticRegression()
log_reg.fit(Z_train, y_train)
print("Logistic Regression prediction score")
print(log_reg.score(Z_test, y_test))

Logistic Regression prediction score
0.7774390243902439


In [8]:
rand_frst = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
rand_frst.fit(Z_train, y_train)
print("Random Forest prediction score")
print(rand_frst.score(Z_test, y_test))

Random Forest prediction score
0.774390243902439


# Problem 3. 
Use classification_report() to score your classifiers from Problem 2. Next,
do a grid search for each classifier (using only the cleaned training data), varying at least two hyperparameters for each kind of model. Use classification_report() to score the resulting
best estimators with the cleaned test data. Try changing the hyperparameter spaces or scoring
metrics so that each grid search yields a better estimator.

In [9]:
lr_predicted = log_reg.predict(Z_test)
print(classification_report(y_test, lr_predicted))

             precision    recall  f1-score   support

          0       0.77      0.90      0.83       201
          1       0.79      0.58      0.67       127

avg / total       0.78      0.78      0.77       328



In [10]:
rf_predicted = rand_frst.predict(Z_test)
print(classification_report(y_test, rf_predicted))

             precision    recall  f1-score   support

          0       0.74      0.98      0.84       201
          1       0.92      0.46      0.61       127

avg / total       0.81      0.77      0.75       328



In [11]:
log_reg = LogisticRegression()
lr_param_grid = {"penalty": ["l1", "l2"],
                 "C": [0.01, 0.1, 0.5, 1, 2, 10, 100]}
lr_gs = GridSearchCV(log_reg, lr_param_grid, cv=4, scoring="accuracy")
lr_gs.fit(Z_train, y_train)
print(lr_gs.best_params_, lr_gs.best_score_)
lr_gs_predicted = lr_gs.predict(Z_test)
print(classification_report(y_test, lr_gs_predicted))
lr_param_grid = {"C": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
lr_gs = GridSearchCV(log_reg, lr_param_grid, cv=4, scoring="accuracy")
lr_gs.fit(Z_train, y_train)
lr_gs_predicted = lr_gs.predict(Z_test)
print(lr_gs.best_params_, lr_gs.best_score_)
print(classification_report(y_test, lr_gs_predicted))
lr_param_grid = {"C": [0.64 + i/100 for i in range(10)]}
lr_gs = GridSearchCV(log_reg, lr_param_grid, cv=4, scoring="accuracy")
lr_gs.fit(Z_train, y_train)
lr_gs_predicted = lr_gs.predict(Z_test)
print(lr_gs.best_params_, lr_gs.best_score_)
print(classification_report(y_test, lr_gs_predicted))

{'C': 1, 'penalty': 'l1'} 0.7859327217125383
             precision    recall  f1-score   support

          0       0.77      0.90      0.83       201
          1       0.79      0.58      0.67       127

avg / total       0.78      0.78      0.77       328

{'C': 0.3} 0.780835881753313
             precision    recall  f1-score   support

          0       0.78      0.89      0.83       201
          1       0.78      0.60      0.68       127

avg / total       0.78      0.78      0.77       328

{'C': 0.64} 0.780835881753313
             precision    recall  f1-score   support

          0       0.77      0.90      0.83       201
          1       0.79      0.58      0.67       127

avg / total       0.78      0.78      0.77       328



In [12]:
rand_frst = RandomForestClassifier()
rf_param_grid = {"criterion": ["gini", "entropy"],
                 "n_estimators": [10, 50, 100, 150, 200, 250, 300],
                 "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]}
rf_gs = GridSearchCV(rand_frst, rf_param_grid, cv=4, scoring="accuracy")
rf_gs.fit(Z_train, y_train)
print(rf_gs.best_params_, rf_gs.best_score_)
rf_gs_predicted = rf_gs.predict(Z_test)
print(classification_report(y_test, rf_gs_predicted))

{'criterion': 'gini', 'max_depth': 6, 'n_estimators': 200} 0.8012232415902141
             precision    recall  f1-score   support

          0       0.78      0.93      0.85       201
          1       0.84      0.57      0.68       127

avg / total       0.80      0.79      0.78       328



# Problem 4. 
Make a pipeline with at least two transformers to further process the Titanic
dataset. Do a gridsearch on the pipeline and report the hyperparameters of the best estimator.

In [45]:
class AgeToCategorical(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None, boundary_points=[12, 18, 35, 60]):
        self.bp = boundary_points
        self.data = X.Age
        return self
    
    def transform(self, X):
        X.loc[X.Age < self.bp[0], "Age_group"] = "Child"
        X.loc[(X.Age >= self.bp[0]) & (X.Age < self.bp[1]), "Age_group"] = "Teen"
        X.loc[(X.Age >= self.bp[1]) & (X.Age < self.bp[2]), "Age_group"] = "Young Adult"
        X.loc[(X.Age >= self.bp[2]) & (X.Age < self.bp[3]), "Age_group"] = "Middle-aged"
        X.loc[X.Age >= self.bp[3], "Age_group"] = "Senior"
        X = pd.get_dummies(X, columns=["Age_group"], drop_first=True)
        X.drop(columns=["Age"])
        return np.array(X)


In [53]:
pipe = Pipeline([("titanic", TitanicTransformer(return_array=False)),
                 ("age_group", AgeToCategorical()),
                 ("log_reg", LogisticRegression())])
pipe_param_grid = [{"log_reg__penalty": ["l1", "l2"], 
                    "log_reg__C": [1e-2, 1e-1, 1, 1e1, 1e2]}]
pipe_gs = GridSearchCV(pipe, pipe_param_grid, cv=5, scoring="f1", verbose=1).fit(X_train, y_train)
params = pipe_gs.best_params_
print(pipe_gs.best_params_, pipe_gs.best_score_)
print(confusion_matrix(y_test, pipe_gs.predict(X_test)))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


{'log_reg__C': 10.0, 'log_reg__penalty': 'l1'} 0.7150306297107987
[[181  20]
 [ 52  75]]


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    4.9s finished
