In [158]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Imputer
from sklearn.model_selection import GridSearchCV

In [159]:


from sklearn.base import TransformerMixin, BaseEstimator

class CategoricalTransformer(TransformerMixin, BaseEstimator):
    "Converts a set of columns in a DataFrame to categoricals"
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        'Records the categorical information'
        self.cat_map_ = {col: X[col].astype('category').cat
                         for col in self.columns}
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for col in self.columns:
            X[col] = pd.Categorical(X[col],
            categories=self.cat_map_[col].categories,
            ordered=self.cat_map_[col].ordered)
        return X

    def inverse_transform(self, trn, y=None):
        trn = trn.copy()
        trn[self.columns] = trn[self.columns].apply(lambda x: x.astype(object))
        return trn
    
class DummyEncoder(TransformerMixin, BaseEstimator):

    def __init__(self, columns= None, drop_first=False):
        self.columns = columns
        self.drop_first = drop_first

        self.columns_ = None
        self.cat_columns_ = None  # type: pd.Index
        self.non_cat_columns_ = None  # type: pd.Index
        self.categories_map_ = None
        self.ordered_map_ = None
        self.cat_blocks_ = None

    def fit(self, X, y=None):
        self.columns_ = X.columns
        if self.columns is None:
            self.cat_columns_ = X.select_dtypes(include=['category']).columns
        else:
            self.cat_columns_ = self.columns
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)

        self.categories_map_ = {col: X[col].cat.categories
                                for col in self.cat_columns_}
        self.ordered_map_ = {col: X[col].cat.ordered
                             for col in self.cat_columns_}

        left = len(self.non_cat_columns_)
        self.cat_blocks_ = {}
        for col in self.cat_columns_:
            right = left + len(X[col].cat.categories)
            self.cat_blocks_[col] = slice(left, right)
            left = right
        return self

    def transform(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            return pd.get_dummies(X, drop_first=self.drop_first)
        elif isinstance(X, dd.DataFrame):
            return X.map_partitions(pd.get_dummies, drop_first=self.drop_first)
        else:
            raise TypeError

    def inverse_transform(self, X):
        print len(self.non_cat_columns_)
        non_cat = pd.DataFrame(X.iloc[:,:len(self.non_cat_columns_)], columns=self.non_cat_columns_)
        cats = []
        for col in self.cat_columns_:
            slice_ = self.cat_blocks_[col]
            categories = self.categories_map_[col]
            ordered = self.ordered_map_[col]
            print slice

            codes = X.iloc[:, slice_].as_matrix().argmax(1)
            series = pd.Series(pd.Categorical.from_codes(
                codes, categories, ordered=ordered
            ), name=col)
            cats.append(series)
        df = pd.concat([non_cat] + cats, axis=1)[self.columns_]
        return df


In [160]:
titanic = pd.read_csv("dataset/titanic.csv")
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [161]:
X_train, X_test, y_train, y_test = train_test_split(
    titanic[['Pclass', "Sex", "Embarked", "Age", "SibSp","Parch"]],
    titanic[["Survived"]], test_size=0.2, random_state=42)

In [162]:
print(len(X_train))
print(len(y_train))

print(len(X_test))
print(len(y_test))

712
712
179
179


In [163]:
X_train.head()

Unnamed: 0,Pclass,Sex,Embarked,Age,SibSp,Parch
331,1,male,S,45.5,0,0
733,2,male,S,23.0,0,0
382,3,male,S,32.0,0,0
704,3,male,S,26.0,1,0
813,3,female,S,6.0,4,2


In [164]:
y_train.head()

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0


In [165]:
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0,Pclass,Sex,Embarked,Age,SibSp,Parch
29,3,male,S,,0,0
55,1,male,S,,0,0
533,3,female,C,,0,2
495,3,male,C,,0,0
468,3,male,Q,,0,0
409,3,female,S,,3,1
425,3,male,S,,0,0
101,3,male,S,,0,0
298,1,male,S,,0,0
260,3,male,Q,,0,0


In [174]:
# Create a dictionary containing all the candidate values of the parameters
parameter_grid = dict(imputer__strategy=["most_frequent", "mean"]
    ,randomforestclassifier__criterion=['gini','entropy']
    ,randomforestclassifier__max_features=['auto',None, "log2"]
                     )

X_train, X_test, y_train, y_test = train_test_split(
    titanic[['Pclass', "Sex", "Embarked", "Age","SibSp","Parch"]],
    titanic[["Survived"]], test_size=0.2, random_state=42)

categoricalColumns = ['Pclass', "Sex", "Embarked"]
pipe = make_pipeline(CategoricalTransformer(categoricalColumns)
                    , DummyEncoder()
                    ,Imputer(missing_values=np.nan, strategy='most_frequent', axis=0)
                     ,RandomForestClassifier()
                    )
print pipe 
grid = GridSearchCV(estimator=pipe, param_grid =parameter_grid )
grid.fit(X_train, y_train.values.ravel())
#t = pipe.transform(X_train)
#t

Pipeline(memory=None,
     steps=[('categoricaltransformer', CategoricalTransformer(columns=['Pclass', 'Sex', 'Embarked'])), ('dummyencoder', DummyEncoder(columns=None, drop_first=False)), ('imputer', Imputer(axis=0, copy=True, missing_values=nan, strategy='most_frequent',
    verbose=0)), ('randomforestclassifier', RandomFor...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('categoricaltransformer', CategoricalTransformer(columns=['Pclass', 'Sex', 'Embarked'])), ('dummyencoder', DummyEncoder(columns=None, drop_first=False)), ('imputer', Imputer(axis=0, copy=True, missing_values=nan, strategy='most_frequent',
    verbose=0)), ('randomforestclassifier', RandomFor...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'imputer__strategy': ['most_frequent', 'mean'], 'randomforestclassifier__criterion': ['gini', 'entropy'], 'randomforestclassifier__max_features': ['auto', None, 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [175]:
print grid.best_estimator_.named_steps["randomforestclassifier"].criterion
print grid.best_estimator_.named_steps["imputer"].strategy
print grid.best_estimator_.named_steps["randomforestclassifier"].max_features

entropy
mean
auto


In [168]:
grid.cv_results_

{'mean_fit_time': array([ 0.08500004,  0.07100002,  0.06100011,  0.06566676]),
 'mean_score_time': array([ 0.01600003,  0.01400002,  0.01366655,  0.012     ]),
 'mean_test_score': array([ 0.78230337,  0.79073034,  0.78370787,  0.77949438]),
 'mean_train_score': array([ 0.92697165,  0.92977719,  0.92766896,  0.9304819 ]),
 'param_imputer__strategy': masked_array(data = ['most_frequent' 'most_frequent' 'mean' 'mean'],
              mask = [False False False False],
        fill_value = ?),
 'param_randomforestclassifier__criterion': masked_array(data = ['gini' 'entropy' 'gini' 'entropy'],
              mask = [False False False False],
        fill_value = ?),
 'params': [{'imputer__strategy': 'most_frequent',
   'randomforestclassifier__criterion': 'gini'},
  {'imputer__strategy': 'most_frequent',
   'randomforestclassifier__criterion': 'entropy'},
  {'imputer__strategy': 'mean', 'randomforestclassifier__criterion': 'gini'},
  {'imputer__strategy': 'mean',
   'randomforestclassifier__cr

In [169]:
predicted = grid.predict(X_test)

In [170]:
print("Classification report for classifier %s:\n%s\n"
      % (grid, metrics.classification_report(y_test, predicted)))

Classification report for classifier GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('categoricaltransformer', CategoricalTransformer(columns=['Pclass', 'Sex', 'Embarked'])), ('dummyencoder', DummyEncoder(columns=None, drop_first=False)), ('imputer', Imputer(axis=0, copy=True, missing_values=nan, strategy='most_frequent',
    verbose=0)), ('randomforestclassifier', RandomFor...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'imputer__strategy': ['most_frequent', 'mean'], 'randomforestclassifier__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0):
             precision    recall  f1-score   support

          0       0.82      0.88      0.85       105
          1       0.81      0.73      0.77        74

avg / total       0.82      0.82     

In [171]:
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, predicted))

Confusion matrix:
[[92 13]
 [20 54]]


In [172]:
print("Acurracy:\n%s" % metrics.classification.accuracy_score(y_test, predicted))

Acurracy:
0.815642458101


In [173]:
help(RandomForestClassifier)

Help on class RandomForestClassifier in module sklearn.ensemble.forest:

class RandomForestClassifier(ForestClassifier)
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and use averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is always the same as the original
 |  input sample size but the samples are drawn with replacement if
 |  `bootstrap=True` (default).
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : integer, optional (default=10)
 |      The number of trees in the forest.
 |  
 |  criterion : string, optional (default="gini")
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "entropy" for the information gain.
 |      Note: this parameter is tree-specific.
 |  
 |  max_features : int, fl