In [59]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
SEED = 12
# sharper plots
%config InlineBackend.figure_format = 'retina'

In [92]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score


class RandomForestClassifierCustom(BaseEstimator):
    def __init__(
        self, n_estimators=10, max_depth=10, max_features=10, random_state=SEED
    ):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state

        self.trees = []
        self.feat_ids_by_tree = []

    def fit(self, X, y):
        self.classes_ = sorted(np.unique(y))
        
        for i in range(self.n_estimators):

            np.random.seed(self.random_state + i)
            feat_to_use_ids = np.random.choice(
                range(X.shape[1]), self.max_features, replace=False
            )
            examples_to_use = list(
                set(np.random.choice(range(X.shape[0]), X.shape[0], replace=True))
            )

            self.feat_ids_by_tree.append(feat_to_use_ids)

            dt = DecisionTreeClassifier(
                max_depth=self.max_depth,
                max_features=self.max_features,
                random_state=self.random_state,
            )

            dt.fit(X[examples_to_use, :][:, feat_to_use_ids], y[examples_to_use])
            self.trees.append(dt)
        return self

    def predict_proba(self, X):
        predictions = []
        for i in range(self.n_estimators):
            feat_to_use_ids = self.feat_ids_by_tree[i]
            predictions.append(self.trees[i].predict_proba(X[:, feat_to_use_ids]))
        return np.mean(predictions, axis=0)
    
    def predict(self, X):
        predictions = []
        for i in range(self.n_estimators):
            feat_to_use_ids = self.feat_ids_by_tree[i]
            predictions.append(self.trees[i].predict(X[:, feat_to_use_ids]))
        return np.mean(predictions, axis=0)

In [61]:
PATH_TO_DATA = Path("C:/Users/Sangua Terra/Downloads/mlcourse_ai_bonus_assignments/mlcourse_ai_bonus_jupyter_book/_static/data/assignment5/")
data = pd.read_csv(PATH_TO_DATA / "credit_scoring_sample.csv.zip", sep=";")

In [62]:
def fillna_median(df):
    for i in df.columns:
        df[i] = df[i].fillna(df[i].median())
    return df

In [63]:
table = fillna_median(data)

In [82]:
independent_columns_names = data.columns.values
independent_columns_names = [x for x in data if x != "SeriousDlqin2yrs"]
X = table[independent_columns_names]
y = table["SeriousDlqin2yrs"]
index = int(0.8 * X.shape[0])
X_train = X[:index]
y_train = y[:index]
X_test = X[index:]
y_test = y[index:]

In [86]:
y_test.shape

(9013,)

In [93]:
%%time
rfc = RandomForestClassifierCustom(max_depth=7, max_features=6)
rfc.fit(X.values,y.values)
y_pred = rfc.predict(X_test.values)


CPU times: total: 969 ms
Wall time: 990 ms


In [94]:
y_pred

array([0.9, 0.8, 0. , ..., 0. , 0. , 0.6])

In [95]:
from sklearn.metrics import roc_auc_score

In [96]:
roc_auc_score(y_test,y_pred)

0.7642221633807056