<a href="https://colab.research.google.com/github/rohit139/educationLanes_Classes/blob/master/EducationLanes_RandomForest_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%reset -f

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [0]:
# 1.3 Class for applying multiple data transformation jobs
from sklearn.compose import ColumnTransformer as ct

# 1.4 Scale numeric data
from sklearn.preprocessing import StandardScaler as ss

# 1.5 One hot encode data--Convert to dummy
from sklearn.preprocessing import OneHotEncoder as ohe

# 1.6 for data splitting
from sklearn.model_selection import train_test_split

In [0]:
# 1.7 Modeler
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# User guide: https://scikit-learn.org/stable/modules/tree.html
from sklearn.tree import DecisionTreeClassifier as dt

In [0]:
# 1.9 Kill warnings
import warnings
warnings.filterwarnings("ignore")

In [0]:
german = pd.read_csv("https://raw.githubusercontent.com/rohit139/educationLanes_Classes/master/dataFiles/german_credit.csv")

In [8]:
german.head(5)

Unnamed: 0,creditability,account_balance,previous_credit_payment_status,credit_duration_month,purpose_of_earlier_credit,credit_amount,installment_percent,current_emploment_length,sex_and_marital_status,guarantors,duration_at_current_address,most_valuable_available_asset,age,concurrent_credits,apartmenttype,howmanycreditsat_this_bank,occupation,dependents,telephone,foreign_worker
0,1,1,4,18,2,1049,4,2,2,1,4,2,21,3,1,1,3,1,1,1
1,1,1,4,9,0,2799,2,3,3,1,2,1,36,3,1,2,3,2,1,1
2,1,2,2,12,9,841,2,4,2,1,4,1,23,3,1,1,2,1,1,1
3,1,1,4,12,0,2122,3,3,3,1,2,1,39,3,1,2,2,2,1,2
4,1,1,4,12,0,2171,4,3,3,1,4,2,38,1,2,2,2,1,1,2


In [9]:
german.shape

(1000, 20)

In [10]:
german.columns

Index(['creditability', 'account_balance', 'previous_credit_payment_status',
       'credit_duration_month', 'purpose_of_earlier_credit', 'credit_amount',
       'installment_percent', 'current_emploment_length',
       'sex_and_marital_status', 'guarantors', 'duration_at_current_address',
       'most_valuable_available_asset', 'age', 'concurrent_credits',
       'apartmenttype', 'howmanycreditsat_this_bank', 'occupation',
       'dependents', 'telephone', 'foreign_worker'],
      dtype='object')

In [12]:
german.dtypes

creditability                     int64
account_balance                   int64
previous_credit_payment_status    int64
credit_duration_month             int64
purpose_of_earlier_credit         int64
credit_amount                     int64
installment_percent               int64
current_emploment_length          int64
sex_and_marital_status            int64
guarantors                        int64
duration_at_current_address       int64
most_valuable_available_asset     int64
age                               int64
concurrent_credits                int64
apartmenttype                     int64
howmanycreditsat_this_bank        int64
occupation                        int64
dependents                        int64
telephone                         int64
foreign_worker                    int64
dtype: object

In [13]:
german.dtypes.value_counts()

int64    20
dtype: int64

In [14]:
# 3.3 Shuffle data
german = german.sample(frac = 1)
german.tail()

Unnamed: 0,creditability,account_balance,previous_credit_payment_status,credit_duration_month,purpose_of_earlier_credit,credit_amount,installment_percent,current_emploment_length,sex_and_marital_status,guarantors,duration_at_current_address,most_valuable_available_asset,age,concurrent_credits,apartmenttype,howmanycreditsat_this_bank,occupation,dependents,telephone,foreign_worker
877,0,1,2,48,3,6758,3,3,2,1,2,3,31,3,2,1,3,1,2,1
107,1,4,2,18,3,1473,3,2,4,1,4,1,39,3,2,1,3,1,2,1
773,0,1,2,36,0,9271,2,4,3,1,1,3,24,3,2,1,3,1,2,1
518,0,4,2,24,3,5943,1,2,2,1,1,3,44,3,2,2,3,1,2,1
618,0,1,2,30,2,3108,2,2,1,1,4,2,31,3,2,1,2,1,1,1


In [15]:
german.shape

(1000, 20)

In [0]:
# 4.0 Create some new variables
#     from 'age' and 'credit_amount'

german['age_cat'] = pd.cut(german['age'], 3, labels=["0","1","2"])                      # Equal range cut
german['age_qcat'] = pd.qcut(german['age'], 3, labels=["0","1","2"])                    # Equal freq cut


# 4.1
german['credit_amount_cat'] = pd.cut(german['credit_amount'], 3, labels=["0","1","2"])
german['credit_amount_qcat'] = pd.cut(german['credit_amount'], 3, labels=["0","1","2"])

In [17]:
# 4.2 Separate predictors and target
# 4.3 Popup target
y = german.pop('creditability')
y[:3]                 # Pandas Series


79     1
156    1
744    1
Name: creditability, dtype: int64

In [18]:
# 4.1 Remaining dataframe only has predictors
#     Create an alias of german
X = german
X is german      # X is same as 'german'

True

In [19]:
### 4.2
###    We now want to know which of the columns are categorical
###    but disguised as integers


# 4.3 How many unique vales per column.
#     Check every column
#     We will assume that if unique values are 4 or less
#     it is categorical column else numeric
X.nunique()
X.nunique() < 5    # All True are categorical

account_balance                    True
previous_credit_payment_status    False
credit_duration_month             False
purpose_of_earlier_credit         False
credit_amount                     False
installment_percent                True
current_emploment_length          False
sex_and_marital_status             True
guarantors                         True
duration_at_current_address        True
most_valuable_available_asset      True
age                               False
concurrent_credits                 True
apartmenttype                      True
howmanycreditsat_this_bank         True
occupation                         True
dependents                         True
telephone                          True
foreign_worker                     True
age_cat                            True
age_qcat                           True
credit_amount_cat                  True
credit_amount_qcat                 True
dtype: bool

In [20]:
# 4.3.1 Set difference operation
X.columns

Index(['account_balance', 'previous_credit_payment_status',
       'credit_duration_month', 'purpose_of_earlier_credit', 'credit_amount',
       'installment_percent', 'current_emploment_length',
       'sex_and_marital_status', 'guarantors', 'duration_at_current_address',
       'most_valuable_available_asset', 'age', 'concurrent_credits',
       'apartmenttype', 'howmanycreditsat_this_bank', 'occupation',
       'dependents', 'telephone', 'foreign_worker', 'age_cat', 'age_qcat',
       'credit_amount_cat', 'credit_amount_qcat'],
      dtype='object')

In [0]:
r = ['account_balance', 'previous_credit_payment_status', 'credit_duration_month',
     'purpose_of_earlier_credit', 'credit_amount']

In [22]:
set(X.columns).difference(set(r))

{'age',
 'age_cat',
 'age_qcat',
 'apartmenttype',
 'concurrent_credits',
 'credit_amount_cat',
 'credit_amount_qcat',
 'current_emploment_length',
 'dependents',
 'duration_at_current_address',
 'foreign_worker',
 'guarantors',
 'howmanycreditsat_this_bank',
 'installment_percent',
 'most_valuable_available_asset',
 'occupation',
 'sex_and_marital_status',
 'telephone'}

In [0]:
# 4.4 Define a function to separate out categorical/numerical columns
def sep_Cat_Num_columns(dx):
    """
    Takes as input DataFrame
    Returns two lists:
      i)  One of categorical columns
      ii) Other of remaining numerical columns
    """
    # 4.4.1
    cat = dx.nunique() < 5                         # Will give True/False
    # 4.4.2
    categorical_columns = dx.loc[: , cat].columns  # List of cat columns
    # 4.4.3 Remaining are numeric columns
    numerical_columns = set(dx.columns).difference(set(categorical_columns))
    # 4.4.4 Return a list of both columns
    return list(categorical_columns),list(numerical_columns)

In [24]:
# 5.0 Get the columns now
sep_Cat_Num_columns.__doc__

'\n    Takes as input DataFrame\n    Returns two lists:\n      i)  One of categorical columns\n      ii) Other of remaining numerical columns\n    '

In [25]:
# 5.1
categorical_columns,numerical_columns = sep_Cat_Num_columns(X)
categorical_columns
numerical_columns

['current_emploment_length',
 'previous_credit_payment_status',
 'age',
 'credit_amount',
 'purpose_of_earlier_credit',
 'credit_duration_month']

In [26]:
categorical_columns

['account_balance',
 'installment_percent',
 'sex_and_marital_status',
 'guarantors',
 'duration_at_current_address',
 'most_valuable_available_asset',
 'concurrent_credits',
 'apartmenttype',
 'howmanycreditsat_this_bank',
 'occupation',
 'dependents',
 'telephone',
 'foreign_worker',
 'age_cat',
 'age_qcat',
 'credit_amount_cat',
 'credit_amount_qcat']

In [0]:
###########################
## Process/standardise data
###########################

#### Data Processing and Modeling: Simple Expt first

In [28]:
# 6.0 Create a small DataFrame with two categorical and two numeric variables
dk = pd.DataFrame({'cat':    ['h', 'h', 'l', 'm', 'l', 'm'],
                   'store' : ['a', 'a', 'b' ,'b', 'a','b'],
                   'price' : [2,3,5,9,10,11],
                   'qty'   : [100,200,400,800,900,900]
                   })
dk

Unnamed: 0,cat,store,price,qty
0,h,a,2,100
1,h,a,3,200
2,l,b,5,400
3,m,b,9,800
4,l,a,10,900
5,m,b,11,900


In [29]:
# 8.1 OneHotEncode categorical variables
onehot = ohe(sparse = False)               # Create object instance
onehot.fit(dk[['cat', 'store']])           # Learn data
t = onehot.transform(dk[['cat','store']])  # Now transform
t

array([[1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1.]])

In [30]:
# 8.2 Scale numeric data
scaleit = ss()
scaleit.fit(dk[['price','qty']])
scaleit.transform(dk[['price','qty']])

array([[-1.33484762, -1.36196984],
       [-1.04880885, -1.05930987],
       [-0.47673129, -0.45398995],
       [ 0.66742381,  0.75664991],
       [ 0.95346259,  1.05930987],
       [ 1.23950137,  1.05930987]])

In [0]:
# 8.3 Columnar transformer: Two-in-one
#     Use both ohe and scaler in a composite operation

# 8.4 Define operations to perform and on which columns
#     Format: (name, transformer, columns)

op1 = ("cat_col", ohe(sparse = False), ['cat', 'store'])
op2 = ("num_col", ss(), ['price','qty'])

In [32]:
op1

('cat_col', OneHotEncoder(categorical_features=None, categories=None,
        dtype=<class 'numpy.float64'>, handle_unknown='error',
        n_values=None, sparse=False), ['cat', 'store'])

In [33]:
op2

('num_col',
 StandardScaler(copy=True, with_mean=True, with_std=True),
 ['price', 'qty'])

In [0]:
# 8.5 Create column-transformer object to perform these operations
#     The object contains one list of operations
col_transformer = ct([op1,op2])     # Instaniation

In [35]:
col_transformer

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat_col', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=False), ['cat', 'store']), ('num_col', StandardScaler(copy=True, with_mean=True, with_std=True), ['price', 'qty'])])

In [36]:
# 8.6 Fit and transform now
col_transformer.fit(dk)             # Learn data (dk)
u = col_transformer.transform(dk)   # Transformation of dk
u

array([[ 1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        -1.33484762, -1.36196984],
       [ 1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        -1.04880885, -1.05930987],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        -0.47673129, -0.45398995],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.66742381,  0.75664991],
       [ 0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.95346259,  1.05930987],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         1.23950137,  1.05930987]])

In [0]:
########### Expt finished. Now create function

# 9.0 Following function does all the above
def transform(categorical_columns,numerical_columns,X):
    #  Create a tuple of processing tasks:
    #  (taskName, objectToPerformTask, columns-upon-which-to-perform)
    # 9.1 One hot encode categorical columns
    cat = ('categorical', ohe() , categorical_columns  )
    # 9.2 Scale numerical columns
    num = ('numeric', ss(), numerical_columns)
    # 9.3 Instantiate columnTransformer object to perform task
    #     It transforms X separately by each transformer
    #     and then concatenates results.
    col_trans = ct([cat, num])
    # 9.4 Learn data
    col_trans.fit(X)
    # 9.5 Now transform X
    X_tarnsAndScaled = col_trans.transform(X)
    # 9.6 Return transformed data and also transformation object
    return X_tarnsAndScaled, col_trans

In [0]:
# 10.0 Transform both datasets
X_tarnsAndScaled, _  = transform(categorical_columns, numerical_columns, X)

In [39]:
# 10.1
X_tarnsAndScaled.shape 

(1000, 61)

In [40]:
X_tarnsAndScaled[:5, :3]

array([[0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 0., 0.]])

In [41]:
X_tarnsAndScaled[:5,58:] 

array([[-0.45802632,  0.06270354, -0.48976238],
       [-0.62851269,  0.06270354, -0.73866754],
       [ 0.54575201, -0.66640737,  0.75476341],
       [-0.89044496, -1.03096283, -0.73866754],
       [-0.8301899 , -0.30185192,  0.25695309]])

In [0]:
###########################
## Split and model
###########################

# 11 Split into train and test datasets AS ALSO GET INDICIES
X_train,X_test, y_train, y_test = train_test_split(
                                                    X_tarnsAndScaled,    # Predictors
                                                    y,                   # Target
                                                    test_size = 0.3      # split-ratio
                                                    )

In [43]:
# 11.2 Check the splits
print(X_train.shape)       # 700 X 61
print(X_test.shape)        # 300 X 61
print(y_train.shape)       # (700,)
print(y_test.shape)        # (300,)

(700, 61)
(300, 61)
(700,)
(300,)


In [0]:
# 12.0 Modeling
# 12.1 Instantiate decision tree modeling object
clf = dt()            # Accept all default parameters

In [45]:
# 12.2 Training
clf.fit(X_train,y_train)   # Train now

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [0]:
# 12.3 Prediction
out = clf.predict(X_test)

In [47]:
# 12.4 Accuracy?
np.sum(out == y_test)/y_test.values.size

0.6633333333333333

In [0]:
### 13.0 Column importance
# 13 Which columns are important
impt = clf.feature_importances_

In [49]:
impt

array([0.02509363, 0.01342604, 0.00610537, 0.10017415, 0.00790178,
       0.00984576, 0.02756398, 0.01406537, 0.00524545, 0.00831633,
       0.02716608, 0.00799169, 0.00600886, 0.00423559, 0.00230597,
       0.01661001, 0.0266054 , 0.0095474 , 0.00447535, 0.        ,
       0.01291731, 0.00736688, 0.        , 0.        , 0.        ,
       0.00661476, 0.00447535, 0.        , 0.00812095, 0.00447535,
       0.00671302, 0.0113924 , 0.        , 0.        , 0.01359258,
       0.00447535, 0.01309039, 0.00575402, 0.01564373, 0.00447535,
       0.00864381, 0.        , 0.00569118, 0.00335651, 0.00906257,
       0.        , 0.01208743, 0.00584445, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.07297984, 0.04984173, 0.08640535, 0.17533296, 0.03719571,
       0.09176682])

In [0]:

cols = categorical_columns + numerical_columns

In [51]:
cols

['account_balance',
 'installment_percent',
 'sex_and_marital_status',
 'guarantors',
 'duration_at_current_address',
 'most_valuable_available_asset',
 'concurrent_credits',
 'apartmenttype',
 'howmanycreditsat_this_bank',
 'occupation',
 'dependents',
 'telephone',
 'foreign_worker',
 'age_cat',
 'age_qcat',
 'credit_amount_cat',
 'credit_amount_qcat',
 'current_emploment_length',
 'previous_credit_payment_status',
 'age',
 'credit_amount',
 'purpose_of_earlier_credit',
 'credit_duration_month']

In [52]:
# 13.1 Sort out zip
sorted(zip(impt,cols), reverse = True)

[(0.1001741543145696, 'guarantors'),
 (0.027563981209995973, 'concurrent_credits'),
 (0.027166076594158945, 'dependents'),
 (0.02660539657718406, 'credit_amount_qcat'),
 (0.025093627816674423, 'account_balance'),
 (0.016610013199244617, 'credit_amount_cat'),
 (0.01406537201310637, 'apartmenttype'),
 (0.013426036921601535, 'installment_percent'),
 (0.012917314389328747, 'credit_amount'),
 (0.009845760409174457, 'most_valuable_available_asset'),
 (0.00954740403313887, 'current_emploment_length'),
 (0.008316326138721084, 'occupation'),
 (0.007991688643810438, 'telephone'),
 (0.007901782146567565, 'duration_at_current_address'),
 (0.007366883895294351, 'purpose_of_earlier_credit'),
 (0.006105374548400703, 'sex_and_marital_status'),
 (0.006008855685192294, 'foreign_worker'),
 (0.005245453818937399, 'howmanycreditsat_this_bank'),
 (0.004475345640533845, 'previous_credit_payment_status'),
 (0.0042355949812195315, 'age_cat'),
 (0.002305965432314057, 'age_qcat'),
 (0.0, 'credit_duration_month')

In [0]:
# 14.5 Instantiate modeler class
clf = dt(min_samples_leaf = 5)    # Change number of data-points on leaf


In [55]:
# 14.1 Train and develop model
clf.fit(X_train,y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [0]:
# 14.2 Predict
out = clf.predict(X_test)


In [59]:
np.sum(out == y_test)/y_test.values.size   # No change

0.6866666666666666

In [0]:
###################### Random Forest ############################

from sklearn.ensemble import RandomForestClassifier #use RandomForestRegressor for regression problem
# Create Random Forest object
model_rf= RandomForestClassifier(n_estimators=100)


In [61]:
# Train the model using the training sets and check score
model_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [0]:
#Predict Output
predicted= model_rf.predict(X_test)

In [63]:
np.sum(predicted == y_test)/y_test.values.size  

0.7833333333333333