# Random Forests

In [1]:
#Import all required libraries for reading data, analysing and visualizing data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from pandas.api.types import is_string_dtype, is_numeric_dtype
import math
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix


# Data

In [116]:
mtrain = pd.read_csv('train.csv', low_memory=False )
mtest = pd.read_csv('test.csv', low_memory=False )
mtrain.head(), mtest.head()

(   ID  GAGE  BAGE GP BP  AINCOME     STATUS
 0   1    41    41  A  B   113764    Married
 1   2    53    43  A  A   145963  Separated
 2   3    47    37  A  B    42857   Divorced
 3   4    47    44  B  A    95352    Married
 4   5    21    59  B  B   182138  Separated,
       ID  GAGE  BAGE GP BP  AINCOME  STATUS
 0  20001    38    28  B  B   181126     NaN
 1  20002    23    23  A  B    99510     NaN
 2  20003    28    27  A  A   159644     NaN
 3  20004    25    56  B  A    65539     NaN
 4  20005    28    28  B  A   178125     NaN)

In [34]:
def display_all(df):
    with pd.option_context('display.max_rows', 1000, 'display.max_columns', 1000): 
        display(df)

In [35]:
display_all(mtrain.tail().T)

Unnamed: 0,745,746,747,748,749
ID,746,747,748,749,750
GAGE,22,58,55,31,40
BAGE,54,58,45,24,40
GP,B,A,A,B,B
BP,A,A,A,B,B
AINCOME,105801,174820,108178,36314,135004
STATUS,Divorced,Separated,Separated,Separated,Separated


In [93]:
mtrain.isnull().sum()

AttributeError: 'NoneType' object has no attribute 'isnull'

In [7]:
mtest.isnull().sum()

ID           0
GAGE         0
BAGE         0
GP           0
BP           0
AINCOME      0
STATUS     264
dtype: int64

## Initial Processing

In [8]:
def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.

    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.

    Examples:
    ---------

    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000'], infer_datetime_format=False) })
    >>> df

        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13

    >>> add_datepart(df, 'A')
    >>> df

        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [117]:
categorical = mtest.dtypes[mtest.dtypes == 'object'].index
categorical

Index(['GP', 'BP'], dtype='object')

In [112]:
list(mtrain.dtypes[mtrain.dtypes == 'object' ].index)

['GP', 'BP', 'STATUS']

In [113]:
mtest.dtypes[mtest.dtypes == 'object' ].index

Index(['GP', 'BP'], dtype='object')

In [120]:
def process_cat(df, categorical):
    for col in categorical:
        df = pd.concat([df, pd.get_dummies(df[col], prefix = col)], axis=1)
        df.drop(col, axis=1, inplace=True)
        #print(df.head())
    return(df)

In [119]:
mtrain = process_cat(mtrain, categorical)
mtrain.head()

   ID  GAGE  BAGE BP  AINCOME     STATUS  GP_A  GP_B
0   1    41    41  B   113764    Married     1     0
1   2    53    43  A   145963  Separated     1     0
2   3    47    37  B    42857   Divorced     1     0
3   4    47    44  A    95352    Married     0     1
4   5    21    59  B   182138  Separated     0     1
   ID  GAGE  BAGE  AINCOME     STATUS  GP_A  GP_B  BP_A  BP_B
0   1    41    41   113764    Married     1     0     0     1
1   2    53    43   145963  Separated     1     0     1     0
2   3    47    37    42857   Divorced     1     0     0     1
3   4    47    44    95352    Married     0     1     1     0
4   5    21    59   182138  Separated     0     1     0     1


Unnamed: 0,ID,GAGE,BAGE,AINCOME,STATUS,GP_A,GP_B,BP_A,BP_B
0,1,41,41,113764,Married,1,0,0,1
1,2,53,43,145963,Separated,1,0,1,0
2,3,47,37,42857,Divorced,1,0,0,1
3,4,47,44,95352,Married,0,1,1,0
4,5,21,59,182138,Separated,0,1,0,1


In [121]:
mtest = process_cat(mtest, categorical)
mtest.head()

Unnamed: 0,ID,GAGE,BAGE,AINCOME,STATUS,GP_A,GP_B,BP_A,BP_B
0,20001,38,28,181126,,0,1,0,1
1,20002,23,23,99510,,1,0,0,1
2,20003,28,27,159644,,1,0,1,0
3,20004,25,56,65539,,0,1,1,0
4,20005,28,28,178125,,0,1,1,0


In [54]:
def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """ proc_df takes a data frame df and splits off the response variable, and
    changes the df into an entirely numeric dataframe.

    Parameters:
    -----------
    df: The data frame you wish to process.

    y_fld: The name of the response variable

    skip_flds: A list of fields that dropped from df.

    ignore_flds: A list of fields that are ignored during processing.

    do_scale: Standardizes each column in df. Takes Boolean Values(True,False)

    na_dict: a dictionary of na columns to add. Na columns are also added if there
        are any missing values.

    preproc_fn: A function that gets applied to df.

    max_n_cat: The maximum number of categories to break into dummy values, instead
        of integer codes.

    subset: Takes a random subset of size subset from df.

    mapper: If do_scale is set as True, the mapper variable
        calculates the values used for scaling of variables during training time (mean and standard deviation).

    Returns:
    --------
    [x, y, nas, mapper(optional)]:

        x: x is the transformed version of df. x will not have the response variable
            and is entirely numeric.

        y: y is the response variable

        nas: returns a dictionary of which nas it created, and the associated median.

        mapper: A DataFrameMapper which stores the mean and standard deviation of the corresponding continuous
        variables which is then used for scaling of during test-time.
    """
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

def fix_missing(df, col, name, na_dict):
    """ Fill missing data in a column of df with the median, and add a {name}_na column
    which specifies if the data was missing.

    Parameters:
    -----------
    df: The data frame that will be changed.

    col: The column of data to fix by filling in missing data.

    name: The name of the new filled column in df.

    na_dict: A dictionary of values to create na's of and the value to insert. If
        name is not a key of na_dict the median will fill any missing data. Also
        if name is not a key of na_dict and there is no missing data in col, then
        no {name}_na column is not created.
    """
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict
def numericalize(df, col, name, max_n_cat):
    """ Changes the column col from a categorical type to it's integer codes.

    Parameters:
    -----------
    df: A pandas dataframe. df[name] will be filled with the integer codes from
        col.

    col: The column you wish to change into the categories.
    name: The column name you wish to insert into df. This column will hold the
        integer codes.

    max_n_cat: If col has more categories than max_n_cat it will not change the
        it to its integer codes. If max_n_cat is None, then col will always be
        converted.
    """
    if not is_numeric_dtype(col) and ( max_n_cat is None or col.nunique()>max_n_cat):
        df[name] = col.cat.codes+1

In [126]:
mtest.isnull().sum()

ID           0
GAGE         0
BAGE         0
AINCOME      0
STATUS     264
GP_A         0
GP_B         0
BP_A         0
BP_B         0
dtype: int64

In [122]:
x = mtrain.drop(['STATUS'], axis=1)
y = mtrain.STATUS

In [123]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=101)
print (X_train.shape)
print (y_train.shape)
print (X_test.shape)
print (y_test.shape)

(600, 8)
(600,)
(150, 8)
(150,)


In [124]:
random_forest = RandomForestClassifier()
%time random_forest.fit(X_train, y_train)
rf_predicted = random_forest.predict(X_test)
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)

Wall time: 34 ms
Random Forest Score: 
 99.67
Random Forest Test Score: 
 88.67


In [127]:
mtest.drop('STATUS', axis=1, inplace=True)

In [128]:
mtest.head()

Unnamed: 0,ID,GAGE,BAGE,AINCOME,GP_A,GP_B,BP_A,BP_B
0,20001,38,28,181126,0,1,0,1
1,20002,23,23,99510,1,0,0,1
2,20003,28,27,159644,1,0,1,0
3,20004,25,56,65539,0,1,1,0
4,20005,28,28,178125,0,1,1,0


In [129]:
#We save the submission as a '.csv' file
submission = pd.DataFrame({
        "ID": mtest['ID'],
        "STATUS": random_forest.predict(mtest)
    })

In [130]:
submission.head()

Unnamed: 0,ID,STATUS
0,20001,Separated
1,20002,Married
2,20003,Separated
3,20004,Divorced
4,20005,Married


In [131]:
submission.to_csv('MP_rf1.csv', index=False)

In [132]:
import math
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [143]:
random_forest = RandomForestClassifier(n_jobs=-1)
random_forest.fit(X_train, y_train)

rf_predicted = random_forest.predict(X_test)
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)

Random Forest Score: 
 100.0
Random Forest Test Score: 
 86.0


In [145]:
preds = np.stack([t.predict(X_test) for t in random_forest.estimators_])

In [150]:
preds.shape

(10, 150)

In [151]:
preds[:,0], np.mean(preds[:,0]), y_test

(array([ 2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.]),
 2.0,
 516    Separated
 603    Separated
 31       Married
 127      Married
 191    Separated
 291      Married
 665    Separated
 237      Married
 194     Divorced
 645      Married
 723    Separated
 187    Separated
 138    Separated
 423    Separated
 601      Married
 213      Married
 521      Married
 42     Separated
 56      Divorced
 693     Divorced
 74       Married
 269    Separated
 183    Separated
 225      Married
 174     Divorced
 446      Married
 392      Married
 432      Married
 607      Married
 692    Separated
          ...    
 147    Separated
 447      Married
 228    Separated
 739    Separated
 89       Married
 449    Separated
 100    Separated
 157    Separated
 635     Divorced
 288    Separated
 68      Divorced
 257     Divorced
 12      Divorced
 339    Separated
 710    Separated
 493     Divorced
 2       Divorced
 414    Separated
 545    Separated
 611    Separated
 743     Divorce

In [154]:
random_forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
random_forest.fit(X_train, y_train)

rf_predicted = random_forest.predict(X_test)
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)

Random Forest Score: 
 100.0
Random Forest Test Score: 
 84.67


In [157]:
random_forest = RandomForestClassifier(n_estimators=20, n_jobs=-1)
random_forest.fit(X_train, y_train)

rf_predicted = random_forest.predict(X_test)
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)

Random Forest Score: 
 100.0
Random Forest Test Score: 
 90.0


In [156]:
random_forest = RandomForestClassifier(n_estimators=30, n_jobs=-1)
random_forest.fit(X_train, y_train)

rf_predicted = random_forest.predict(X_test)
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)

Random Forest Score: 
 100.0
Random Forest Test Score: 
 87.33


In [158]:
submission = pd.DataFrame({
        "ID": mtest['ID'],
        "STATUS": random_forest.predict(mtest)
    })
submission.to_csv('MP_rf2.csv', index=False)