# Random Forest

The assignment is to add a Random Forest to your Week 6 homework.

1)  Add an option to your handwritten Decision Tree so that it can consider only a random subset of features at each node (say, the sqrt(n_features)).

2)  Add a random forest trainer that will create N bootstrapped datasets, and train a random Decision Tree for each.

3)  Prediction should be based on a whichever outcome gets the most votes by the N trees.

You can test that it's working on the iris dataset.

Add an sklearn Random Forest to your model comparison for the Adult dataset like you did with the decision tree in the previous assignment.

---



In [154]:
import pandas as pd
import numpy as np 
# Algorithm 
from sklearn.ensemble import RandomForestClassifier
# Data Manipulation 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
# Tuning
from sklearn.model_selection import GridSearchCV

# Evaluation 
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef

df = pd.read_csv('iris.data')


# Decision Tree

In [155]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

df = pd.read_csv('PlayTennis.csv')

In [156]:
def p(node, target):
  '''
  This function 
  '''
  counts = Counter(node[target])
  return np.sum([(freq/len(node))**2 for key, freq in counts.items()])

def gini_index(df, col, val, target):
  '''
  This function takes a col and splits it by the value an returns the gini index of this split
  '''
  if df[col].dtype == 'O' or df[col].dtype == 'bool':
    left = df[(df[col] == val)]
    right = df[(df[col] !=  val)]
  else:
    left = df[(df[col] <= val)]
    right = df[(df[col] > val)]
  p1 = p(left, target)
  p2 = p(right, target)
  # Get the predictions for each side. The prediction is just the category
  # (yes or no) that has the most observations 
  try:
    left_pred = left[[target]].value_counts().sort_values(ascending = False).index[0][0]
    right_pred = right[[target]].value_counts().sort_values(ascending = False).index[0][0]
  except Exception:
    left_pred = ''
    right_pred = ''
  return len(left)/len(df)*(1- p1) + len(right)*(1 - p2)/len(df), (left_pred, right_pred)

In [157]:
def get_best_gini(df, col, target):
  '''
  This functions loops through all the values of a column and
  return the column with the lowest gini index (the best one) for the split
  '''
  vals = df[col].unique()
  vals = sorted(vals)
  best_val = 0
  best_gini = 1
  best_preds = ''
  for val in vals:
    current_gini, current_preds  = gini_index(df, col, val, target)
    if current_gini < best_gini:
      best_val = val
      best_gini = current_gini
      best_preds = current_preds
      
  return best_val, best_gini, best_preds


In [158]:
def get_best_column(df, target):
  '''
  This function picks the best column (the one with the lowest gini index) in
  the data frame and returns the column and the correct split
  '''
  best_col = ''
  best_gini = 10
  split_val = 0
  best_preds = ''
  X = list(df.columns)
  
  X.remove(target)
  for col in X:
    current_val, current_gini, current_preds = get_best_gini(df, col, target)
    if current_gini < best_gini:
      best_col = col
      best_gini = current_gini
      split_val = current_val
      best_preds = current_preds
  return best_col, split_val, best_gini, best_preds


In [159]:
def tree(node, max_depth, target):
  '''
  This function takes a node, and a max_depth of the tree.
  It gets the best split and procedes until the max_depth is achieved.
  '''
  current_df = node[0]
  #print(f'this are the columns in current_df = {current_df.columns}')
  col, split_by, gini, preds = get_best_column(current_df, target)
  #print(f'this is the col selected: {col}')
  if node[1] > max_depth or node[3] == 0:
    full_tree[-1][3] = full_tree[-1][3] + '-END'
    return full_tree 

  #Split the tree by the best value and remove this column

  if current_df[col].dtype == 'O' or current_df[col].dtype == 'bool':
    left_path = current_df[(current_df[col] == split_by)]
    right_path = current_df[(current_df[col] !=  split_by)]
  else:
    left_path = current_df[(current_df[col] <= split_by)]
    right_path = current_df[(current_df[col] > split_by)]

  full_tree.append(node[1:])
  # Crete two new paths 
  left_node = [left_path, node[1] + 1, (col, ' ==/<= ' , split_by), \
               gini, node[4] + 'L', preds[0]]
  right_node = [right_path, node[1] + 1, (col, '!=/> ', split_by), \
                gini, node[4] + 'R', preds[1]]

  #Run the tree for these nodes again
  tree(left_node, max_depth, target)
  tree(right_node, max_depth, target)
  
  return full_tree

In [160]:
def print_tree(x):
  '''
  Print tree in a nicer way (?)
  '''
  if len(x) == 0:
    return
  if x[0][0] > 0:
    print(f'Node: {x[0][0]}', x[0][0]*'--','>', x[0][1][0], x[0])
  new_tree = x[1:]
  print_tree(new_tree)
  return

## Predict Function


In [161]:
path  = []
def walk_path(tree, row, path):
  '''
  This function returns the path that each row creates as it walks down the tree
  '''
  if len(tree) == 0:
    return []

  row_dict = dict(row._asdict())
  try:
    split_feature = tree[1][1][0]
  except IndexError:
    return [] 
  current_index = tree[0][0]
  if isinstance(row_dict[split_feature], float) or \
  isinstance(row_dict[split_feature], int):
    if row_dict[split_feature] <= tree[1][1][2]:
      
      path.append('L')
    else:
      path.append('R')
  else:
    if row_dict[split_feature] == tree[1][1][2]:
      path.append('L')
    else:
      path.append('R')
  new_tree = [p for p in tree if p[3].startswith(''.join(path))]
  walk_path(new_tree, row, path)
  return path


def predict(tree, X_cols):
  '''
  This function uses the walk_path function and applies it to the whole df 
  '''
  preds = []
  for row in X_cols.itertuples():
    path = ''.join(walk_path(tree[1:], row, []))
    pred = [p[4] for p in tree if p[3] == path + "-END-END"]
    preds.append([row.Index, pred])
  return preds

In [162]:
def accuracy(y_preds, y_real):
  '''
  This function checks the accuracy of the tree
  '''
  #print(f'y_preds: {y_preds}')
  y_p = [x[1] for x in y_preds]
  y_real = np.array(y_real.tolist())
  print(f'y_real: {y_real}\ntype: {len(y_real), type(y_real)}'); print(f'y_p: {y_p}')
  return np.sum(y_p == y_real)/len(y_p)

## Iris Data Set

In [163]:
iris = pd.read_csv('iris.data', names = ['sepal-length'
                                          ,'sepal-widt'
                                          ,'petal-leng'
                                          ,'petal-width',
                                         'class'],
                   skiprows = 1)
# This transformation is neccesary for the way I did things. For some reason
# itertuples() does not read properly names with '-' in them.
iris.columns = iris.columns.str.replace('-','_')

# My Random Forest

In [165]:
full_tree = []
def RandomForest(df, target, X_test, n_bootstrap = 8, num_features = None):
  preds = []
  max_depth = 2
  for _ in range(n_bootstrap):
    # This is the bootstrap part
    df_bs = df.sample(axis = 0, frac = 0.3)
    import random
    import math 
    # This is the random features part
    if isinstance(num_features, type(None)) or num_features == 0 :
      num_features = int(math.sqrt(len(df.columns)))

    if num_features > len(df.columns) - 1:
      # If the number of features given is larger than the actual, give the
      num_features = len(df.columns) - 1

    # Select a random subset of features
    cols = list(df.columns)
    cols.remove(target)
    random_cols= random.sample(cols, k = num_features)
    sample_df = df.loc[:, random_cols + [target]]
    
    test_df = X_test.loc[:, random_cols]
    global full_tree 
    full_tree = []
    node0 = [sample_df, 0, '', 1, '', '']
    t1 = tree(node0, max_depth, target)
    sub_preds = predict(t1, test_df)

    preds.append(sub_preds)
  return preds


In [166]:
def get_highest(value):
  from collections import Counter
  dict_count = dict(Counter(value))
  try:
    return max(dict_count, key = dict_count.get)
  except ValueError:
    print('found error')
    return ''


def get_max_pred(preds):
  
  preds_dict = [*map(dict, preds)]
  final_dict = {}
  for index in preds_dict[0].keys():  
    final_dict[index] = []
    for x in range(len(preds_dict)):
      try:
        final_dict[index] += (preds_dict[x][index])      
      except KeyError:
        continue
  preds_dict = [[key, get_highest(value)] for key, value in final_dict.items()]
  return preds_dict 


In [167]:
cols = list(iris.columns)
cols.remove('class')
X_iris = iris.loc[:,cols]

X_train, X_test, y_train, y_test = train_test_split(X_iris, iris['class'], 
                                                    test_size=0.33, random_state=42)

iris_train = pd.concat([X_train, y_train], axis = 1)

random_result = RandomForest(iris_train, 'class', X_test)
random_preds = get_max_pred(random_result)

print(f"The accuracy of this Random Forest is: {accuracy(random_preds, y_test)}")


y_real: ['Iris-versicolor' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica'
 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-virginica'
 'Iris-virginica' 'Iris-virginica' 'Iris-virginica' 'Iris-virginica'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-setosa' 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor'
 'Iris-setosa' 'Iris-setosa' 'Iris-setosa' 'Iris-virginica'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-versicolor' 'Iris-virginica' 'Iris-virginica' 'Iris-versicolor'
 'Iris-virginica']
y_p: ['Iris-versicolor', 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolo

# SKlearn Random Forest

In [None]:
train = pd.read_csv('adult.data', names=['age',
              'workclass',
              'fnlwgt',
              'education',
              'education-num',
              'marital-status',
              'occupation',
              'relationship', 'race',
              'sex', 
              'capital-gain',
              'capital-loss',
              'hours-per',
              'native-country', 'y'], index_col = False).drop(['education'], axis = 1)

test = pd.read_csv('adul.test', names=['age',
              'workclass',
              'fnlwgt',
              'education',
              'education-num',
              'marital-status',
              'occupation',
              'relationship', 'race',
              'sex',
              'capital-gain',
              'capital-loss',
              'hours-per',
              'native-country', 'y'], index_col = False, skiprows = 1).\
              drop(['education'], axis = 1)

train['y'] = train['y'].apply(lambda x: x.replace('.', '').replace(' ', ''))
test['y'] = test['y'].apply(lambda x: x.replace('.', '').replace(' ', ''))

X_train = train.drop(['y'], axis = 1)
y_train = train['y']

X_test = test.drop(['y'], axis = 1)
y_test = test['y']

col_trans = make_column_transformer((OneHotEncoder(handle_unknown = 'ignore', 
                                                   drop = 'first'), 
                                    list(X_train.select_dtypes(include = 'O').columns)),
                                    (StandardScaler(), 
                                     list(X_train.select_dtypes(exclude = 'O').columns)),
                                    remainder = 'passthrough'
                                     )

tree_pipe = Pipeline(steps = [
                         ('preprocess', col_trans), 
                         ('model', RandomForestClassifier())
                         ])


parameters = {#'model__max_depth': [i + 1  for i in range(50)],
              'model__max_features': [i + 1 for i in range(len(X_train.columns))]}

clf_pipe = GridSearchCV(tree_pipe, parameters, cv = 5,
                        scoring = make_scorer(matthews_corrcoef))

clf_pipe.fit(X_train, y_train)
y_preds = clf_pipe.predict(X_test)

accuracy_score(y_test, y_preds)



0.8551071801486395