## RANDOM FORESTS using only NumPy and Pandas

In [135]:
import numpy as np
import pandas as pd

In [136]:

import numpy as np
import pandas as pd
import random

class Node:
    def __init__(self, feature=None, impurity=None, thresh=None, left_node=None, right_node=None, value=None, isLeaf=False):
        self.feature = feature
        self.impurity = impurity
        self.thresh = thresh
        self.left_node = left_node
        self.right_node = right_node
        self.value = value
        self.isLeaf = isLeaf


class DecisionTree:
    def __init__(self, max_depth=3, min_rows_split=3):
        self.min_rows_split = min_rows_split
        self.max_depth = max_depth
        self.root = None
    
    def fit(self, X, y):
        df = pd.concat([X, y], axis = 1)
        self.root = self.make_tree(0, df)
    
    def make_tree(self, depth, df):
        rows = len(df.iloc[:,:-1])
        features = list(df.iloc[:,:-1].columns)
        if self.max_depth > depth and rows > self.min_rows_split:
            best_split = self.optimal_feature(df)
            if best_split['gini'] > 0:
                l_tree = self.make_tree(depth+1, best_split['left_split'])
                r_tree = self.make_tree(depth+1, best_split['right_split'])
                return Node(best_split['feat'], best_split['gini'], best_split['thresh'], l_tree, r_tree)
        print('df:\n', df,'\n')
        final_value = self.final_label(df.iloc[:,-1])
        return Node(value=final_value, isLeaf=True)


    def optimal_feature(self, df):
        X = df.iloc[:,:-1]
        y = df.iloc[:,-1]
        features = list(X.columns)
        total_rows = len(X)
        min_gini = float("inf")
        for feature in features:
            values = df[feature].unique()
            for val in values:
                l_condition = df[feature] <= val
                r_condition = df[feature] > val
                l_df = df[l_condition]
                r_df = df[r_condition]
                gini = self.weighted_gini_impurity(l_df, r_df)
                if gini < min_gini:
                    min_gini = gini
                    result = {
                        'feat': feature,
                        'thresh': val,
                        'left_split': l_df,
                        'right_split': r_df,
                        'gini': gini
                    }
        return result

    def weighted_gini_impurity(self, l, r):
        weight_r = len(r)/(len(l)+len(r))
        weight_l = 1 - weight_r
        return (weight_r*(self.gini_impurity(r)) + weight_l*(self.gini_impurity(l)))


    def gini_impurity(self, subset):
        target  = subset.iloc[:,-1]
        sub_label_distr = dict(target.value_counts())
        total = len(target)
        sum_squares = 0
        for label in sub_label_distr.keys():
            sum_squares += (sub_label_distr[label]/total)**2
        impurity = 1 - sum_squares
        return impurity
    
    def final_label(self, col):
        return (dict(map(reversed, dict(col.value_counts()).items()))[max(dict(col.value_counts()).values())])

    def predict_sub(self, input, tree):
        if tree.isLeaf:
            return tree.value
        else:
            limit = tree.thresh
            feat = tree.feature
            if input[feat]<=limit:
                return self.predict_sub(input, tree.left_node)
            else:
                return self.predict_sub(input, tree.right_node)
    
    def predict(self, inputs):
        preds = []
        for i in range(len(inputs)):
            preds.append(self.predict_sub(inputs.iloc[i], self.root))
        return preds

class RandomForest:
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.forest = None
        self.prediction_labels = None

    def create_forest(self, df):
        datasets = self.get_all_bootstraps(df)
        trees = []
        for dataset in datasets:
            tree = DecisionTree()
            tree.fit(dataset.iloc[:,:-1], dataset.iloc[:, -1])
            trees.append(tree)
        return trees

    def fit(self, X, y):
        df = pd.concat([X, y], axis = 1)
        self.prediction_labels = list(y.unique())
        self.forest = self.create_forest(df)

    def bootstrap_dataset(self, df):
        boot_indices = []
        for i in range(len(df)):
            boot_indices.append(random.randint(0, len(df) - 1))
        boot_df = pd.DataFrame()
        for index in boot_indices:
            boot_df = pd.concat([boot_df, df.iloc[index]], axis = 1)  
        boot_df = boot_df.transpose()
        return boot_df

    def predict_matrix(self, inputs):
        predictions_df = pd.DataFrame()
        for tree in self.forest:
            preds_by_tree = tree.predict(inputs)
            predictions_df = pd.concat([predictions_df, pd.Series(preds_by_tree)], axis = 1)
        return predictions_df.transpose()

    def predict(self, inputs):
        matrix = self.predict_matrix(inputs)
        final_preds = []
        for tree in matrix.columns:
            final_preds.append(self.max_dict(dict((matrix[tree].value_counts()))))
        return final_preds

    def max_dict(self, d):
        rev = dict(map(reversed, d.items()))
        return rev[max(list(d.values()))]

    def get_all_bootstraps(self, df):
        bootstraps = []
        features = list(df.iloc[:,:-1].columns)
        for i in range(self.n_estimators):
            bootstraps.append({
                'df': self.bootstrap_dataset(df),
                'features': list(np.random.choice(features, size = int(np.ceil(np.sqrt(len(features)))), replace=False))+[df.columns[-1]]
            })
        bootstrapped_datasets = []
        for bootstrap in bootstraps:
            data_frame = bootstrap['df']
            feats = bootstrap['features']
            bootstrapped_datasets.append(data_frame[feats])
        return bootstrapped_datasets

In [137]:
df = pd.read_csv("/kaggle/input/music-genres/genre_df.csv", sep = '\t');
df.drop(["Unnamed: 0"], inplace = True, axis = 1)

In [138]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      1339 non-null   float64
 1   energy            1339 non-null   float64
 2   key               1339 non-null   int64  
 3   loudness          1339 non-null   float64
 4   mode              1339 non-null   int64  
 5   speechiness       1339 non-null   float64
 6   acousticness      1339 non-null   float64
 7   instrumentalness  1339 non-null   float64
 8   liveness          1339 non-null   float64
 9   valence           1339 non-null   float64
 10  tempo             1339 non-null   float64
 11  duration_ms       1339 non-null   int64  
 12  time_signature    1339 non-null   int64  
 13  track_name        1339 non-null   object 
 14  artist            1339 non-null   object 
 15  genre             1339 non-null   object 
dtypes: float64(9), int64(4), object(3)
memory 

In [139]:
df.drop(["artist", "track_name", "liveness", "key", "time_signature", "duration_ms"], axis = 1, inplace= True)

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1339 entries, 0 to 1338
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   danceability      1339 non-null   float64
 1   energy            1339 non-null   float64
 2   loudness          1339 non-null   float64
 3   mode              1339 non-null   int64  
 4   speechiness       1339 non-null   float64
 5   acousticness      1339 non-null   float64
 6   instrumentalness  1339 non-null   float64
 7   valence           1339 non-null   float64
 8   tempo             1339 non-null   float64
 9   genre             1339 non-null   object 
dtypes: float64(8), int64(1), object(1)
memory usage: 104.7+ KB


In [141]:
from sklearn.metrics import classification_report

In [142]:
forest = RandomForest(n_estimators=5)

In [143]:
from sklearn.model_selection import train_test_split

X = df.drop(['genre'], axis = 1)
y = df['genre']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [144]:
forest.fit(X_train, y_train)

df:
      energy acousticness valence genre
189   0.751      0.00003    0.13   POP
1058  0.575      0.00294   0.498  LOFI
1147  0.731      0.00782   0.782  ROCK
1062    0.3      0.00407   0.283  LOFI
1143  0.675      0.00185   0.526  ROCK
...     ...          ...     ...   ...
1143  0.675      0.00185   0.526  ROCK
1147  0.731      0.00782   0.782  ROCK
1070   0.79     0.000004    0.22  ROCK
1138  0.722       0.0069    0.92  ROCK
1062    0.3      0.00407   0.283  LOFI

[65 rows x 4 columns] 

df:
      energy acousticness valence genre
1082  0.825     0.000071    0.56  ROCK
1195  0.912     0.000181   0.636  ROCK
1083  0.872     0.000007   0.449  ROCK
1182  0.965     0.000581   0.106  ROCK
1277  0.995     0.000104   0.159  ROCK
...     ...          ...     ...   ...
1165  0.974     0.000017   0.556  ROCK
1180  0.981      0.00129   0.689  ROCK
1330  0.965     0.000152   0.526  ROCK
1185  0.977      0.00274    0.47  ROCK
1260   0.87      0.00791   0.409  ROCK

[127 rows x 4 columns] 

df:

In [147]:
preds = forest.predict(X_test)

In [148]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

   CLASSICAL       0.86      1.00      0.92        66
        LOFI       0.83      0.78      0.80        63
         POP       0.73      0.18      0.29        61
         RAP       0.66      0.82      0.73        76
        ROCK       0.69      0.90      0.78        69

    accuracy                           0.75       335
   macro avg       0.75      0.73      0.71       335
weighted avg       0.75      0.75      0.71       335

[[66  0  0  0  0]
 [11 49  0  1  2]
 [ 0  4 11 25 21]
 [ 0  5  4 62  5]
 [ 0  1  0  6 62]]
