In [1]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 


class NodeRegression():
    """
    Class to grow a regression decision tree
    """
    def __init__(
        self, 
        Y: list,
        X: pd.DataFrame,
        min_samples_split=None,
        max_depth=None,
        depth=None,
        node_type=None,
        rule=None
    ):
    
        self.Y = Y 
        self.X = X

        self.min_samples_split = min_samples_split if min_samples_split else 20
        self.max_depth = max_depth if max_depth else 5

        self.depth = depth if depth else 0

        self.features = list(self.X.columns)

    
        self.node_type = node_type if node_type else 'root'

        self.rule = rule if rule else "" 

    
        self.ymean = np.mean(Y)

        # Getting the residuals 
        self.residuals = self.Y - self.ymean

        # Calculating the mse of the node 
        self.mse = self.get_mse(Y, self.ymean)

        # Saving the number of observations in the node 
        self.n = len(Y)

        # Initiating the left and right nodes as empty nodes
        self.left = None 
        self.right = None 

        # Default values for splits
        self.best_feature = None 
        self.best_value = None 

        
    def get_mse(self, y_true, y_pred) -> float:
        """
        Method to calculate the mean squared error 
        """
        return np.mean((y_true - y_pred) ** 2)
    
    # X = [1,2,3,4]
    # window  = 2
    #out = [1.5,2.5,3.5]

    def ma(self, x: np.array, window: int) -> np.array:
        """
        Calculates the moving average of the given list. 
        """
        return np.convolve(x, np.ones(window), 'valid') / window

    def best_split(self) -> tuple:
        """
        Given the X features and Y targets calculates the best split 
        for a decision tree
        """
        # Creating a dataset for spliting
        df = self.X.copy()
        df['Y'] = self.Y

        # Getting the GINI impurity for the base input 
        mse_base = self.mse

        # Finding which split yields the best GINI gain 
        #max_gain = 0

        # Default best feature and split
        best_feature = None
        best_value = None

        for feature in self.features:
            # Droping missing values
            Xdf = df.dropna().sort_values(feature)

            # Sorting the values and getting the rolling average
            xmeans = self.ma(Xdf[feature].unique(), 2)

            for value in xmeans:
                # Getting the left and right ys 
                left_y = Xdf[Xdf[feature]<value]['Y'].values
                right_y = Xdf[Xdf[feature]>=value]['Y'].values

                # Getting the means 
                left_mean = np.mean(left_y)
                right_mean = np.mean(right_y)

                # Getting the left and right residuals 
                res_left = left_y - left_mean 
                res_right = right_y - right_mean

                # Concatenating the residuals 
                r = np.concatenate((res_left, res_right), axis=None)

                # Calculating the mse 
                n = len(r)
                r = r ** 2
                r = np.sum(r)
                mse_split = r / n

                # Checking if this is the best split so far 
                if mse_split < mse_base:
                    best_feature = feature
                    best_value = value 

                    # Setting the best gain to the current one 
                    mse_base = mse_split

        return (best_feature, best_value)
    
    
    # Growing tree recursively
    def grow_tree(self):
        """
        Recursive method to create the decision tree
        """
        # Making a df from the data 
        df = self.X.copy()
        df['Y'] = self.Y

        # If there is GINI to be gained, we split further 
        if (self.depth < self.max_depth) and (self.n >= self.min_samples_split):

            # Getting the best split 
            best_feature, best_value = self.best_split()

            if best_feature is not None:
                # Saving the best split to the current node 
                self.best_feature = best_feature
                self.best_value = best_value

                # Getting the left and right nodes
                left_df, right_df = df[df[best_feature]<=best_value].copy(), df[df[best_feature]>best_value].copy()

                # Creating the left and right nodes
                left = NodeRegression(
                    left_df['Y'].values.tolist(), 
                    left_df[self.features], 
                    depth=self.depth + 1, 
                    max_depth=self.max_depth, 
                    min_samples_split=self.min_samples_split, 
                    node_type='left_node',
                    rule=f"{best_feature} <= {round(best_value, 3)}"
                    )

                self.left = left 
                self.left.grow_tree()

                right = NodeRegression(
                    right_df['Y'].values.tolist(), 
                    right_df[self.features], 
                    depth=self.depth + 1, 
                    max_depth=self.max_depth, 
                    min_samples_split=self.min_samples_split,
                    node_type='right_node',
                    rule=f"{best_feature} > {round(best_value, 3)}"
                    )

                self.right = right
                self.right.grow_tree()

    def print_info(self, width=4):
        """
        Method to print the infromation about the tree
        """
        # Defining the number of spaces 
        const = int(self.depth * width ** 1.5)
        spaces = "-" * const
        
        if self.node_type == 'root':
            print("Root")
        else:
            print(f"|{spaces} Split rule: {self.rule}")
        print(f"{' ' * const}   | MSE of the node: {round(self.mse, 2)}")
        print(f"{' ' * const}   | Count of observations in node: {self.n}")
        print(f"{' ' * const}   | Prediction of node: {round(self.ymean, 3)}")   

    
    # Displaying the tree structure
    def print_tree(self):
        """
        Prints the whole tree from the current node to the bottom
        """
        self.print_info() 
        
        if self.left is not None: 
            self.left.print_tree()
        
        if self.right is not None:
            self.right.print_tree()



In [2]:
# Data Train test split with shuffle as true

def shuffle_data(X, y, seed=None):
    """ Random shuffle of the samples in X and y """
    if seed:
        np.random.seed(seed)
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    try:
        return X[idx], y[idx]
    except:
        return X.iloc[idx], y.iloc[idx]
    
    
def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None):
    """ Split the data into train and test sets """
    if shuffle:
        X, y = shuffle_data(X, y, seed)
    # Split the training data from test data in the ratio specified in
    # test_size
    split_i = len(y) - int(len(y) // (1 / test_size))
    X_train, X_test = X[:split_i], X[split_i:]
    y_train, y_test = y[:split_i], y[split_i:]

    return X_train, X_test, y_train, y_test

In [10]:
data_path = "EPL_Soccer_MLR_LR.csv"

df = pd.read_csv(data_path)

df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)


new_df = df.select_dtypes(['number'])



In [11]:

new_df = df.select_dtypes(['number'])

In [26]:

X = new_df.iloc[:,:-1]
y = new_df.iloc[:,-1]

In [27]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, seed=42)

In [28]:
#Calculation of Correlated Matrix
correlated_features = set()
correlation_matrix = X.corr()

In [29]:

for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
            
correlated_features

{'Height', 'MinutestoGoalRatio', 'ShotsPerGame', 'Weight'}

In [30]:
# Dropping correlated features
X_train.drop(columns=correlated_features, axis=1, inplace=True)
X_test.drop(columns=correlated_features, axis=1, inplace=True)
X.drop(columns=correlated_features, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [31]:
# Creation of root node
root = NodeRegression(y_train, X_train, max_depth=2, min_samples_split=3)

In [32]:
root

<__main__.NodeRegression at 0x1ea0c2d9ee0>

In [33]:
# growing the tree recursively
root.grow_tree()

In [34]:

root.print_tree()

Root
   | MSE of the node: 35.87
   | Count of observations in node: 162
   | Prediction of node: 13.587
|-------- Split rule: Cost <= 68.05
           | MSE of the node: 5.33
           | Count of observations in node: 93
           | Prediction of node: 9.187
|---------------- Split rule: Cost <= 44.65
                   | MSE of the node: 1.8
                   | Count of observations in node: 44
                   | Prediction of node: 7.431
|---------------- Split rule: Cost > 44.65
                   | MSE of the node: 3.24
                   | Count of observations in node: 49
                   | Prediction of node: 10.764
|-------- Split rule: Cost > 68.05
           | MSE of the node: 15.78
           | Count of observations in node: 69
           | Prediction of node: 19.516
|---------------- Split rule: Cost <= 109.3
                   | MSE of the node: 6.13
                   | Count of observations in node: 50
                   | Prediction of node: 17.8
|--------------

In [35]:
data_path = "./EPL_Soccer_MLR_LR.csv"

df = pd.read_csv(data_path)

df.dropna(axis=0, how='all', thresh=None, subset=None, inplace=True)

#dropping categorical columns
new_df = df.select_dtypes(['number'])

# The last column (Score) is our dependent variable
X = new_df.iloc[:,:-1]
y = new_df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, seed=42)

correlated_features = set()
correlation_matrix = X.corr()


for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
            
X_train.drop(columns=correlated_features, axis=1, inplace=True)
X_test.drop(columns=correlated_features, axis=1, inplace=True)
X.drop(columns=correlated_features, axis=1, inplace=True)

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
#X, y = load_diabetes(return_X_y=True)
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, X, y, cv=5)

array([0.46196735, 0.74246175, 0.87827764, 0.73471287, 0.82410885])

In [16]:
acc = [0.46196735, 0.74246175, 0.87827764, 0.73471287, 0.82410885]

print("Accuracy of model : ", np.mean(acc))

Accuracy of model :  0.728305692


In [37]:
X_train

Unnamed: 0,DistanceCovered(InKms),Goals,AgentCharges,BMI,Cost,PreviousClubCost
95,4.66,6.4,109.0,18.37,38.2,41.93
15,4.36,5.8,29.0,21.86,99.9,56.52
30,4.87,6.4,64.0,20.17,99.8,52.72
159,5.48,4.6,132.0,32.52,55.7,102.00
186,5.29,12.7,124.0,23.38,75.9,74.00
...,...,...,...,...,...,...
173,5.59,7.9,220.0,23.55,41.9,63.00
131,5.00,6.7,72.0,24.64,49.6,79.00
17,4.51,8.3,34.0,21.27,69.9,56.31
72,4.77,7.1,40.0,26.85,103.6,66.85


In [18]:
# cost < 44.65, 45 nodes, 7.39 score

In [38]:
fil_df = new_df[new_df["Cost"]<=44.65]

In [39]:
fil_df.shape


(54, 11)

In [40]:
np.mean(fil_df["Score"])

7.292777777777779