In [1]:
import warnings
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import mean_squared_error

# Looking at our Dataset

In [48]:
df = pd.read_excel('data/final_dataset.xlsx')
df = df[df['Contract Years Left']!='fail']
df['Contract Years Left'] = df['Contract Years Left'].apply(lambda x:int(x))

In [119]:
df

Unnamed: 0,Player,Club,Age,Position,Nation,Value,Contract Years Left,League,Squad (20/21),MP (20/21),...,Offsides (17/18),Crosses (17/18),Interceptions (17/18),Penalty Kicks Won (17/18),Penalties Conceded (17/18),Own Goals (17/18),Total Loose Balls Recovered (17/18),Aerial Duel Won (17/18),Aerial Duel Lost (17/18),% Aerial Duels Won (17/18)
0,Kylian Mbappe,Paris Saint-Germain,22,attack,France,144000000,1,Ligue 1,Paris S-G,31.0,...,23.0,62.0,1.0,1.0,0.0,0.0,98.0,1.0,4.0,20.0
1,Erling Haaland,Borussia Dortmund,21,attack,Norway,117000000,3,Bundesliga,Dortmund,28.0,...,,,,,,,,,,
2,Harry Kane,Tottenham Hotspur,28,attack,England,108000000,3,Premier League,Tottenham,35.0,...,43.0,24.0,7.0,1.0,0.0,0.0,124.0,69.0,111.0,38.3
3,Jadon Sancho,Manchester United,21,attack,England,90000000,5,Premier League,Dortmund,26.0,...,1.0,15.0,6.0,0.0,0.0,0.0,57.0,3.0,14.0,17.6
4,Mohamed Salah,Liverpool FC,29,attack,Egypt,90000000,2,Premier League,Liverpool,37.0,...,18.0,50.0,13.0,1.0,0.0,0.0,219.0,19.0,58.0,24.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2070,Matis Carvalho,Montpellier HSC,22,Goalkeeper,Portugal,180000,1,Ligue 1,,,...,,,,,,,,,,
2071,Lorenzo Andrenacci,Genoa CFC,26,Goalkeeper,Italy,135000,3,Serie A,,,...,,,,,,,,,,
2072,Mamadou Doucoure,Borussia Mönchengladbach,23,Defender,France,90000,3,Bundesliga,,,...,,,,,,,,,,
2073,Michael Langer,FC Schalke 04,36,Goalkeeper,Austria,90000,1,Bundesliga,Schalke 04,3.0,...,,,,,,,,,,


In [None]:
club_dum = pd.get_dummies(df['Club'])
age_dum = pd.get_dummies(df['Age'])
pos_dum = pd.get_dummies(df['Position'])
nat_dum = pd.get_dummies(df['Nation'])
ctr_dum = pd.get_dummies(df['Contract Years Left'])
league_dum = pd.get_dummies(df['League'])

df_fbref = df.drop(['Club','Age','Position','Nation','Contract Years Left','League'],axis=1)

df_dum = pd.concat([df_fbref,club_dum,age_dum,pos_dum,nat_dum,ctr_dum,league_dum],axis=1)

# Baseline Linear Regression Model

In [None]:
def baseline_linear_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        lr = LinearRegression()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        lr.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(lr, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=20)
        cross_val_test = cross_val_score(lr, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=20)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [None]:
baseline_linear_regression(df)

# Lasso Regression Model

In [None]:
def lasso_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:
        
        warnings.filterwarnings('ignore')
        
        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        l1 = Lasso()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        l1.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(l1, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=20)
        cross_val_test = cross_val_score(l1, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=20)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [None]:
lasso_regression(df)

# Ridge Regression Model

In [None]:
def ridge_regression(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:
        
        warnings.filterwarnings('ignore')

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:11].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        l2 = Ridge()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        l2.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(l2, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=20)
        cross_val_test = cross_val_score(l2, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=20)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [None]:
ridge_regression(df)

In [None]:
def decision_tree(df):

    attack = df[df['Position']=='attack']
    midfield = df[df['Position']=='midfield']
    defence = df[df['Position']=='Defender']

    positions =  [attack,midfield,defence]

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []

    for position in positions:
        
        warnings.filterwarnings('ignore')

        top_features = [a for a in position.corr()['Value'].sort_values(ascending=False)[:41].keys()]

        #Using top features identified earlier
        model_df = position[top_features]
        model_df = model_df.dropna()

        X = model_df.drop('Value',axis=1)
        y = model_df['Value']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

        ss= StandardScaler()
        DtReg = DecisionTreeRegressor()

        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)

        DtReg.fit(X_train_scaled,y_train);

        cross_val_train = cross_val_score(DtReg, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=20)
        cross_val_test = cross_val_score(DtReg, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=20)
        
        #Appending min to Max list and max to Min list because they are taken as negative values
        scores_train.append(-(cross_val_train.mean()))
        #scores_train_std.append(cross_val_train.std())
        scores_train_min.append(-(cross_val_train).max())
        scores_train_max.append(-(cross_val_train).min())
        
        scores_test.append(-(cross_val_test.mean()))
        #scores_test_std.append(cross_val_test.std())
        scores_test_min.append(-(cross_val_test).max())
        scores_test_max.append(-(cross_val_test).min())
        
    
    print("Attackers Train:")
    print(f'Attackers Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Attackers Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Attackers Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Attackers Test:")
    print(f'Attackers Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Attackers Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Attackers Test Min RMSE = ${round(scores_test_min[0],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Midfielders Train:")
    print(f'Midfielders Train Mean RMSE = ${round(scores_train[1],2)}')
    #print(f'Midfielders Train RMSE Std = ${round(scores_train_std[1],2)}')
    print(f'Midfielders Train Max RMSE = ${round(scores_train_max[1],2)}')
    print(f'Midfielders Train Min RMSE = ${round(scores_train_min[1],2)}')
    print("")
    print("Midfielders Test:")
    print(f'Midfielders Test Mean RMSE = ${round(scores_test[1],2)}')
    #print(f'Midfielders Test RMSE Std = ${round(scores_test_std[1],2)}')
    print(f'Midfielders Test Max RMSE = ${round(scores_test_max[1],2)}')
    print(f'Midfielders Test Min RMSE = ${round(scores_test_min[1],2)}')
    print("----------------------------------------")
    print("----------------------------------------")
    print("Defenders Train:")
    print(f'Defenders Train Mean RMSE = ${round(scores_train[2],2)}')
   # print(f'Defenders Train RMSE Std = ${round(scores_train_std[2],2)}')
    print(f'Defenders Train Max RMSE = ${round(scores_train_max[2],2)}')
    print(f'Defenders Train Min RMSE = ${round(scores_train_min[2],2)}')
    print("")
    print("Defenders Test:")
    print(f'Defenders Test Mean RMSE = ${round(scores_test[2],2)}')
    #print(f'Defenders Test RMSE Std = ${round(scores_test_std[2],2)}')
    print(f'Defenders Test Max RMSE = ${round(scores_test_max[2],2)}')
    print(f'Defenders Test Min RMSE = ${round(scores_test_min[2],2)}')

In [None]:
decision_tree(df)

In [None]:
def decision_tree(df):

    scores_train = []
    #scores_train_std = []
    scores_train_max = []
    scores_train_min = []
    scores_test = []
    #scores_test_std = []
    scores_test_max = []
    scores_test_min = []


        
    warnings.filterwarnings('ignore')

    #top_features = [a for a in df.corr()['Value'].sort_values(ascending=False)[:41].keys()]

    #Using top features identified earlier
    
    model_df = df.dropna()

    X = model_df.drop('Value',axis=1)
    y = model_df['Value']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

    ss= StandardScaler()
    DtReg = DecisionTreeRegressor()

    X_train_num = X_train.select_dtypes(exclude='object')
    X_train_
    
    X_train_scaled = ss.fit_transform(X_train)
    X_test_scaled = ss.transform(X_test)

    DtReg.fit(X_train_scaled,y_train);

    cross_val_train = cross_val_score(DtReg, X_train_scaled, y_train, scoring="neg_root_mean_squared_error",cv=20)
    cross_val_test = cross_val_score(DtReg, X_test_scaled, y_test, scoring="neg_root_mean_squared_error",cv=20)

    #Appending min to Max list and max to Min list because they are taken as negative values
    scores_train.append(-(cross_val_train.mean()))
    #scores_train_std.append(cross_val_train.std())
    scores_train_min.append(-(cross_val_train).max())
    scores_train_max.append(-(cross_val_train).min())

    scores_test.append(-(cross_val_test.mean()))
    #scores_test_std.append(cross_val_test.std())
    scores_test_min.append(-(cross_val_test).max())
    scores_test_max.append(-(cross_val_test).min())


    print("Train:")
    print(f'Train Mean RMSE = ${round(scores_train[0],2)}')
    #print(f'Attackers Train RMSE Std = ${round(scores_train_std[0],2)}')
    print(f'Train Max RMSE = ${round(scores_train_max[0],2)}')
    print(f'Train Min RMSE = ${round(scores_train_min[0],2)}')
    print("")
    print("Test:")
    print(f'Test Mean RMSE = ${round(scores_test[0],2)}')
    #print(f'Attackers Test RMSE Std = ${round(scores_test_std[0],2)}')
    print(f'Test Max RMSE = ${round(scores_test_max[0],2)}')
    print(f'Test Min RMSE = ${round(scores_test_min[0],2)}')

In [116]:
#Creating a dummy dataframe for categorical variable columns
dummy = []
for a in [a for a in df.select_dtypes(include='object').drop(['Player',
                                                              'Squad (20/21)',
                                                              'Squad (19/20)',
                                                              'Squad (18/19)',
                                                              'Squad (17/18)'],axis=1).columns]:
    dummy.append(pd.get_dummies(df[a]))

dummy_df = pd.concat([a for a in dummy],axis=1)

#Removing Categorical Variable Columns
df_num = df.select_dtypes(exclude='object')

#Combining Continuous Variable Columns with Dummy Categorical Variable Columns
df_final = (pd.concat([df_num,dummy_df],axis=1))

In [104]:
#Separating target variable - 'Value'
X = df_final.drop('Value',axis=1)
y = df_final['Value']

In [105]:
#Train-Test Split

X_train1, X_test1, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [106]:
#Removing the categorical dummy columns because we don't want to pass them throuh StandardScaler.

X_train_cat = X_train1.iloc[:, -199:]
X_train_cat.reset_index(drop=True, inplace=True)

X_test_cat = X_test1.iloc[:, -199:]
X_test_cat.reset_index(drop=True, inplace=True)


In [107]:
#Only including numerical columns to pass through StandardScaler

X_train_num = X_train1.iloc[:, :538]
X_train_num.reset_index(drop=True, inplace=True)

X_test_num = X_test1.iloc[:, :538]
X_test_num.reset_index(drop=True, inplace=True)


In [108]:
#Initiating Standard Scaler
ss= StandardScaler()

In [109]:
#Normalizing numerical columns

X_train_num_scaled = pd.DataFrame(ss.fit_transform(X_train_num),columns = X_train_num.columns)
X_train_num_scaled.reset_index(drop=True, inplace=True)

X_test_num_scaled = pd.DataFrame(ss.transform(X_test_num),columns = X_test_num.columns)
X_test_num_scaled.reset_index(drop=True, inplace=True)



In [110]:
#Combining Scaled Numerical Columns with Categorical Dummy Columns

X_train_scaled = pd.concat([X_train_num_scaled,X_train_cat],axis=1)
X_test_scaled = pd.concat([X_test_num_scaled,X_test_cat],axis=1)

In [112]:
DtReg = DecisionTreeRegressor()

In [113]:
DtReg.fit(X_train_scaled,y_train);

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').