# Decision Tree Regression

In [9]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import export_graphviz
from sklearn import tree
from sklearn.metrics import accuracy_score, r2_score

## Load Dataset

In [3]:
# Read the merged dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='dataset/data_merged.csv')

# Popularity cut-off for the optional binarisation shown below. That line is
# disabled, so `popularity` keeps its raw numeric values.
mean_popularity = 50
# df["popularity"] = [ 1 if i >= mean_popularity else 0 for i in df.popularity ]

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,liveness,loudness,popularity,speechiness,...,valence_yr,popularity_yr,mode,key_0_yr,key_1_yr,key_2_yr,key_3_yr,key_4_yr,key_5_yr,key_6_yr
0,0.0131,0.256,182347,0.895,0,0.000106,0.0821,-4.86,29,0.0707,...,0.583424,35.272231,1,0,0,1,0,0,0,0
1,0.98,0.277,206972,0.145,0,0.879,0.111,-19.898,0,0.0845,...,0.432251,3.6725,1,1,0,0,0,0,0,0
2,0.795,0.685,314667,0.483,0,0.878,0.113,-10.202,1,0.0337,...,0.447291,7.707,1,0,0,0,1,0,0,0
3,0.656,0.788,179747,0.808,0,0.0,0.154,-6.59,0,0.0395,...,0.447291,7.707,1,0,0,0,1,0,0,0
4,0.302,0.0753,498560,0.15,0,0.884,0.121,-16.705,0,0.0371,...,0.443625,3.4195,1,1,0,0,0,0,0,0


## Split data into train and test sets

In [5]:
# Separate the regression target from the feature columns.
# `Unnamed: 0` appears to be the row index that was written into the CSV (it reads
# 0, 1, 2, ... in the preview). A row counter says nothing about a track, so it is
# excluded from the features together with the target column.
index_like_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
X = df.drop(columns=['popularity', *index_like_cols])
y = df['popularity']

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Baseline regression tree with default hyper-parameters.
# The model must be fitted in this cell: the cells that follow call `predict`,
# which raises NotFittedError on an estimator that has never been fitted.
# A fixed random_state keeps the fitted tree reproducible from run to run.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train, y_train)

In [8]:
# In-sample predictions of the baseline tree on the training split.
y_pred_train = tree_reg.predict(X=X_train)

## Performance evaluation

In [10]:
# R^2 on the training split (how closely the tree reproduces the data it saw).
r2_score(y_true=y_train, y_pred=y_pred_train)

0.9980183160972207

In [11]:
# Predictions of the baseline tree on the held-out test split.
y_pred = tree_reg.predict(X=X_test)

In [12]:
# R^2 on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7519372556750243

## Grid Search

In [51]:
# Candidate values for each hyper-parameter explored by the grid search.
# ('gini' / 'entropy' are classification criteria and do not apply to a
# regression tree, so no `criterion` axis is searched.)
# NOTE(review): every min_samples_leaf candidate is >= 5, so a node needs at
# least 10 samples before it can be split; min_samples_split values of 6 and 8
# therefore look redundant. Consider larger values or dropping this axis.
max_depth = [10, 15]
min_samples_split = [6, 8]
min_samples_leaf = [5, 6, 7, 8]

In [52]:
# Parameter grid: hyper-parameter name -> list of values to try.
parameters = {
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

In [53]:
# Exhaustive search over every combination in `parameters`, fitted on the
# training split only. Fold count and scoring are left at GridSearchCV's defaults.
tree_reg_gs = GridSearchCV(estimator=tree_reg, param_grid=parameters)
tree_reg_gs.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': [10, 15],
                         'min_samples_leaf': [5, 6, 7, 8],
                         'min_samples_split': [6, 8]})

In [54]:
# Report the selected hyper-parameters as stored on the best estimator.
best_tree_params = tree_reg_gs.best_estimator_.get_params()
print('Best max_depth:', best_tree_params['max_depth'])
print('Best min_samples_split:', best_tree_params['min_samples_split'])
print('Best min_samples_leaf:', best_tree_params['min_samples_leaf'])

Best max_depth: 10
Best min_samples_split: 6
Best min_samples_leaf: 7


In [56]:
# Training-split R^2 of the tuned model; overwrites the baseline's `y_pred_train`.
y_pred_train = tree_reg_gs.predict(X=X_train)
r2_score(y_true=y_train, y_pred=y_pred_train)

0.8698632825417648

In [58]:
# Test-split R^2 of the tuned model; overwrites the baseline's `y_pred`.
y_pred = tree_reg_gs.predict(X=X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8562964052050465