# 4.8 Assignment 4: Decision Trees and Random Forest
This dataset has 93 entries of various blues guitarists born between 1874 and 1940. Apart from the name of the guitarists, that dataset contains the following four features:<p>
 - Regions: 1 means East, 2 means Delta, 3 means Texas
 - Years: 0 for those born before 1906, 1 for the rest
 - Hand postures: 1= Extended, 2= Stacked, 3=Lutiform
 - Thumb styles: Between 1 and 3, 1=Alternating, 2=Utility, 3=Dead

Step 1: Using decision tree on this dataset, how accurately you can tell their birth year from their hand postures and thumb styles. How does it affect the evaluation when you include the region while training the model?<p>

Step 2: Now do the same using random forest (in both of the above cases) and report the difference. Make sure to use appropriate training-testing parameters for your evaluation.<p>

You should also run the algorithms multiple times, measure various accuracies, and report the average (and perhaps the range).

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [4]:
df = pd.read_csv('Assignment 4-blues_hand.csv')
print(df.columns)
display(df.head(93))

Index(['name', 'state', 'brthYr', 'post1906', 'region', 'handPost',
       'thumbSty'],
      dtype='object')


Unnamed: 0,name,state,brthYr,post1906,region,handPost,thumbSty
0,Henry Thomas,TX,1874,0,3,1,3
1,Frank Stokes,TN,1887,0,2,1,3
2,Sam Collins,MS,1887,0,2,1,2
3,Peg Leg Howell,GA,1888,0,1,2,2
4,Huddie Ledbetter,TX,1888,0,3,2,3
...,...,...,...,...,...,...,...
88,Jimmie Lee Harris,AL,1935,1,2,2,3
89,Snooks Eaglin,LA,1936,1,2,1,2
90,Larry Johnson,GA,1938,1,1,1,1
91,Tom Winslow,NC,1938,1,1,1,1


In [5]:
df.describe()

Unnamed: 0,brthYr,post1906,region,handPost,thumbSty
count,93.0,93.0,93.0,93.0,93.0
mean,1908.903226,0.548387,1.741935,1.580645,2.043011
std,13.44802,0.500351,0.657783,0.712048,0.832936
min,1874.0,0.0,1.0,1.0,1.0
25%,1898.0,0.0,1.0,1.0,1.0
50%,1908.0,1.0,2.0,1.0,2.0
75%,1917.0,1.0,2.0,2.0,3.0
max,1940.0,1.0,3.0,3.0,3.0


In [6]:
def count_nan(df: pd.DataFrame) -> pd.Series:
    """Counts the number of NaN (Not a Number) values in each column of a Pandas DataFrame.
    """
    if df.empty:
        return pd.Series(dtype=int)  # Return an empty Series if df is empty
    nan_counts = df.isnull().sum()
    return nan_counts
count_nan(df)

name        0
state       0
brthYr      0
post1906    0
region      0
handPost    0
thumbSty    0
dtype: int64

# Step 1 - Decision Tree
a. How accurately you can tell their birth year from their hand postures and thumb styles.<p>
b. How does it affect the evaluation when you include the region while training the model?

In [34]:
# Prepare the data
X = df[['handPost', 'thumbSty']]  # Predictor variables
y = df['brthYr']  # Target variable

print(f"Decision Tree Regressor with two features {list(X.columns)}")
# Split the data into training and testing sets
max_r2 = -2
for i in range(1,99):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    clf = DecisionTreeRegressor()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.2f} R-squared: {r2:.2f} random state: {i}")
    if r2 > max_r2:
        max_r2 = r2
print(max_r2)

Decision Tree Regressor with two features ['handPost', 'thumbSty']
Mean Squared Error: 168.56 R-squared: -0.05 random state: 1
Mean Squared Error: 198.16 R-squared: 0.07 random state: 2
Mean Squared Error: 186.71 R-squared: -0.02 random state: 3
Mean Squared Error: 161.24 R-squared: -0.24 random state: 4
Mean Squared Error: 189.41 R-squared: -0.83 random state: 5
Mean Squared Error: 193.63 R-squared: -0.03 random state: 6
Mean Squared Error: 181.42 R-squared: -0.01 random state: 7
Mean Squared Error: 208.26 R-squared: -0.03 random state: 8
Mean Squared Error: 173.71 R-squared: -0.78 random state: 9
Mean Squared Error: 144.31 R-squared: 0.06 random state: 10
Mean Squared Error: 133.45 R-squared: 0.07 random state: 11
Mean Squared Error: 168.37 R-squared: -0.39 random state: 12
Mean Squared Error: 278.65 R-squared: -0.42 random state: 13
Mean Squared Error: 182.11 R-squared: -0.24 random state: 14
Mean Squared Error: 213.46 R-squared: 0.05 random state: 15
Mean Squared Error: 155.33 R-sq

In [35]:
from sklearn.model_selection import cross_val_score

model = DecisionTreeRegressor(random_state=30)

# 5-fold cross-validation
scores = cross_val_score(model, X, y, scoring='r2', cv=5)

print(f"Cross-validated R2 scores: {scores}")
print(f"Average R2: {scores.mean():.2f}")

Cross-validated R2 scores: [-16.73381256 -12.80436267  -5.34569391 -27.50061609 -16.57086599]
Average R2: -15.79


In [31]:
# One Hot Encoding
columns_to_encode = ['region', 'handPost', 'thumbSty']
df_encoded = pd.get_dummies(df[columns_to_encode], columns=columns_to_encode, prefix=columns_to_encode)

X = df_encoded
y = df['brthYr']

# 1. Train & Predict on the SAME DATA (bad idea)
model_same = DecisionTreeRegressor(random_state=42)
model_same.fit(X, y)
y_pred_same = model_same.predict(X)

mse_same = mean_squared_error(y, y_pred_same)
r2_same = r2_score(y, y_pred_same)

print("⚠️ Train & Predict on Same Data:")
print(f"  MSE: {mse_same:.2f}")
print(f"  R2: {r2_same:.2f}")

# 2. Train/Test Split (proper way)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model_split = DecisionTreeRegressor(random_state=42)
model_split.fit(X_train, y_train)
y_pred_split = model_split.predict(X_test)

mse_split = mean_squared_error(y_test, y_pred_split)
r2_split = r2_score(y_test, y_pred_split)

print("\n✅ Train/Test Split:")
print(f"  MSE: {mse_split:.2f}")
print(f"  R2: {r2_split:.2f}")


⚠️ Train & Predict on Same Data:
  MSE: 130.13
  R2: 0.27

✅ Train/Test Split:
  MSE: 233.30
  R2: -0.39


In [33]:
# Prepare the data
X = df[['handPost', 'thumbSty', 'region']]  # Predictor variables
y = df['brthYr']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f"Decision Tree Regressor with three features {list(X.columns)}")
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Decision Tree Regressor with three features ['handPost', 'thumbSty', 'region']
Mean Squared Error: 275.01
R-squared: -0.41


In [12]:
df_encoded

Unnamed: 0,name,state,brthYr,post1906,region_1,region_2,region_3,handPost_1,handPost_2,handPost_3,thumbSty_1,thumbSty_2,thumbSty_3
0,Henry Thomas,TX,1874,0,False,False,True,True,False,False,False,False,True
1,Frank Stokes,TN,1887,0,False,True,False,True,False,False,False,False,True
2,Sam Collins,MS,1887,0,False,True,False,True,False,False,False,True,False
3,Peg Leg Howell,GA,1888,0,True,False,False,False,True,False,False,True,False
4,Huddie Ledbetter,TX,1888,0,False,False,True,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,Jimmie Lee Harris,AL,1935,1,False,True,False,False,True,False,False,False,True
89,Snooks Eaglin,LA,1936,1,False,True,False,True,False,False,False,True,False
90,Larry Johnson,GA,1938,1,True,False,False,True,False,False,True,False,False
91,Tom Winslow,NC,1938,1,True,False,False,True,False,False,True,False,False


In [14]:
columns_to_encode = ['region', 'handPost', 'thumbSty']
df_encoded = pd.get_dummies(df[columns_to_encode+['brthYr']], columns=columns_to_encode, prefix=columns_to_encode)

# Prepare your features (X) and target (y) using the encoded DataFrame
X = df_encoded.drop('brthYr', axis=1)
y = df_encoded['brthYr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(f"Decision Tree Regressor with encoded features {list(X.columns)}")
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Decision Tree Regressor with encoded features ['region_1', 'region_2', 'region_3', 'handPost_1', 'handPost_2', 'handPost_3', 'thumbSty_1', 'thumbSty_2', 'thumbSty_3']
Mean Squared Error: 233.30
R-squared: -0.39


In [15]:
columns_to_encode = ['handPost', 'thumbSty']
df_encoded = pd.get_dummies(df[columns_to_encode+['brthYr']], columns=columns_to_encode, prefix=columns_to_encode)

# Prepare your features (X) and target (y) using the encoded DataFrame
X = df_encoded.drop('brthYr', axis=1)
y = df_encoded['brthYr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(f"Decision Tree Regressor with encoded features {list(X.columns)}")
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Decision Tree Regressor with encoded features ['handPost_1', 'handPost_2', 'handPost_3', 'thumbSty_1', 'thumbSty_2', 'thumbSty_3']
Mean Squared Error: 205.14
R-squared: -0.23


In [None]:
from sklearn.model_selection import cross_val_score

model = RandomForestRegressor()
scores = cross_val_score(model, X, y, scoring='r2', cv=5)  # 5 folds

print(f"Cross-validated R2 scores: {scores}")
print(f"Average R2: {scores.mean():.2f}")


# Step 2
Use random forest (in both of the above cases) and report the difference. Make sure to use appropriate training-testing parameters for your evaluation.<p>

You should also run the algorithms multiple times, measure various accuracies, and report the average (and perhaps the range).

In [39]:
# Prepare the data
y = df['brthYr']  # Target variable

# Split the data into training and testing sets
for p in [['handPost', 'thumbSty'], ['handPost', 'thumbSty', 'region']]:
    X = df[p]  # Predictor variables
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    clf = RandomForestRegressor()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Decision Tree Regressor with three features {list(X.columns)}")
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")

Decision Tree Regressor with three features ['handPost', 'thumbSty']
Mean Squared Error: 264.90
R-squared: -0.35
Decision Tree Regressor with three features ['handPost', 'thumbSty', 'region']
Mean Squared Error: 250.70
R-squared: -0.28


In [43]:
# Prepare the data
 # Predictor variables
y = df['brthYr']  # Target variable

# Split the data into training and testing sets
for p in [['handPost', 'thumbSty'], ['handPost', 'thumbSty', 'region']]:
    X = df[p]
    print(f"RandomForestRegressor with two features {list(X.columns)}")
    max_r2 = -2
    for i in range(1,99):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
        clf = RandomForestRegressor()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)    
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Mean Squared Error: {mse:.2f} R-squared: {r2:.2f} random state: {i}")
        if r2 > max_r2:
            max_r2 = r2
    print(max_r2)

RandomForestRegressor with two features ['handPost', 'thumbSty']
Mean Squared Error: 170.40 R-squared: -0.06 random state: 1
Mean Squared Error: 205.27 R-squared: 0.03 random state: 2
Mean Squared Error: 189.86 R-squared: -0.03 random state: 3
Mean Squared Error: 148.56 R-squared: -0.14 random state: 4
Mean Squared Error: 184.99 R-squared: -0.78 random state: 5
Mean Squared Error: 193.69 R-squared: -0.03 random state: 6
Mean Squared Error: 183.65 R-squared: -0.02 random state: 7
Mean Squared Error: 198.32 R-squared: 0.02 random state: 8
Mean Squared Error: 176.59 R-squared: -0.80 random state: 9
Mean Squared Error: 142.46 R-squared: 0.08 random state: 10
Mean Squared Error: 136.83 R-squared: 0.05 random state: 11
Mean Squared Error: 167.48 R-squared: -0.39 random state: 12
Mean Squared Error: 271.85 R-squared: -0.39 random state: 13
Mean Squared Error: 185.58 R-squared: -0.26 random state: 14
Mean Squared Error: 215.66 R-squared: 0.04 random state: 15
Mean Squared Error: 153.69 R-squar

Mean Squared Error: 288.85 R-squared: -0.47 random state: 40
Mean Squared Error: 177.18 R-squared: 0.11 random state: 41
Mean Squared Error: 225.49 R-squared: -0.35 random state: 42
Mean Squared Error: 284.66 R-squared: -0.15 random state: 43
Mean Squared Error: 193.06 R-squared: -0.19 random state: 44
Mean Squared Error: 272.60 R-squared: -0.44 random state: 45
Mean Squared Error: 252.30 R-squared: -0.43 random state: 46
Mean Squared Error: 237.30 R-squared: -0.19 random state: 47
Mean Squared Error: 194.53 R-squared: -0.06 random state: 48
Mean Squared Error: 240.03 R-squared: -0.18 random state: 49
Mean Squared Error: 171.08 R-squared: -0.14 random state: 50
Mean Squared Error: 247.01 R-squared: -0.01 random state: 51
Mean Squared Error: 187.50 R-squared: -0.10 random state: 52
Mean Squared Error: 249.44 R-squared: -0.20 random state: 53
Mean Squared Error: 263.52 R-squared: -0.16 random state: 54
Mean Squared Error: 210.82 R-squared: -0.08 random state: 55
Mean Squared Error: 167.3

In [44]:
columns_to_encode = ['handPost', 'thumbSty']
df_encoded = pd.get_dummies(df[columns_to_encode+['brthYr']], columns=columns_to_encode, prefix=columns_to_encode)

# Prepare your features (X) and target (y) using the encoded DataFrame
X = df_encoded.drop('brthYr', axis=1)
y = df_encoded['brthYr']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(f"RandomForestRegressor with encoded features {list(X.columns)}")
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

RandomForestRegressor with encoded features ['handPost_1', 'handPost_2', 'handPost_3', 'thumbSty_1', 'thumbSty_2', 'thumbSty_3']
Mean Squared Error: 198.28
R-squared: -0.18


In [45]:
# Add constant for intercept
X_with_const = sm.add_constant(X)

model = sm.OLS(y, X_with_const)
results = model.fit()

print(results.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).