In [25]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier


import warnings
warnings.filterwarnings("ignore")

### Question 0
Load the telco data file we used in the previous labs. As we did there, we first read the telco file into a dataframe. Then we create the target, y, from the Churn column, and the features, X, are all the columns except for Churn. We then perform a train-test split. Now create a scaled version of the data, using Standard Scaler. Call the scaled version X_train_scaled, and X_test_scaled.


In [6]:
df=pd.read_csv("telco.csv")

y = df["Churn"] 
x = df.drop(columns=["Churn"])  

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

scaler = StandardScaler()


x_train_scaled = scaler.fit_transform(x_train)

x_test_scaled = scaler.transform(x_test)


print("X_train_scaled shape:", {x_train_scaled.shape})
print("X_test_scaled shape:", {x_test_scaled.shape})

X_train_scaled shape: {(4922, 29)}
X_test_scaled shape: {(2110, 29)}


<h2>Question 1</h2>
Build two versions of a default random forest model, using the normal data (X_train, X_test) and the scaled data (X_train_scaled, X_test_scaled). Print train and test scores for normal and scaled. </i>

In [10]:

rf_normal = RandomForestClassifier()
rf_normal.fit(x_train, y_train)

rf_scaled = RandomForestClassifier()
rf_scaled.fit(x_train_scaled, y_train)

train_score_normal = rf_normal.score(x_train, y_train)
test_score_normal = rf_normal.score(x_test, y_test)
train_score_scaled = rf_scaled.score(x_train_scaled, y_train)
test_score_scaled = rf_scaled.score(x_test_scaled, y_test)

print("Random Forest on Normal Data:")
print("Train Score:", {train_score_normal})
print("Test Score:", {test_score_normal})

print("\nRandom Forest on Scaled Data:")
print("Train Score:", {train_score_scaled})
print("Test Score:", {test_score_scaled})



Random Forest on Normal Data:
Train Score: {0.997765136123527}
Test Score: {0.7829383886255924}

Random Forest on Scaled Data:
Train Score: {0.997765136123527}
Test Score: {0.7819905213270142}


<h2>Question 2</h2>
Build two versions of a default mlp model, using the normal data (X_train, X_test) and the scaled data (X_train_scaled, X_test_scaled). Print train and test scores for normal and scaled. </i>

In [11]:
mlp_normal = MLPClassifier(max_iter=500)
mlp_normal.fit(x_train, y_train)

mlp_scaled = MLPClassifier(max_iter=500)
mlp_scaled.fit(x_train_scaled, y_train)


train_score_normal = mlp_normal.score(x_train, y_train)
test_score_normal = mlp_normal.score(x_test, y_test)
train_score_scaled = mlp_scaled.score(x_train_scaled, y_train)
test_score_scaled = mlp_scaled.score(x_test_scaled, y_test)

print("MLP on Normal Data:")
print(f"Train Score: {train_score_normal:.4f}")
print(f"Test Score: {test_score_normal:.4f}")

print("\nMLP on Scaled Data:")
print(f"Train Score: {train_score_scaled:.4f}")
print(f"Test Score: {test_score_scaled:.4f}")

MLP on Normal Data:
Train Score: 0.7928
Test Score: 0.7768

MLP on Scaled Data:
Train Score: 0.8885
Test Score: 0.7559


<h2>Question 2a</h2>
How much does scaling matter for the random forest model and mlp model you just tested? Why do you think that is?

so Multi layer perceptron uses backpropagation with gradient descent which is gradient based and so Scaling has a large impact. while random forest is based on decision trees which should no matter the scale of the features find the same split point thus making scaling less relevant.

<h2>Question 3</h2>
Now try mlp models with different values for the hidden layer size. Try with  two hidden layers, with sizes ranging from 1 to 20, incremented by 5. For each iteration, fit the model to the scaled data. Save the train and test scores for both in two lists. Also print the scaled and normal results at each iteration.

In [None]:
strain_scores = []
stest_scores = []
train_scores = []
test_scores = []


for size in range(1, 21, 5):  
    hidden_layer_sizes = (size, size)  
    print(f"\nTraining MLP with hidden layer sizes: {hidden_layer_sizes}")

    # Using random state to make results consistent since we are making two lists
    smlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, random_state=42, max_iter=500)
    smlp.fit(x_train_scaled, y_train)

    
    strain_score = smlp.score(x_train_scaled, y_train)
    stest_score = smlp.score(x_test_scaled, y_test)

  
    strain_scores.append(strain_score)
    stest_scores.append(stest_score)

   
    print(f"scaled Train Score: {strain_score:.4f}")
    print(f"scaled Test Score: {stest_score:.4f}")

    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, random_state=42, max_iter=500)
    mlp.fit(x_train, y_train)

 
    train_score = mlp.score(x_train, y_train)
    test_score = mlp.score(x_test, y_test)


    train_scores.append(train_score)
    test_scores.append(test_score)


    print(f"Train Score: {train_score:.4f}")
    print(f"Test Score: {test_score:.4f}")

# Final Results Summary
print("\nFinal Results:")
for i, size in enumerate(range(1, 21, 5)):
    print(f"Hidden Layer Size {size}-{size} -> scaled Train: {strain_scores[i]:.4f}, scaled Test: {stest_scores[i]:.4f}")
    print(f"Hidden Layer Size {size}-{size} -> Train: {train_scores[i]:.4f}, Test: {test_scores[i]:.4f}")


Training MLP with hidden layer sizes: (1, 1)
scaled Train Score: 0.8064
scaled Test Score: 0.7962
Train Score: 0.8037
Test Score: 0.7929

Training MLP with hidden layer sizes: (6, 6)
scaled Train Score: 0.8176
scaled Test Score: 0.7919
Train Score: 0.7972
Test Score: 0.7853

Training MLP with hidden layer sizes: (11, 11)
scaled Train Score: 0.8304
scaled Test Score: 0.7768
Train Score: 0.8009
Test Score: 0.7924

Training MLP with hidden layer sizes: (16, 16)
scaled Train Score: 0.8448
scaled Test Score: 0.7588
Train Score: 0.7948
Test Score: 0.7787

Final Results:
Hidden Layer Size 1-1 -> scaled Train: 0.8064, scaled Test: 0.7962
Hidden Layer Size 1-1 -> Train: 0.8037, Test: 0.7929
Hidden Layer Size 6-6 -> scaled Train: 0.8176, scaled Test: 0.7919
Hidden Layer Size 6-6 -> Train: 0.7972, Test: 0.7853
Hidden Layer Size 11-11 -> scaled Train: 0.8304, scaled Test: 0.7768
Hidden Layer Size 11-11 -> Train: 0.8009, Test: 0.7924
Hidden Layer Size 16-16 -> scaled Train: 0.8448, scaled Test: 0.

<h2>Question 4</h2>
Read the HomesSoldHellerup.csv file, using read_csv (note that the separator is a semicolon and not a comma, which is the default). Use the columns 'Type', 'm2', 'Build Year', and 'Type of Sale' (assign this to X).

In [26]:
df = pd.read_csv("HomesSoldHellerup.csv", sep=";")

y = df["Price"] 
x = df[["Type", "m2", "Build Year", "Type of Sale"]] 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)



<h2>Question 5</h2>
Use get_dummies to produce dummy values for the non-numeric columns in X. (Use the optional parameter, columns, with a list of the columns that should get dummy values.)

In [27]:
X_dummies = pd.get_dummies(x, columns=["Type", "Type of Sale"], drop_first=True)  # drop_first=True to avoid multicollinearity which will mes with our data

print(X_dummies.head())

   m2  Build Year  Type_Erhverv  Type_Lejlighed  Type_Rækkehus  Type_Stuehus  \
0  54        1932         False            True          False         False   
1  87        1932         False            True          False         False   
2  63        1932         False            True          False         False   
3  54        1932         False            True          False         False   
4  63        1932         False            True          False         False   

   Type_Villa  Type of Sale_Andet  Type of Sale_Auktion  \
0       False               False                 False   
1       False               False                 False   
2       False               False                 False   
3       False               False                 False   
4       False               False                 False   

   Type of Sale_Fam. Salg  
0                   False  
1                   False  
2                   False  
3                   False  
4                   Fals

<h2>Question 6</h2>
How many columns were added by get_dummies?
(Hint: Compare the number of columns from the initial data frame and the new data frame including the dummy variables.)

In [21]:
original_cols = x.shape[1]  # Number of original columns
new_cols = X_dummies.shape[1]  # Number of columns after get_dummies
added_cols = new_cols - original_cols  # Number of added columns

print(f"Original columns: {original_cols}")
print(f"New columns after get_dummies: {new_cols}")
print(f"Number of columns added: {added_cols}")

Original columns: 4
New columns after get_dummies: 10
Number of columns added: 6


<h2>Question 7</h2>
Create a Linear Regression model for two versions of the data -- the first with only numeric columns, m2 and Build Year. Then use the version with dummy values for Type and Type of Sale. Define the column price as the target, y. Conduct two train test splits, for the two versions of the data. One for the version with dummy variables, and one only including the columns m2 and Build Year.

Print the score for train and test for each version.

In [28]:
# Version 1: Only numeric columns (m2, Build Year)
X_numeric = df[["m2", "Build Year"]]

# Version 2: Numeric columns + Dummy Variables
X_dummies = pd.get_dummies(df[["Type", "m2", "Build Year", "Type of Sale"]], columns=["Type", "Type of Sale"], drop_first=True)

# Train-test split for both versions
X_train_num, X_test_num, y_train, y_test = train_test_split(X_numeric, y, test_size=0.3, random_state=42)
X_train_dum, X_test_dum, y_train, y_test = train_test_split(X_dummies, y, test_size=0.3, random_state=42)

In [29]:
lr_numeric = LinearRegression()
lr_numeric.fit(X_train_num, y_train)

lr_dummies = LinearRegression()
lr_dummies.fit(X_train_dum, y_train)

# Get scores
train_score_num = lr_numeric.score(X_train_num, y_train)
test_score_num = lr_numeric.score(X_test_num, y_test)

train_score_dum = lr_dummies.score(X_train_dum, y_train)
test_score_dum = lr_dummies.score(X_test_dum, y_test)

# Print results
print("Linear Regression with Only Numeric Features:")
print(f"Train Score: {train_score_num:.4f}")
print(f"Test Score: {test_score_num:.4f}")

print("\nLinear Regression with Dummy Variables:")
print(f"Train Score: {train_score_dum:.4f}")
print(f"Test Score: {test_score_dum:.4f}")

Linear Regression with Only Numeric Features:
Train Score: 0.3095
Test Score: 0.3665

Linear Regression with Dummy Variables:
Train Score: 0.3420
Test Score: 0.3901


<h2>Question 8</h2>
Create a Decision Tree model for the same two versions of the data that you used in the previous question. Print the score for train and test for each version.

It was unclear which kind of decision tree so i made both. i hope this explains the duplication


In [33]:
dt_numeric = DecisionTreeRegressor(random_state=42)
dt_numeric.fit(X_train_num, y_train)

dt_dummies = DecisionTreeRegressor(random_state=42)
dt_dummies.fit(X_train_dum, y_train)

dtc_numeric = DecisionTreeClassifier(random_state=42)
dtc_numeric.fit(X_train_num, y_train)

dtc_dummies = DecisionTreeClassifier(random_state=42)
dtc_dummies.fit(X_train_dum, y_train)

# Get scores
train_score_num = dt_numeric.score(X_train_num, y_train)
test_score_num = dt_numeric.score(X_test_num, y_test)

train_score_dum = dt_dummies.score(X_train_dum, y_train)
test_score_dum = dt_dummies.score(X_test_dum, y_test)

train_score_numc = dtc_numeric.score(X_train_num, y_train)
test_score_numc = dtc_numeric.score(X_test_num, y_test)

train_score_dumc = dtc_dummies.score(X_train_dum, y_train)
test_score_dumc = dtc_dummies.score(X_test_dum, y_test)

# Print results
print("Decision Tree regressor with Only Numeric Features:")
print(f"Train Score: {train_score_num:.4f}")
print(f"Test Score: {test_score_num:.4f}")

print("\nDecision Tree regressor with Dummy Variables:")
print(f"Train Score: {train_score_dum:.4f}")
print(f"Test Score: {test_score_dum:.4f}")

print("\nDecision Tree classifier with Only Numeric Features:")
print(f"Train Score: {train_score_num:.4f}")
print(f"Test Score: {test_score_num:.4f}")

print("\nDecision Tree classifier  with Dummy Variables:")
print(f"Train Score: {train_score_dum:.4f}")
print(f"Test Score: {test_score_dum:.4f}")

Decision Tree regressor with Only Numeric Features:
Train Score: 0.9837
Test Score: 0.2382

Decision Tree regressor with Dummy Variables:
Train Score: 0.9928
Test Score: 0.3546

Decision Tree classifier with Only Numeric Features:
Train Score: 0.9837
Test Score: 0.2382

Decision Tree classifier  with Dummy Variables:
Train Score: 0.9928
Test Score: 0.3546


<h2>Question 9</h2>
Now add dummy values for Road Name to the data. Create a train test split with the new version of the data. Create a Decision Tree model for this version of the data. Print the score for train and test for each version.

In [None]:
Xnew_dummies = pd.get_dummies(df[["Type", "m2", "Build Year", "Type of Sale", "Road name"]], columns=["Type", "Type of Sale", "Road name"], drop_first=True)
X_train_dum, X_test_dum, y_train, y_test = train_test_split(Xnew_dummies, y, test_size=0.2, random_state=42)

dt_dummies = DecisionTreeRegressor(random_state=42)
dt_dummies.fit(X_train_dum, y_train)

dtc_dummies = DecisionTreeClassifier(random_state=42)
dtc_dummies.fit(X_train_dum, y_train)

# Get scores for the Decision Tree model
train_score_dum = dt_dummies.score(X_train_dum, y_train)
test_score_dum = dt_dummies.score(X_test_dum, y_test)

train_score_dumc = dtc_dummies.score(X_train_dum, y_train)
test_score_dumc = dtc_dummies.score(X_test_dum, y_test)

# Print results
print("\nDecision Tree classifier with Dummy Variables (Including Road Name):")
print(f"Train Score: {train_score_dum:.4f}")
print(f"Test Score: {test_score_dum:.4f}")

print("\nDecision Tree classifier with Dummy Variables (Including Road Name):")
print(f"Train Score: {train_score_dumc:.4f}")
print(f"Test Score: {test_score_dumc:.4f}")



Decision Tree with Dummy Variables (Including Road Name):
Train Score: 0.9940
Test Score: 0.6117

Decision Tree with Dummy Variables (Including Road Name):
Train Score: 0.7772
Test Score: 0.0139


<h2> Question 10</h2>
Build a Random Forest regressor for this data. Set random_state, with otherwise default settings, and print train and test scores.

In [39]:
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_dum, y_train)

# Get scores for Random Forest model
train_score_rf = rf_regressor.score(X_train_dum, y_train)
test_score_rf = rf_regressor.score(X_test_dum, y_test)

# Print results
print("\nRandom Forest Regressor with Dummy Variables (Including Road Name):")
print(f"Train Score: {train_score_rf:.4f}")
print(f"Test Score: {test_score_rf:.4f}")


Random Forest Regressor with Dummy Variables (Including Road Name):
Train Score: 0.9575
Test Score: 0.7187


<h2>Question 10a</h2>

Based on your results, which categorical features are most informative about price with this dataset?



In [42]:
feature_importance = rf_regressor.feature_importances_

features = Xnew_dummies.columns
feature_importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importance
})

feature_importance_df = feature_importance_df.sort_values(by="Importance", ascending=False)
print(feature_importance_df.head(10))


                         Feature  Importance
0                             m2    0.412616
1                     Build Year    0.231126
77       Road name_Hellerupvej      0.163009
69      Road name_Hambros Alle      0.034034
9         Type of Sale_Fam. Salg    0.013424
7             Type of Sale_Andet    0.012241
154       Road name_Sundvænget      0.010591
135  Road name_Richelieus Alle      0.009165
106       Road name_Lemchesvej      0.006434
156    Road name_Svanemøllevej      0.006288


it appears that type of sale and road name are the most informative in this dataset