In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Data loading
diabetes = pd.read_csv("diabetes_dataset.csv")

# Separate the regression target (y) from the remaining columns, which are used as features (x)
TARGET_COLUMN = "DiabetesPedigreeFunction"
y = diabetes[TARGET_COLUMN]
x = diabetes.drop(columns=[TARGET_COLUMN])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# R^2 is undefined for fewer than two samples: r2_score returns NaN in that case,
# which then shows up as NaN in the 'Test R2' column of the comparison table.
# Warn here, where the cause (too few rows for this test_size) is visible,
# instead of letting the NaN appear unexplained further down.
if len(X_test) < 2:
    warnings.warn(
        f"Test split contains only {len(X_test)} sample(s); test-set R2 will be NaN. "
        "Use more data or a larger test_size.",
        UserWarning,
    )

# Model Building - Decision Tree
# random_state is pinned so that a fresh kernel reproduces the same tree:
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits are otherwise broken differently from run to run.
decision_tree_model = DecisionTreeRegressor(random_state=42)
decision_tree_model.fit(X_train, Y_train)

# Applying the model to make predictions on both the training and the testing set
y_model_train_pred = decision_tree_model.predict(X_train)
y_model_test_pred = decision_tree_model.predict(X_test)

# Evaluate model performance for Decision Tree
# The tree has no depth limit, so training metrics can be near-perfect;
# judge the model by the test metrics instead.
model_train_mse = mean_squared_error(Y_train, y_model_train_pred)
model_train_r2 = r2_score(Y_train, y_model_train_pred)
model_test_mse = mean_squared_error(Y_test, y_model_test_pred)
model_test_r2 = r2_score(Y_test, y_model_test_pred)

# Model Building - Random Forest
# Shallow ensemble (tree depth capped at 2) with a fixed seed; fit() returns the
# estimator itself, so construction and training are chained.
random_forest_model = RandomForestRegressor(random_state=100, max_depth=2).fit(X_train, Y_train)

# Predictions for the training split first, then the testing split
y_rf_train_pred, y_rf_test_pred = (
    random_forest_model.predict(features) for features in (X_train, X_test)
)

# Evaluate model performance for Random Forest: MSE and R2 on each split
rf_train_mse, rf_train_r2 = (
    metric(Y_train, y_rf_train_pred) for metric in (mean_squared_error, r2_score)
)
rf_test_mse, rf_test_r2 = (
    metric(Y_test, y_rf_test_pred) for metric in (mean_squared_error, r2_score)
)

# Model Comparison
# Each model's metrics are built as a one-row frame from a record (dict) rather
# than from a transposed mixed-type list: the transpose forces every column to
# object dtype, whereas records keep the metric columns numeric (float64) so
# they can be rounded, sorted and formatted consistently.
decision_tree_results = pd.DataFrame([{
    'Method': 'Decision Tree',
    'Training MSE': model_train_mse,
    'Training R2': model_train_r2,
    'Test MSE': model_test_mse,
    'Test R2': model_test_r2,
}])

random_forest_results = pd.DataFrame([{
    'Method': 'Random Forest',
    'Training MSE': rf_train_mse,
    'Training R2': rf_train_r2,
    'Test MSE': rf_test_mse,
    'Test R2': rf_test_r2,
}])

# Stack the two rows into one table with a clean 0..n-1 index
model_comparison = pd.concat([decision_tree_results, random_forest_results], axis=0).reset_index(drop=True)

# Print the model comparison results
print(model_comparison)


          Method Training MSE Training R2  Test MSE Test R2
0  Decision Tree          0.0         1.0  0.033856     NaN
1  Random Forest     0.119277     0.81539  0.060627     NaN


