In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Data loading
diabetes = pd.read_csv("diabetes_dataset.csv")

# Get dataset information
dataset_id = "Dataset ID: diabetes_dataset"
num_features = "No. of Features: {}".format(len(diabetes.columns))
num_instances = "Number of Instances: {}".format(len(diabetes))
missing_values = "Missing Values: {}".format(diabetes.isnull().sum().sum())

# Outlier detection using IQR method
outliers = []
for column in diabetes.columns:
    Q1 = diabetes[column].quantile(0.25)
    Q3 = diabetes[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    column_outliers = diabetes[(diabetes[column] < lower_bound) | (diabetes[column] > upper_bound)].index.tolist()
    outliers.extend(column_outliers)

if len(outliers) > 0:
    outliers_text = "Outliers detected at indices: {}".format(outliers)
else:
    outliers_text = "No outliers detected"

# Display dataset information
print(dataset_id)
print(num_features)
print(num_instances)
print(missing_values)
print(outliers_text)

# Data separation as X and Y
y = diabetes["DiabetesPedigreeFunction"]
x = diabetes.drop('DiabetesPedigreeFunction', axis=1)

# Splitting data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Building - Decision Tree
decision_tree_model = DecisionTreeRegressor()
decision_tree_model.fit(X_train, Y_train)

# Checking the performance of the model on the training set
y_model_train_pred = decision_tree_model.predict(X_train)
y_model_test_pred = decision_tree_model.predict(X_test)

# Evaluate model performance for Decision Tree
model_train_mse = mean_squared_error(Y_train, y_model_train_pred)
model_train_r2 = r2_score(Y_train, y_model_train_pred)
model_test_mse = mean_squared_error(Y_test, y_model_test_pred)
model_test_r2 = r2_score(Y_test, y_model_test_pred)

# Model Building - Random Forest
random_forest_model = RandomForestRegressor(max_depth=2, random_state=100)
random_forest_model.fit(X_train, Y_train)

# Applying the model to make predictions
y_rf_train_pred = random_forest_model.predict(X_train)
y_rf_test_pred = random_forest_model.predict(X_test)

# Evaluate model performance for Random Forest
rf_train_mse = mean_squared_error(Y_train, y_rf_train_pred)
rf_train_r2 = r2_score(Y_train, y_rf_train_pred)
rf_test_mse = mean_squared_error(Y_test, y_rf_test_pred)
rf_test_r2 = r2_score(Y_test, y_rf_test_pred)

# Model Comparison
decision_tree_results = pd.DataFrame(['Decision Tree', model_train_mse, model_train_r2, model_test_mse, model_test_r2]).transpose()
decision_tree_results.columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']

random_forest_results = pd.DataFrame(['Random Forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]).transpose()
random_forest_results.columns = ['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2']

model_comparison = pd.concat([decision_tree_results, random_forest_results], axis=0).reset_index(drop=True)

# Print the model comparison results
print(model_comparison)


Dataset ID: diabetes_dataset
No. of Features: 10
Number of Instances: 5
Missing Values: 0
Outliers detected at indices: [0, 4, 2, 4, 0, 3]
          Method Training MSE Training R2  Test MSE Test R2
0  Decision Tree          0.0         1.0  0.076176     NaN
1  Random Forest     0.119277     0.81539  0.060627     NaN


