In [None]:
# Mount Google Drive in this Colab runtime; the dataset is read from it further down.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
#importing numpy, pandas, train test split, grid search cv, scaler, one hot encoder, column transformer, pipeline, Regressors (decision tree, K neighbors), and metrics
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the cleaned dataset CSV from the mounted Drive into a pandas DataFrame.
# This is the file cleaned by my groupmates (also available in the git repo),
# not the Tableau dataset, which is slightly different.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/Final_csv_File.csv"
df = pd.read_csv(DATASET_PATH)

In [None]:
# Separate the regression target from the predictors:
# y is the 'Final Amount' column, X is every remaining column of df.
TARGET_COLUMN = 'Final Amount'
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [None]:
# Column groups consumed by the preprocessing step below.
# NOTE(review): 'Status', 'Transaction', 'Furnishing', 'facing' and 'Ownership' are
# scaled as numeric columns here — presumably they were already numerically encoded
# during cleaning; confirm against the cleaned CSV.
# NOTE(review): 'Price (in rupees)' may be closely tied to the 'Final Amount' target —
# check that it does not leak the answer into the model.
categorical_features = ['location']
numerical_features = [
    'Carpet Area in sqft',
    'Bathroom',
    'Balcony',
    'BHK',
    'Super Area in sqft',
    'Price (in rupees)',
    'Status',
    'Transaction',
    'Furnishing',
    'facing',
    'Ownership',
]

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode 'location'.
numerical_transformer = StandardScaler()
# handle_unknown='ignore' keeps transform from raising on categories not seen during fit.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Each entry is (step name, transformer, columns the transformer is applied to).
column_steps = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]

# remainder='passthrough' keeps every column not listed above, untransformed,
# instead of dropping it.
preprocessor = ColumnTransformer(column_steps, remainder='passthrough')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
print(f"\nData split into training ({len(X_train)} samples) and testing ({len(X_test)} samples) sets.")


Data split into training (113043 samples) and testing (28261 samples) sets.


In [None]:
# Decision-tree pipeline: the shared preprocessor followed by a seeded DecisionTreeRegressor.
dt_steps = [
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
dt_pipeline = Pipeline(steps=dt_steps)

In [None]:
# Hyperparameter search space for the decision tree (4 * 3 * 3 = 36 combinations).
# Keys use the '<step>__<parameter>' form so they reach the pipeline's 'regressor' step.
dt_param_grid = dict(
    regressor__max_depth=[10, 20, 30, None],   # tree depth cap; None leaves depth unlimited
    regressor__min_samples_split=[2, 5, 10],   # samples a node needs before it may split
    regressor__min_samples_leaf=[1, 2, 4],     # samples every leaf must retain
)

In [None]:
# 5-fold cross-validated grid search over dt_param_grid, ranking candidates by R^2.
# n_jobs=-1 runs the fits in parallel; verbose=1 prints a short progress summary.
dt_grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split only; the test split is not touched here.
# With 36 parameter combinations and cv=5 this performs 180 fits.
print("Running GridSearchCV for Decision Tree...")
dt_grid_search.fit(X_train, y_train)

Running GridSearchCV for Decision Tree...
Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
# Take the best pipeline found by the finished search and report the winning
# hyperparameter combination. (GridSearchCV's refit option is left at its default here.)
best_dt_model = dt_grid_search.best_estimator_
print(f"\nBest hyperparameters for Decision Tree: {dt_grid_search.best_params_}")


Best hyperparameters for Decision Tree: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 10}


In [None]:
# Predict the target for the held-out test rows with the tuned decision-tree pipeline;
# the pipeline applies its own preprocessing to X_test before the regressor predicts.
y_pred_dt = best_dt_model.predict(X_test)

In [None]:
# Score the tuned decision tree on the held-out test set.
# All metric functions (including mean_squared_error) come from the import cell at the
# top of the notebook, so no cell depends on an import hidden in the middle.
r2_dt = r2_score(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
# RMSE is the square root of MSE, which puts the error back in the target's own units.
rmse_dt = np.sqrt(mse_dt)

print("\nTuned Decision Tree Model Performance on Test Set:")
print(f"R-squared (R2): {r2_dt:.4f}")
print(f"Mean Absolute Error (MAE): {mae_dt:,.2f}")
print(f"Mean Squared Error (MSE): {mse_dt:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_dt:,.2f}")


Tuned Decision Tree Model Performance on Test Set:
R-squared (R2): 0.9866
Mean Absolute Error (MAE): 2.01
Mean Squared Error (MSE): 46.74
Root Mean Squared Error (RMSE): 6.84


In [None]:
# KNN pipeline used for the grid search: the same preprocessor object as the
# decision-tree pipeline, followed by a KNeighborsRegressor with default settings.
knn_steps = [
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
]
knn_pipeline = Pipeline(steps=knn_steps)

In [None]:
# Hyperparameter search space for KNN (4 * 2 * 2 = 16 combinations).
# Keys use the '<step>__<parameter>' form so they reach the pipeline's 'regressor' step.
knn_param_grid = dict(
    regressor__n_neighbors=[3, 5, 7, 9],             # how many neighbours vote on a prediction
    regressor__weights=['uniform', 'distance'],      # equal votes, or closer neighbours count more
    regressor__metric=['euclidean', 'manhattan'],    # distance function between samples
)

In [None]:
# 5-fold cross-validated grid search over knn_param_grid, ranking candidates by R^2,
# configured the same way as the decision-tree search.
knn_grid_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# The KNN grid search (16 candidates x 5 folds = 80 fits) is opt-in. The recorded run
# of this cell was stopped by hand with a KeyboardInterrupt because it took too long,
# which would also stall a Restart & Run All. No later cell visible in this notebook
# reads the fitted knn_grid_search, so skipping it does not affect the results below.
# Set RUN_KNN_GRID_SEARCH = True to run the full search anyway.
RUN_KNN_GRID_SEARCH = False

if RUN_KNN_GRID_SEARCH:
    print("Running GridSearchCV for KNN...")
    knn_grid_search.fit(X_train, y_train)
else:
    print("Skipping GridSearchCV for KNN (set RUN_KNN_GRID_SEARCH = True to run it).")

Running GridSearchCV for KNN...
Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

In [None]:
# The grid search above takes too long to finish on this dataset,
# so for this notebook we fit a plain KNN regressor with default hyperparameters instead.


In [None]:
# Baseline KNN: KNeighborsRegressor left at its default hyperparameters (no tuning).
knn_regressor = KNeighborsRegressor()

# Same preprocessor as the other pipelines, followed by the untuned regressor.
simple_knn_steps = [
    ('preprocessor', preprocessor),
    ('regressor', knn_regressor),
]
knn_pipeline_simple = Pipeline(steps=simple_knn_steps)

# Fit on the training split, announcing start and finish.
print("Training simple KNN model...")
knn_pipeline_simple.fit(X_train, y_train)
print("Simple KNN model trained.")

Training simple KNN model...
Simple KNN model trained.


In [None]:
# Predict the held-out test rows with the untuned KNN pipeline.
y_pred_knn_simple = knn_pipeline_simple.predict(X_test)

# Compute the same four metrics that are reported for the decision tree.
# RMSE is derived from MSE, so MSE is computed first.
mse_knn_simple = mean_squared_error(y_test, y_pred_knn_simple)
rmse_knn_simple = np.sqrt(mse_knn_simple)
mae_knn_simple = mean_absolute_error(y_test, y_pred_knn_simple)
r2_knn_simple = r2_score(y_test, y_pred_knn_simple)

print("\nSimple KNN Model Performance on Test Set:")
print(f"R-squared (R2): {r2_knn_simple:.4f}")
print(f"Mean Absolute Error (MAE): {mae_knn_simple:,.2f}")
print(f"Mean Squared Error (MSE): {mse_knn_simple:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_knn_simple:,.2f}")


Simple KNN Model Performance on Test Set:
R-squared (R2): 0.9737
Mean Absolute Error (MAE): 3.43
Mean Squared Error (MSE): 91.58
Root Mean Squared Error (RMSE): 9.57
