In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

import joblib

print("Libraries imported successfully!")


Libraries imported successfully!


In [2]:
df = pd.read_csv("../data/cleaned_data.csv")
df.head()


Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status,Amenity_Count,Good_Investment
0,1,7,38,427,0,1,4740,489.76,10332.489451,1990,...,3,High,No,0,"Playground, Gym, Garden, Pool, Clubhouse",3,2,0,5,0
1,2,3,27,373,1,3,2364,195.52,8270.72758,2008,...,1,Low,No,1,"Playground, Clubhouse, Pool, Gym, Garden",1,1,1,5,1
2,3,5,18,473,0,2,3642,183.79,5046.403075,1997,...,8,Low,Yes,0,"Clubhouse, Pool, Playground, Gym",2,0,0,4,1
3,4,6,14,253,1,2,2741,300.29,10955.490697,1991,...,7,High,Yes,1,"Playground, Clubhouse, Gym, Pool, Garden",1,1,0,5,0
4,5,6,11,343,2,4,4823,182.9,3792.24549,2002,...,9,Low,No,1,"Playground, Garden, Gym, Pool, Clubhouse",0,1,0,5,1


In [3]:
label = LabelEncoder()

for col in df.select_dtypes(include=['object']).columns:
    df[col] = label.fit_transform(df[col].astype(str))

print("All categorical columns encoded successfully!")


All categorical columns encoded successfully!


In [4]:
X_class = df.drop(["Good_Investment", "Future_Price_5Y"], axis=1, errors="ignore")
y_class = df["Good_Investment"]

X_train, X_test, y_train, y_test = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

print("Classification data prepared!")


Classification data prepared!


In [5]:
log_clf = LogisticRegression(max_iter=500)
log_clf.fit(X_train, y_train)

log_pred = log_clf.predict(X_test)

print("\nLOGISTIC REGRESSION")
print("Accuracy:", accuracy_score(y_test, log_pred))
print(classification_report(y_test, log_pred))



LOGISTIC REGRESSION
Accuracy: 0.99994
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25053
           1       1.00      1.00      1.00     24947

    accuracy                           1.00     50000
   macro avg       1.00      1.00      1.00     50000
weighted avg       1.00      1.00      1.00     50000



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train, y_train)

rf_pred = rf_clf.predict(X_test)

print("\nRANDOM FOREST CLASSIFIER")
print("Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))



RANDOM FOREST CLASSIFIER
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25053
           1       1.00      1.00      1.00     24947

    accuracy                           1.00     50000
   macro avg       1.00      1.00      1.00     50000
weighted avg       1.00      1.00      1.00     50000



In [7]:
joblib.dump(rf_clf, "../models/classifier.pkl")
print("Classification model saved!")


Classification model saved!


In [8]:
# Create Future Price Column (8% annual growth for 5 years)
df["Future_Price_5Y"] = df["Price_in_Lakhs"] * (1.08 ** 5)

print("Future_Price_5Y column created!")
df[["Price_in_Lakhs", "Future_Price_5Y"]].head()


Future_Price_5Y column created!


Unnamed: 0,Price_in_Lakhs,Future_Price_5Y
0,489.76,719.618119
1,195.52,287.283026
2,183.79,270.047807
3,300.29,441.224528
4,182.9,268.740105


In [9]:
X_reg = df.drop(["Future_Price_5Y"], axis=1)
y_reg = df["Future_Price_5Y"]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

print("Regression data prepared!")


Regression data prepared!


In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_r, y_train_r)

# Predictions
y_pred_lin = lin_reg.predict(X_test_r)

# Evaluation
mse_lin = mean_squared_error(y_test_r, y_pred_lin)
r2_lin = r2_score(y_test_r, y_pred_lin)

print("ðŸ“Œ LINEAR REGRESSION RESULTS")
print("MSE:", mse_lin)
print("R2 Score:", r2_lin)


ðŸ“Œ LINEAR REGRESSION RESULTS
MSE: 5.661004598267055e-23
R2 Score: 1.0


In [11]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Regressor
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_r, y_train_r)

# Predictions
y_pred_rf = rf_reg.predict(X_test_r)

# Evaluation
mse_rf = mean_squared_error(y_test_r, y_pred_rf)
r2_rf = r2_score(y_test_r, y_pred_rf)

print("\nðŸŒ³ RANDOM FOREST REGRESSION RESULTS")
print("MSE:", mse_rf)
print("R2 Score:", r2_rf)



ðŸŒ³ RANDOM FOREST REGRESSION RESULTS
MSE: 1.1132447956749065e-05
R2 Score: 0.999999999741342


In [12]:
joblib.dump(lin_reg, "../models/linear_regression.pkl")
joblib.dump(rf_reg, "../models/rf_regression.pkl")

print("Regression models saved!")


Regression models saved!


In [13]:
# ----- ENSEMBLE REGRESSOR (Linear + Random Forest Average) -----

# Predictions from already-trained models
y_pred_lin = lin_reg.predict(X_test_r)
y_pred_rf = rf_reg.predict(X_test_r)

# Average predictions
y_pred_ensemble = (y_pred_lin + y_pred_rf) / 2

# Evaluate
mse_ensemble = mean_squared_error(y_test_r, y_pred_ensemble)
r2_ensemble = r2_score(y_test_r, y_pred_ensemble)

print("\nðŸ”· ENSEMBLE REGRESSION RESULTS (Linear + RF)")
print("MSE:", mse_ensemble)
print("R2 Score:", r2_ensemble)

# Save ensemble predictions model as a simple object
ensemble_model = {
    "lin_reg": lin_reg,
    "rf_reg": rf_reg
}

joblib.dump(ensemble_model, "../models/ensemble_regression.pkl")
print("Ensemble regression model saved!")



ðŸ”· ENSEMBLE REGRESSION RESULTS (Linear + RF)
MSE: 2.783111989238128e-06
R2 Score: 0.9999999999353355
Ensemble regression model saved!
