In [12]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression # Or RandomForestRegressor, XGBoostRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import pickle
import os

print("Libraries imported successfully.")


Libraries imported successfully.


In [5]:
# Cell 2: Define Data Paths and Load Data
# IMPORTANT: Adjust 'DATA_PATH' if your CSV is in a different location
DATA_PATH = './residential_property_prices.csv'

try:
    # Try 'latin1' first, it's a common fallback. If that doesn't work, try 'cp1252'.
    df = pd.read_csv(DATA_PATH, encoding='latin1')
    # If latin1 fails, try:
    # df = pd.read_csv(DATA_PATH, encoding='cp1252')

    print(f"Data loaded successfully from {DATA_PATH}. Shape: {df.shape}")
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_PATH}. Please ensure the CSV is in the correct directory.")
    # Exit or provide a dummy DataFrame for testing if crucial
except UnicodeDecodeError as e:
    print(f"UnicodeDecodeError: Failed to read CSV with specified encoding. Try a different encoding. Error: {e}")
    # Consider manually inspecting the file encoding if multiple attempts fail
    exit() # Or handle the error differently

Data loaded successfully from ./residential_property_prices.csv. Shape: (332096, 32)

First 5 rows of the dataset:
               Property_Name  Property_id Property_type     Property_status  \
0               Arkiton Luxe     15446514     Apartment  Under Construction   
1  Keshav Akshar Ocean Pearl     15367414     Apartment  Under Construction   
2            Vishwa Opulence     14683118     Apartment       Ready to move   
3              Satyam Sarjan      5476295     Apartment       Ready to move   
4           Navkar Sunflower     15477040     Apartment  Under Construction   

  Price_per_unit_area   Posted_On  \
0               4,285   1 day ago   
1               7,000  2 days ago   
2               5,752  2 days ago   
3               2,486  5 days ago   
4               5,324  8 days ago   

                                         Project_URL   builder_id  \
0  https://www.makaan.com/ahmedabad/arkiton-life-...  100563465.0   
1  https://www.makaan.com/ahmedabad/keshav-naraya

In [7]:
# Cell 3: Initial Data Info and Summary
print("\nDataset Info:")
df.info()

print("\nDescriptive Statistics (Numerical Columns):")
print(df.describe())

print("\nMissing values per column (Top 10 with most NaNs):")
print(df.isnull().sum().sort_values(ascending=False).head(10))

print("\nUnique values in key categorical columns:")
# Check 'Property_type', 'Property_status', 'City', 'Furnishing_status', 'Listing_Category'
for col in ['Property_type', 'Property_status', 'City', 'Furnishing_status', 'Listing_Category']:
    if col in df.columns:
        print(f"- {col}: {df[col].nunique()} unique values")
        print(df[col].value_counts().head()) # Show top values


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 332096 entries, 0 to 332095
Data columns (total 32 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Property_Name             217826 non-null  object 
 1   Property_id               332096 non-null  int64  
 2   Property_type             332096 non-null  object 
 3   Property_status           271654 non-null  object 
 4   Price_per_unit_area       332096 non-null  object 
 5   Posted_On                 332096 non-null  object 
 6   Project_URL               332096 non-null  object 
 7   builder_id                149978 non-null  float64
 8   Builder_name              149978 non-null  object 
 9   Property_building_status  332096 non-null  object 
 10  City_id                   332096 non-null  int64  
 11  City_name                 332096 non-null  object 
 12  No_of_BHK                 332096 non-null  object 
 13  Locality_ID               332

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# --- 4.1: Feature Selection and Renaming ---

# Define new mapping for your dataset
features_raw = ['City_name', 'No_of_BHK', 'Size', 'is_furnished', 'Property_type', 'Price']
mapped_columns = {
    'City_name': 'City',
    'No_of_BHK': 'BHK',
    'Size': 'Super_Builtup_Area',
    'is_furnished': 'Furnishing_status',
    'Property_type': 'Property_type',
    'Price': 'Price_in_cr'  # Will convert to crore as float
}

# Ensure existence of all necessary columns
for col in features_raw:
    if col not in df.columns:
        print(f"Error: Column '{col}' not found!")
        exit()

# Rename columns for simplicity
df_selected = df[features_raw].rename(columns=mapped_columns).copy()
print(f"Columns after renaming: {df_selected.columns.tolist()}")

# --- 4.2: Data Cleaning ---

# 1. Clean BHK: Convert to int
df_selected['BHK'] = pd.to_numeric(df_selected['BHK'], errors='coerce')

# 2. Clean Super_Builtup_Area: Extract numeric value from "1200 sqft" etc.
df_selected['Super_Builtup_Area'] = (
    df_selected['Super_Builtup_Area']
    .str.extract(r'(\d+\.?\d*)')    # Extract numeric part
    .astype(float)
)

# 3. Clean Price_in_cr: Convert '65 L', '1.2 Cr', etc. into float (crores)
def price_to_crore(value):
    value = str(value).strip().replace(',', '')  # remove potential commas
    if 'Cr' in value:
        return float(value.replace('Cr', '').strip())
    elif 'L' in value:
        return float(value.replace('L', '').strip()) / 100
    else:
        try:
            return float(value) / 100  # Assume Lacs by default
        except:
            return None

df_selected['Price_in_cr'] = df_selected['Price_in_cr'].apply(price_to_crore)

# --- 4.3: Drop NA rows in essential columns ---
df_cleaned = df_selected.dropna().copy()
print(f"After cleaning: {df_cleaned.shape}")

# --- 4.4: Feature Encoding ---

# City Encoding
location_encoder = LabelEncoder()
df_cleaned['City_Encoded'] = location_encoder.fit_transform(df_cleaned['City'])

# Furnishing_status Encoding
furnishing_encoder = LabelEncoder()
df_cleaned['Furnishing_status_Encoded'] = furnishing_encoder.fit_transform(df_cleaned['Furnishing_status'])

# One-hot encode Property_type (drop_first=True to avoid redundancy)
df_cleaned = pd.get_dummies(df_cleaned, columns=['Property_type'], prefix='PropertyType', drop_first=True)

# --- 4.5: FINAL FEATURE SET (X) AND TARGET (y) ---

final_features = [
    'City_Encoded',
    'BHK',
    'Super_Builtup_Area',
    'Furnishing_status_Encoded',
]
property_type_cols = [col for col in df_cleaned.columns if col.startswith('PropertyType_')]
final_features.extend(property_type_cols)

X = df_cleaned[final_features]
y = df_cleaned['Price_in_cr'] * 1e7  # Back to actual price in INR for ML training

print("\nFinal features (X) used for training:")
print(X.columns.tolist())
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")


Columns after renaming: ['City', 'BHK', 'Super_Builtup_Area', 'Furnishing_status', 'Property_type', 'Price_in_cr']
After cleaning: (0, 6)

Final features (X) used for training:
['City_Encoded', 'BHK', 'Super_Builtup_Area', 'Furnishing_status_Encoded']
Shape of X: (0, 4), Shape of y: (0,)


In [9]:
# Cell 4: Data Cleaning and Feature Engineering (Corrected - Removed 'Bathroom' column)
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# --- 4.1: Feature Selection and Renaming ---
# Based on your df.info() and df.head() output.
features_raw = [
    'City_name',
    'No_of_BHK',
    'Size',
    # 'Bathroom', # REMOVED: This column is not in your dataset as confirmed
    'is_furnished',
    'Property_type',
    'Price'              # This is your target column
]

# Ensure existence of all necessary columns
for col in features_raw:
    if col not in df.columns:
        print(f"Error: Column '{col}' not found in the dataset! Please check your CSV column names.")
        exit()

# Create a copy to avoid SettingWithCopyWarning
df_cleaned = df[features_raw].copy()

# Rename columns for simplicity and consistency with common ML terms
df_cleaned.rename(columns={
    'City_name': 'City',
    'No_of_BHK': 'BHK',
    'Size': 'Super_Builtup_Area',
    'is_furnished': 'Furnishing_status',
    'Price': 'Original_Price_Raw' # Keep original Price raw for reference during cleaning
}, inplace=True)

print(f"DataFrame shape after selecting and initial renaming: {df_cleaned.shape}")
print("Columns after initial renaming:")
print(df_cleaned.columns.tolist())


# --- 4.2: Data Cleaning and Type Conversion ---

# 1. Clean 'Price' column (Object type -> Numeric)
def clean_price(price_str):
    price_str = str(price_str).strip().replace('â‚¹', '').replace(',', '').lower()
    if 'cr' in price_str:
        return float(price_str.replace('cr', '').strip()) * 10000000
    elif 'l' in price_str or 'lac' in price_str:
        return float(price_str.replace('l', '').replace('ac', '').strip()) * 100000
    elif 'k' in price_str:
        return float(price_str.replace('k', '').strip()) * 1000
    else:
        try:
            return float(price_str)
        except ValueError:
            return np.nan

df_cleaned['Price_Cleaned_INR'] = df_cleaned['Original_Price_Raw'].apply(clean_price)
print("\n'Price_Cleaned_INR' created (original vs cleaned):")
print(df_cleaned[['Original_Price_Raw', 'Price_Cleaned_INR']].head())


# 2. Clean 'Super_Builtup_Area' (Object type -> Numeric)
def clean_area_sqft(area_str):
    area_str = str(area_str).strip().lower().replace('sq.ft', '').replace('sqft', '').replace('sq ft', '').replace(',', '')
    try:
        return float(area_str)
    except ValueError:
        return np.nan

df_cleaned['Super_Builtup_Area_Cleaned'] = df_cleaned['Super_Builtup_Area'].apply(clean_area_sqft)
print("\n'Super_Builtup_Area_Cleaned' created (original vs cleaned):")
print(df_cleaned[['Super_Builtup_Area', 'Super_Builtup_Area_Cleaned']].head())


# 3. Clean 'BHK' (Object type -> Numeric)
def clean_bhk_numeric(bhk_str):
    bhk_str = str(bhk_str).strip().lower()
    if 'rk' in bhk_str:
        return 0.5
    try:
        return int(float(bhk_str.replace('bhk', '').strip()))
    except ValueError:
        return np.nan

df_cleaned['BHK_Cleaned'] = df_cleaned['BHK'].apply(clean_bhk_numeric)
print("\n'BHK_Cleaned' created (original vs cleaned):")
print(df_cleaned[['BHK', 'BHK_Cleaned']].head())

# --- 4.3: Handle NaNs in crucial cleaned columns ---
# Drop rows where cleaning resulted in NaN in our critical features or target
# Removed 'Bathroom' from this subset as well
df_cleaned.dropna(subset=['Price_Cleaned_INR', 'Super_Builtup_Area_Cleaned', 'BHK_Cleaned'], inplace=True)

# Convert to appropriate integer types where applicable after dropping NaNs
df_cleaned['BHK_Cleaned'] = df_cleaned['BHK_Cleaned'].astype(int)
# Removed: df_cleaned['Bathroom'] = df_cleaned['Bathroom'].astype(int)


print(f"\nDataFrame shape after cleaning and dropping NaNs: {df_cleaned.shape}")

# --- 4.4: Feature Encoding ---

# City Encoding (LabelEncoder)
location_encoder = LabelEncoder()
df_cleaned['City_Encoded'] = location_encoder.fit_transform(df_cleaned['City'])
print("\nCity Encoding classes (first 5):")
if len(location_encoder.classes_) > 5:
    print(location_encoder.classes_[:5])
else:
    print(location_encoder.classes_)


# Furnishing_status Encoding (LabelEncoder)
furnishing_encoder = LabelEncoder()
df_cleaned['Furnishing_status_Encoded'] = furnishing_encoder.fit_transform(df_cleaned['Furnishing_status'])
print("\nFurnishing Status Encoding classes:")
print(furnishing_encoder.classes_)

# Property_type (One-Hot Encoding)
df_cleaned = pd.get_dummies(df_cleaned, columns=['Property_type'], prefix='PropertyType', drop_first=True)
print("\nAfter One-Hot Encoding Property_type (first 5 rows with new columns):")
dummy_cols_sample = [col for col in df_cleaned.columns if col.startswith('PropertyType_')]
if len(dummy_cols_sample) > 5:
    print(df_cleaned[dummy_cols_sample[:5]].head())
else:
    print(df_cleaned[dummy_cols_sample].head())


# --- 4.5: FINAL FEATURE SET (X) AND TARGET (y) ---

# Define the final features for X based on cleaned and encoded DataFrame
final_features = [
    'City_Encoded',
    'BHK_Cleaned',
    'Super_Builtup_Area_Cleaned', # Use the cleaned Size column
    # 'Bathroom', # REMOVED: This column is not in your dataset
    'Furnishing_status_Encoded',
]
# Add dynamically created one-hot encoded columns for Property_type
property_type_cols = [col for col in df_cleaned.columns if col.startswith('PropertyType_')]
final_features.extend(property_type_cols)

# Ensure all final features actually exist in the DataFrame before selecting X
final_features = [f for f in final_features if f in df_cleaned.columns]

X = df_cleaned[final_features]
y = df_cleaned['Price_Cleaned_INR'] # Use the cleaned price as target

print("\nFinal features (X) used for training (and expected in Django prediction):")
print(X.columns.tolist())
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

# Store the final feature column names for prediction consistency
# This list will be saved and used in the Django app
model_feature_columns = X.columns.tolist()
print("\nModel expects features in this order:", model_feature_columns)  #cell4

DataFrame shape after selecting and initial renaming: (332096, 6)
Columns after initial renaming:
['City', 'BHK', 'Super_Builtup_Area', 'Furnishing_status', 'Property_type', 'Original_Price_Raw']

'Price_Cleaned_INR' created (original vs cleaned):
  Original_Price_Raw  Price_Cleaned_INR
0          75,00,000          7500000.0
1        2,36,88,000         23688000.0
2        1,32,00,840         13200840.0
3          22,83,000          2283000.0
4          93,71,000          9371000.0

'Super_Builtup_Area_Cleaned' created (original vs cleaned):
  Super_Builtup_Area  Super_Builtup_Area_Cleaned
0        1,750 sq ft                      1750.0
1        3,384 sq ft                      3384.0
2        2,295 sq ft                      2295.0
3          918 sq ft                       918.0
4        1,760 sq ft                      1760.0

'BHK_Cleaned' created (original vs cleaned):
     BHK  BHK_Cleaned
0  3 BHK          3.0
1  4 BHK          4.0
2  3 BHK          3.0
3  2 BHK          2.0
4

In [13]:
# Cell 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set shape: {X_train.shape}, Testing set shape: {X_test.shape}")


Training set shape: (265676, 8), Testing set shape: (66420, 8)


In [14]:
# Cell 6: Model Training
from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression # Uncomment if you want to try Linear Regression

model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
# model = LinearRegression() # If you prefer a simpler model

print(f"\nTraining {type(model).__name__} model...")
model.fit(X_train, y_train)
print("Model training complete.")


Training RandomForestRegressor model...
Model training complete.


In [15]:
# Cell 7: Model Evaluation
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nModel Evaluation ({type(model).__name__}):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R2): {r2:.2f}")


Model Evaluation (RandomForestRegressor):
Mean Absolute Error (MAE): 5756332.64
R-squared (R2): 0.71


In [16]:
# Cell 8: Save Model and Encoders
ARTIFACTS_DIR = 'model_artifacts' # Ensure this matches your folder name
if not os.path.exists(ARTIFACTS_DIR):
    os.makedirs(ARTIFACTS_DIR)

MODEL_PATH = os.path.join(ARTIFACTS_DIR, 'price_prediction_model.pkl')
LOCATION_ENCODER_PATH = os.path.join(ARTIFACTS_DIR, 'location_encoder.pkl')
FURNISHING_ENCODER_PATH = os.path.join(ARTIFACTS_DIR, 'furnishing_encoder.pkl')
ONE_HOT_COLS_PATH = os.path.join(ARTIFACTS_DIR, 'one_hot_cols.pkl')

with open(MODEL_PATH, 'wb') as model_file:
    pickle.dump(model, model_file)
print(f"\nModel saved to: {MODEL_PATH}")

if location_encoder:
    with open(LOCATION_ENCODER_PATH, 'wb') as encoder_file:
        pickle.dump(location_encoder, encoder_file)
    print(f"Location LabelEncoder saved to: {LOCATION_ENCODER_PATH}")

if furnishing_encoder:
    with open(FURNISHING_ENCODER_PATH, 'wb') as encoder_file:
        pickle.dump(furnishing_encoder, encoder_file)
    print(f"Furnishing LabelEncoder saved to: {FURNISHING_ENCODER_PATH}")

with open(ONE_HOT_COLS_PATH, 'wb') as cols_file:
    pickle.dump(model_feature_columns, cols_file) # Using the model_feature_columns from Cell 4
print(f"One-hot encoded columns list saved to: {ONE_HOT_COLS_PATH}")

print("\nTraining and artifact saving process complete. You can now use these .pkl files in your Django app.")


Model saved to: model_artifacts\price_prediction_model.pkl
Location LabelEncoder saved to: model_artifacts\location_encoder.pkl
Furnishing LabelEncoder saved to: model_artifacts\furnishing_encoder.pkl
One-hot encoded columns list saved to: model_artifacts\one_hot_cols.pkl

Training and artifact saving process complete. You can now use these .pkl files in your Django app.
