In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import pickle
from category_encoders import TargetEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#load the data
data = pd.read_csv("Hyderbad_House_price.csv")

In [5]:
#remove the duplicates
data = data.drop_duplicates()

In [6]:
#remove "unnamed: 0" column if it exists
if 'Unnamed: 0' in data.columns:
    data = data.drop('Unnamed: 0', axis=1)

In [7]:
#check for null values
null_values = data.isnull().any().any()
if null_values:
    print("There are null values in the dataset.")
else:
    print("There are no null values in the dataset.")

There are no null values in the dataset.


In [8]:
# Basic statistics of the dataset
print("\nDataset Statistics:")
print(data.describe())


Dataset Statistics:
          price(L)   rate_persqft   area_insqft
count  3660.000000    3660.000000   3660.000000
mean    109.624350    5165.003005   2023.506284
std     197.596948    5316.490320   1829.832163
min       1.320000     125.000000    118.000000
25%      26.137500    1555.000000   1280.000000
50%      61.130000    4703.000000   1620.000000
75%     116.000000    6800.000000   2080.000000
max    3600.000000  112474.000000  45000.000000


In [9]:
#Prepare features and target variable
X = data.drop('price(L)', axis=1)
y = data['price(L)']

In [10]:
# Define features
numerical_features = ['rate_persqft', 'area_insqft']
categorical_features = ['title', 'building_status']
location_feature = ['location']

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Define preprocessing for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = TargetEncoder()
location_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('loc', location_transformer, location_feature)
    ])

In [13]:
# Create the model
model = RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_split=2, min_samples_leaf=1, random_state=42)

In [14]:
# Create and evaluate the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

In [15]:
# Train the model
pipeline.fit(X_train, y_train)

In [16]:
# Make predictions
y_pred = pipeline.predict(X_test)

In [17]:
# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared score: {r2}')

Mean Absolute Error: 6.848134369407914
Root Mean Squared Error: 46.96065367458436
R-squared score: 0.94191679453818


In [18]:
# Cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f'Cross-validation RMSE scores: {cv_rmse}')
print(f'Mean CV RMSE: {cv_rmse.mean()}')

Cross-validation RMSE scores: [ 21.09486984 134.10391851  24.81413868  99.33147005  75.64898201]
Mean CV RMSE: 70.99867581782041


In [23]:
# Feature Importance
feature_importance = pipeline.named_steps['model'].feature_importances_

# Get feature names
num_features = numerical_features
cat_features = [f"title_{col}" for col in categorical_features] + [f"building_status_{col}" for col in categorical_features]
loc_features = [f"location_{col}" for col in location_feature]

feature_names = num_features + cat_features + loc_features

# Ensure we have the correct number of feature names
if len(feature_names) != len(feature_importance):
    print(f"Warning: Number of feature names ({len(feature_names)}) does not match number of importance scores ({len(feature_importance)})")
    feature_names = [f"feature_{i}" for i in range(len(feature_importance))]

importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(importance_df)


Feature Importance:
     feature  importance
0  feature_0    0.485704
1  feature_1    0.367717
2  feature_2    0.099024
4  feature_4    0.040340
3  feature_3    0.007216


In [24]:
# Visualize Feature Importance
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importance_df.head(10))
plt.title('Top 10 Most Important Features')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

In [25]:
# Test predictions for different locations
test_data = X_test.iloc[0:1].copy()  # Use the first test sample as a base
print("\nPrediction for different locations:")
for location in data['location'].unique():
    test_data['location'] = location
    pred = pipeline.predict(test_data)
    print(f"{location}: {pred[0]:.2f}")


Prediction for different locations:
Nizampet: 32.99
Bachupally: 49.44
Dundigal: 49.49
Pocharam: 45.32
Kollur: 68.05
Kukatpally: 68.05
Choutuppal: 32.99
Jeedimetla: 68.05
Uppal Kalan: 43.13
Rajendra Nagar: 138.34
Yapral: 133.14
Kompally: 64.71
Patighanpur: 133.14
Tellapur: 138.34
Madeenaguda: 49.49
Shamshabad: 49.43
Puppalaguda: 133.14
Mallampet: 32.99
Meerpet: 49.49
Mallapur: 32.99
Tukkuguda: 32.98
Kondapur: 137.50
Chandanagar: 32.84
Saroor Nagar: 49.24
Sainikpuri: 133.14
Bolarum: 32.84
Gagillapur: 119.89
Manikonda: 97.58
Jubilee Hills: 234.99
Dammaiguda: 113.35
Boduppal: 43.13
Amberpet: 45.92
Old Bowenpally: 64.71
Gachibowli: 133.14
AS Rao Nagar: 49.24
Kapra: 68.05
Chengicherla: 45.32
Bhanur: 60.51
Adibatla: 32.98
Kothapet: 32.99
Rampally: 49.49
Sadashivpet: 32.76
Bairagiguda: 113.76
Bahadurpally: 49.44
East Marredpally: 68.05
Bandlaguda Jagir: 113.35
Isnapur: 32.89
Shankarpalli: 45.32
muthangi: 49.49
Rai Durg: 139.17
Taramatipet: 32.89
Bacharam: 49.44
Nagaram: 45.30
Toli Chowki: 126

In [26]:
# Save the model
model_path = 'house_price_model.pkl'
joblib.dump(pipeline, model_path)
print(f"Model saved to {model_path}")

Model saved to house_price_model.pkl


In [27]:
# Also save as model.pkl for consistency with your Flask app
pickle.dump(pipeline, open('model.pkl', 'wb'))
print("Model also saved as model.pkl")
print("\nModel trained and saved successfully.")

Model also saved as model.pkl

Model trained and saved successfully.


In [28]:
# Get unique locations and sort them
unique_locations = sorted(data['location'].unique())

# Print the number of unique locations
print(f"Number of unique locations: {len(unique_locations)}")

# Print all unique locations
print("Unique locations:")
for location in unique_locations:
    print(location)

Number of unique locations: 359
Unique locations:
7 Tombs Road
AS Rao Nagar
Abdullapurmet
Abids
Adibatla
Adikmet
Ahilya Nagar
Ahmed Colony Hyderabad
Ahmed Nagar
Alair
Alkapur township
Almasguda
Alwal
Amangal
Amberpet
Ameenpur
Ameerpet
Aminpur
Annojiguda
Appa Junction
Appa Junction Peerancheru
Aremaishamma
Aroor
Ashok Nagar
Attapur
Auto Nagar
BB Nagar
BHEL Employees Co operative Housing Society
Bacharam
Bachupally
Badangpet
Bahadurpally
Bairagiguda
Balanagar
Balapur
Bandaraviryal
Bandlaguda Jagir
Banjara Hills
Basheer Bagh
Batasingaram
Beeramguda
Begum Bazar Chatri
Begumpet
Begumpet Flyover
Bhanur
Bhanur village
Bhongir
Bhuvanagiri
Bibinagar
Boduppal
Boduppal Dwaraka Nagar
Boduppal Road
Boiguda
Bolarum
Bollaram Industrial Area
Bowenpally
Bowrampet
Brindavan Colony
Bundalguda
Bureddipalle
Chandanagar
Chegunta
Chengicherla
Chennaram
Cherlapalli
Chevella
Chikkadapally
Chilkanagar
Choutuppal
DK Enclave
DK Road
DLF Cyber City
Dammaiguda
Deluxe Colony
Dhobipet
Diamond Hill Colony
Diamond Poin