In [None]:
import pandas as pd
import glob
import os

# Read every per-city CSV and remember, for each row, the city taken from the
# file name (file name without its extension).
# sorted() fixes the row order of the combined file: glob returns matches in a
# file-system dependent order, which would otherwise change the seeded
# train/test splits performed on housing_dataset.csv.
source_pattern = r"D:\Housing Prices Prediction\All Metropolitan Cities\*.csv"
file_paths = sorted(glob.glob(source_pattern))
if not file_paths:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files matched {source_pattern!r}")

dfs = []
city_names = []
for file_path in file_paths:
    city_name = os.path.splitext(os.path.basename(file_path))[0]
    df = pd.read_csv(file_path)
    dfs.append(df)
    # One city label per row of this file, in the same order as the rows
    city_names.extend([city_name] * len(df))

# Combine all dataframes and attach the per-row city labels as a new column
combined_df = pd.concat(dfs, ignore_index=True)
combined_df["City"] = city_names

# Save the combined dataframe to a new CSV file (without the index column)
combined_df.to_csv("housing_dataset.csv", index=False)



In [None]:
import pandas as pd
import glob
import os
import random

# Read every per-city CSV and remember, for each row, the city taken from the
# file name (file name without its extension).
# sorted() fixes the row order: glob returns matches in a file-system dependent
# order, and the seeded sample below picks rows by position, so an unordered
# listing would make the "random_state=42" sample differ between machines.
source_pattern = r"D:\Housing Prices Prediction\All Metropolitan Cities\*.csv"
file_paths = sorted(glob.glob(source_pattern))
if not file_paths:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files matched {source_pattern!r}")

dfs = []
city_names = []
for file_path in file_paths:
    city_name = os.path.splitext(os.path.basename(file_path))[0]
    df = pd.read_csv(file_path)
    dfs.append(df)
    # One city label per row of this file, in the same order as the rows
    city_names.extend([city_name] * len(df))

# Combine all dataframes and attach the per-row city labels as a new column
combined_df = pd.concat(dfs, ignore_index=True)
combined_df["City"] = city_names

# Select up to 16,000 random data points. sample() raises when asked for more
# rows than exist (sampling without replacement), so cap at the frame length.
sample_size = min(16000, len(combined_df))
random_df = combined_df.sample(n=sample_size, random_state=42)

# Save the randomly selected dataframe to a new CSV file (without the index column)
random_df.to_csv("random_housing_dataset.csv", index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

# Load the combined listings file
df = pd.read_csv("housing_dataset.csv")

# Predictors are the city name and the area; the target is the price
X = df.loc[:, ['City', 'Area']]
y = df.loc[:, 'Price']

# Hold out 20% of the rows for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode column 0 ('City', the first column of X), dropping the first
# category; the remaining column is passed through unchanged
city_encoder = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('encoder', city_encoder, [0])],
    remainder='passthrough',
)

# Chain the preprocessing step with a seeded random forest regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Fit on the training split, then predict the held-out split
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Report the mean absolute error on the held-out split
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)

# Predict the price of a single example listing
sample_data = pd.DataFrame({'City': ['Bangalore'], 'Area': [1500]})
predicted_price = pipeline.predict(sample_data)
print('Predicted Price:', predicted_price[0])



In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Load the combined listings file
df = pd.read_csv('housing_dataset.csv')

# Target column and the predictor columns used by this model
target = 'Price'
features = [
    'Area',
    'Location',
    'No. of Bedrooms',
    'Resale',
    'MaintenanceStaff',
    'Gymnasium',
    'SwimmingPool',
    'LandscapedGardens',
    'JoggingTrack',
    'IndoorGames',
    'SportsFacility',
    'CarParking',
    'City',
]

# Build the design matrix: 'Location' and 'City' are expanded into indicator
# columns (first level of each dropped); the other columns are kept as they are
X = pd.get_dummies(df[features], columns=['Location', 'City'], drop_first=True)
y = df[target]

# Hold out 20% of the rows for evaluation (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded random forest on the training split
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Score the held-out split with mean absolute error
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error:', mae)


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from geopy.geocoders import Nominatim
from math import radians, sin, cos, sqrt, atan2

# Step 1: load the combined listings and keep only rows without missing values
df = pd.read_csv('housing_dataset.csv').dropna()

# Step 2: geocoder used by get_coordinates below
# NOTE(review): "your_app_name" looks like a placeholder user agent — confirm
# it identifies this application before sending requests.
geolocator = Nominatim(user_agent="your_app_name")

def get_coordinates(location):
    """Geocode `location` with the module-level `geolocator`.

    Returns a ``(latitude, longitude)`` tuple taken from the geocoder result,
    or ``None`` when the geocoder returns a falsy result.
    """
    match = geolocator.geocode(location)
    if not match:
        return None
    return match.latitude, match.longitude

# Get latitude and longitude for each location.
# Each distinct location is looked up once and the result is mapped back onto
# the rows, instead of issuing one geocoding request per row.
latitude_by_location = {}
longitude_by_location = {}
for location_name in df['Location'].unique():
    coordinates = get_coordinates(location_name)
    # A failed lookup (None) becomes NaN in both columns, so the assignment
    # below also works when no location could be geocoded at all.
    latitude_by_location[location_name], longitude_by_location[location_name] = (
        coordinates if coordinates else (float('nan'), float('nan'))
    )
df['Latitude'] = df['Location'].map(latitude_by_location)
df['Longitude'] = df['Location'].map(longitude_by_location)

# Function to calculate distance using haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are angles in degrees: (lat1, lon1) is the first point
    and (lat2, lon2) the second. An Earth radius of 6371.0 km is used.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine term built from the latitude and longitude differences
    hav = sin((phi2 - phi1) / 2) ** 2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1) / 2) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

# Calculate each listing's distance (km) from the centre of its own city.
# The previous version passed the same point as both ends of the haversine
# calculation, so every distance was 0 and Step 4 had nothing to separate.
# NOTE(review): the "centre" is taken to be the mean latitude/longitude of the
# rows sharing a 'City' value (assumes housing_dataset.csv carries the 'City'
# column) — swap in another reference point if a different one was intended.
df = df.dropna(subset=['Latitude', 'Longitude']).copy()  # failed lookups leave NaN, which KMeans rejects
city_center_lat = df.groupby('City')['Latitude'].transform('mean')
city_center_lon = df.groupby('City')['Longitude'].transform('mean')
df['Distance'] = [
    calculate_distance(lat, lon, center_lat, center_lon)
    for lat, lon, center_lat, center_lon in zip(
        df['Latitude'], df['Longitude'], city_center_lat, city_center_lon
    )
]

# Step 3: Perform clustering based on city (standardised coordinates)
scaler = StandardScaler()
df_scaled_city = scaler.fit_transform(df[['Latitude', 'Longitude']])
# Replace '3' with the desired number of clusters for cities; seeded so labels are reproducible
kmeans_city = KMeans(n_clusters=3, random_state=42)
city_clusters = kmeans_city.fit_predict(df_scaled_city)

# Step 4: Perform clustering based on location within each city
df['CityCluster'] = city_clusters  # Add city cluster labels to the dataframe
df_clusters = []

for city_cluster_id in range(len(kmeans_city.cluster_centers_)):
    # .copy() so the new column is written to an independent frame, not to a slice of df
    df_city_cluster = df[df['CityCluster'] == city_cluster_id].copy()
    # A separate scaler leaves `scaler` fitted on the coordinates
    df_scaled_location = StandardScaler().fit_transform(df_city_cluster[['Distance']])
    # Replace '4' with the desired number of clusters for locations within each city.
    # Capped at the number of distinct distances, since KMeans cannot place
    # more centres than there are distinct points.
    n_location_clusters = max(1, min(4, df_city_cluster['Distance'].nunique()))
    kmeans_location = KMeans(n_clusters=n_location_clusters, random_state=42)
    location_clusters = kmeans_location.fit_predict(df_scaled_location)
    df_city_cluster['LocationCluster'] = location_clusters
    df_clusters.append(df_city_cluster)

# Step 5: Prepare the feature set for regression analysis
# ('Wifi' and 'Wardrobe' are deliberately not listed, so no filtering step is needed.)
features = [
    'No. of Bedrooms', 'Resale', 'MaintenanceStaff', 'Gymnasium', 'SwimmingPool', 'LandscapedGardens',
    'JoggingTrack', 'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom', 'SportsFacility',
    'ATM', 'ClubHouse', 'School', '24X7Security', 'PowerBackup', 'CarParking'
]

# Step 6: Perform regression analysis for each city cluster
# NOTE(review): 'LocationCluster' is computed above but not used here; one
# linear model is fitted per city cluster.
for df_cluster in df_clusters:
    X = df_cluster[features]
    y = df_cluster['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    y_pred = regression_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Cluster Mean Squared Error: {mse}")


In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from geopy.geocoders import Nominatim
from math import radians, sin, cos, sqrt, atan2
import time

# Step 1: load the combined listings and keep only rows without missing values
df = pd.read_csv('housing_dataset.csv').dropna()

# Step 2: geocoder used by get_coordinates below
# NOTE(review): "your_app_name" looks like a placeholder user agent — confirm
# it identifies this application before sending requests.
geolocator = Nominatim(user_agent="your_app_name")

def get_coordinates(location):
    """Geocode `location` with the module-level `geolocator`, retrying on errors.

    Up to five attempts are made. Any exception raised during an attempt is
    reported with a printed message, followed by a one-second pause.

    Returns a ``(latitude, longitude)`` tuple, or ``None`` when the geocoder
    returns a falsy result or every attempt raised.
    """
    max_retries = 5  # Maximum number of attempts
    for _attempt in range(max_retries):
        try:
            match = geolocator.geocode(location)
            # A falsy result is final: no retry is made for "not found"
            return (match.latitude, match.longitude) if match else None
        except Exception:
            print(f"Geocoding failed for location: {location}. Retrying...")
            time.sleep(1)  # Wait for 1 second before the next attempt
    return None

# Get latitude and longitude for each location.
# Each distinct location is looked up once and the result is mapped back onto
# the rows, instead of issuing one geocoding request per row.
latitude_by_location = {}
longitude_by_location = {}
for location_name in df['Location'].unique():
    coordinates = get_coordinates(location_name)
    # A failed lookup (None) becomes NaN in both columns, so the assignment
    # below also works when no location could be geocoded at all.
    latitude_by_location[location_name], longitude_by_location[location_name] = (
        coordinates if coordinates else (float('nan'), float('nan'))
    )
df['Latitude'] = df['Location'].map(latitude_by_location)
df['Longitude'] = df['Location'].map(longitude_by_location)

# Function to calculate distance using haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are angles in degrees: (lat1, lon1) is the first point
    and (lat2, lon2) the second. An Earth radius of 6371.0 km is used.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine term built from the latitude and longitude differences
    hav = sin((phi2 - phi1) / 2) ** 2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1) / 2) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

# Calculate each listing's distance (km) from the centre of its own city.
# The previous version passed the same point as both ends of the haversine
# calculation, so every distance was 0 and Step 4 had nothing to separate.
# NOTE(review): the "centre" is taken to be the mean latitude/longitude of the
# rows sharing a 'City' value (assumes housing_dataset.csv carries the 'City'
# column) — swap in another reference point if a different one was intended.
df = df.dropna(subset=['Latitude', 'Longitude']).copy()  # failed lookups leave NaN, which KMeans rejects
city_center_lat = df.groupby('City')['Latitude'].transform('mean')
city_center_lon = df.groupby('City')['Longitude'].transform('mean')
df['Distance'] = [
    calculate_distance(lat, lon, center_lat, center_lon)
    for lat, lon, center_lat, center_lon in zip(
        df['Latitude'], df['Longitude'], city_center_lat, city_center_lon
    )
]

# Step 3: Perform clustering based on city (standardised coordinates)
scaler = StandardScaler()
df_scaled_city = scaler.fit_transform(df[['Latitude', 'Longitude']])
# Replace '3' with the desired number of clusters for cities; seeded so labels are reproducible
kmeans_city = KMeans(n_clusters=3, random_state=42)
city_clusters = kmeans_city.fit_predict(df_scaled_city)

# Step 4: Perform clustering based on location within each city
df['CityCluster'] = city_clusters  # Add city cluster labels to the dataframe
df_clusters = []

for city_cluster_id in range(len(kmeans_city.cluster_centers_)):
    # .copy() so the new column is written to an independent frame, not to a slice of df
    df_city_cluster = df[df['CityCluster'] == city_cluster_id].copy()
    # A separate scaler leaves `scaler` fitted on the coordinates
    df_scaled_location = StandardScaler().fit_transform(df_city_cluster[['Distance']])
    # Replace '4' with the desired number of clusters for locations within each city.
    # Capped at the number of distinct distances, since KMeans cannot place
    # more centres than there are distinct points.
    n_location_clusters = max(1, min(4, df_city_cluster['Distance'].nunique()))
    kmeans_location = KMeans(n_clusters=n_location_clusters, random_state=42)
    location_clusters = kmeans_location.fit_predict(df_scaled_location)
    df_city_cluster['LocationCluster'] = location_clusters
    df_clusters.append(df_city_cluster)

# Step 5: Prepare the feature set for regression analysis
# ('Wifi' and 'Wardrobe' are deliberately not listed, so no filtering step is needed.)
features = [
    'No. of Bedrooms', 'Resale', 'MaintenanceStaff', 'Gymnasium', 'SwimmingPool', 'LandscapedGardens',
    'JoggingTrack', 'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom', 'SportsFacility',
    'ATM', 'ClubHouse', 'School', '24X7Security', 'PowerBackup', 'CarParking'
]

# Step 6: Perform regression analysis for each city cluster
# NOTE(review): 'LocationCluster' is computed above but not used here; one
# linear model is fitted per city cluster.
for df_cluster in df_clusters:
    X = df_cluster[features]
    y = df_cluster['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    y_pred = regression_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Cluster Mean Squared Error: {mse}")



In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import urllib3
import json
from math import radians, sin, cos, sqrt, atan2
import time
from urllib.parse import quote

# Step 1: load the combined listings and keep only rows without missing values
df = pd.read_csv('housing_dataset.csv').dropna()

# Step 2: HTTP pool manager used by get_coordinates below; every request it
# sends carries the 'my-test-app' user-agent header
http = urllib3.PoolManager(1, headers={'user-agent': 'my-test-app'})

def get_coordinates(location):
    """Geocode `location` through the Nominatim search endpoint.

    The request goes through the module-level `http` pool manager. A non-200
    status or any exception counts as a failed attempt and is reported with a
    printed message; up to `max_retries` attempts are made in total.

    Returns a ``(latitude, longitude)`` tuple of floats from the first search
    result, or ``None`` when the search returns no results or every attempt
    failed.
    """
    max_retries = 1  # Maximum number of attempts
    for attempt in range(1, max_retries + 1):
        try:
            encoded_location = quote(location)
            url = f"https://nominatim.openstreetmap.org/search?q={encoded_location}&format=json&limit=1"
            # Bound the wait so one unresponsive connection cannot stall the whole run
            resp = http.request('GET', url, timeout=5.0)
            if resp.status == 200:
                location_info = json.loads(resp.data.decode())
                if location_info:
                    return float(location_info[0]['lat']), float(location_info[0]['lon'])
                # An empty result list is final: no retry is made for "not found"
                return None
            print(f"Geocoding failed for location: {location}. Status code: {resp.status}")
        except Exception as e:
            print(f"Geocoding failed for location: {location}. Error: {e}")
        # Pause only when another attempt will follow
        if attempt < max_retries:
            time.sleep(1)
    return None

# Get latitude and longitude for each location.
# Each distinct location is looked up once and the result is mapped back onto
# the rows, instead of issuing one geocoding request per row.
latitude_by_location = {}
longitude_by_location = {}
for location_name in df['Location'].unique():
    coordinates = get_coordinates(location_name)
    # A failed lookup (None) becomes NaN in both columns, so the assignment
    # below also works when no location could be geocoded at all.
    latitude_by_location[location_name], longitude_by_location[location_name] = (
        coordinates if coordinates else (float('nan'), float('nan'))
    )
df['Latitude'] = df['Location'].map(latitude_by_location)
df['Longitude'] = df['Location'].map(longitude_by_location)

# Function to calculate distance using haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are angles in degrees: (lat1, lon1) is the first point
    and (lat2, lon2) the second. An Earth radius of 6371.0 km is used.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine term built from the latitude and longitude differences
    hav = sin((phi2 - phi1) / 2) ** 2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1) / 2) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

# Calculate each listing's distance (km) from the centre of its own city.
# The previous version passed the same point as both ends of the haversine
# calculation, so every distance was 0 and Step 4 had nothing to separate.
# NOTE(review): the "centre" is taken to be the mean latitude/longitude of the
# rows sharing a 'City' value (assumes housing_dataset.csv carries the 'City'
# column) — swap in another reference point if a different one was intended.
df = df.dropna(subset=['Latitude', 'Longitude']).copy()  # failed lookups leave NaN, which KMeans rejects
city_center_lat = df.groupby('City')['Latitude'].transform('mean')
city_center_lon = df.groupby('City')['Longitude'].transform('mean')
df['Distance'] = [
    calculate_distance(lat, lon, center_lat, center_lon)
    for lat, lon, center_lat, center_lon in zip(
        df['Latitude'], df['Longitude'], city_center_lat, city_center_lon
    )
]

# Step 3: Perform clustering based on city (standardised coordinates)
scaler = StandardScaler()
df_scaled_city = scaler.fit_transform(df[['Latitude', 'Longitude']])
# Replace '3' with the desired number of clusters for cities; seeded so labels are reproducible
kmeans_city = KMeans(n_clusters=3, random_state=42)
city_clusters = kmeans_city.fit_predict(df_scaled_city)

# Step 4: Perform clustering based on location within each city
df['CityCluster'] = city_clusters  # Add city cluster labels to the dataframe
df_clusters = []

for city_cluster_id in range(len(kmeans_city.cluster_centers_)):
    # .copy() so the new column is written to an independent frame, not to a slice of df
    df_city_cluster = df[df['CityCluster'] == city_cluster_id].copy()
    # A separate scaler leaves `scaler` fitted on the coordinates
    df_scaled_location = StandardScaler().fit_transform(df_city_cluster[['Distance']])
    # Replace '4' with the desired number of clusters for locations within each city.
    # Capped at the number of distinct distances, since KMeans cannot place
    # more centres than there are distinct points.
    n_location_clusters = max(1, min(4, df_city_cluster['Distance'].nunique()))
    kmeans_location = KMeans(n_clusters=n_location_clusters, random_state=42)
    location_clusters = kmeans_location.fit_predict(df_scaled_location)
    df_city_cluster['LocationCluster'] = location_clusters
    df_clusters.append(df_city_cluster)

# Step 5: Prepare the feature set for regression analysis
# ('Wifi' and 'Wardrobe' are deliberately not listed, so no filtering step is needed.)
features = [
    'No. of Bedrooms', 'Resale', 'MaintenanceStaff', 'Gymnasium', 'SwimmingPool', 'LandscapedGardens',
    'JoggingTrack', 'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom', 'SportsFacility',
    'ATM', 'ClubHouse', 'School', '24X7Security', 'PowerBackup', 'CarParking'
]

# Step 6: Perform regression analysis for each city cluster
# NOTE(review): 'LocationCluster' is computed above but not used here; one
# linear model is fitted per city cluster.
for df_cluster in df_clusters:
    X = df_cluster[features]
    y = df_cluster['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    y_pred = regression_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"Cluster Mean Squared Error: {mse}")

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import urllib3
import json
from math import radians, sin, cos, sqrt, atan2
import time
from urllib.parse import quote

# Step 1: load the combined listings and keep only rows without missing values
df = pd.read_csv('housing_dataset.csv').dropna()

# Step 2: HTTP pool manager used by get_coordinates below; every request it
# sends carries the 'my-test-app' user-agent header
http = urllib3.PoolManager(1, headers={'user-agent': 'my-test-app'})

def get_coordinates(location):
    """Geocode `location` through the Nominatim search endpoint, logging progress.

    The request goes through the module-level `http` pool manager with a
    five-second timeout. A non-200 status or any exception counts as a failed
    attempt and is reported with a printed message; up to `max_retries`
    attempts are made in total.

    Returns a ``(latitude, longitude)`` tuple of floats from the first search
    result, or ``None`` when the search returns no results or every attempt
    failed.
    """
    max_retries = 3  # Maximum number of attempts
    for attempt in range(1, max_retries + 1):
        try:
            encoded_location = quote(location)
            url = f"https://nominatim.openstreetmap.org/search?q={encoded_location}&format=json&limit=1"
            print(f"Geocoding request for location: {location}")
            resp = http.request('GET', url, timeout=5.0)  # Set timeout value to 5 seconds
            if resp.status == 200:
                location_info = json.loads(resp.data.decode())
                if location_info:
                    lat = float(location_info[0]['lat'])
                    lon = float(location_info[0]['lon'])
                    print(f"Geocoding successful for location: {location}. Latitude: {lat}, Longitude: {lon}")
                    return lat, lon
                # An empty result list is final: no retry is made for "not found"
                print(f"No geocoding results found for location: {location}")
                return None
            print(f"Geocoding failed for location: {location}. Status code: {resp.status}")
        except Exception as e:
            print(f"Geocoding failed for location: {location}. Error: {e}")
        # Pause only when another attempt will follow
        if attempt < max_retries:
            time.sleep(1)
    return None

# Get latitude and longitude for each location.
# Each distinct location is looked up once and the result is mapped back onto
# the rows, instead of issuing one geocoding request per row.
latitude_by_location = {}
longitude_by_location = {}
for location_name in df['Location'].unique():
    coordinates = get_coordinates(location_name)
    # A failed lookup (None) becomes NaN in both columns, so the assignment
    # below also works when no location could be geocoded at all.
    latitude_by_location[location_name], longitude_by_location[location_name] = (
        coordinates if coordinates else (float('nan'), float('nan'))
    )
df['Latitude'] = df['Location'].map(latitude_by_location)
df['Longitude'] = df['Location'].map(longitude_by_location)

# Function to calculate distance using haversine formula
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are angles in degrees: (lat1, lon1) is the first point
    and (lat2, lon2) the second. An Earth radius of 6371.0 km is used.
    """
    earth_radius_km = 6371.0
    phi1, lam1, phi2, lam2 = (radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine term built from the latitude and longitude differences
    hav = sin((phi2 - phi1) / 2) ** 2 + cos(phi1) * cos(phi2) * sin((lam2 - lam1) / 2) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

# Calculate each listing's distance (km) from the centre of its own city.
# The previous version passed the same point as both ends of the haversine
# calculation, so every distance was 0.
# NOTE(review): the "centre" is taken to be the mean latitude/longitude of the
# rows sharing a 'City' value (assumes housing_dataset.csv carries the 'City'
# column) — swap in another reference point if a different one was intended.
# Rows whose coordinates are NaN (failed lookups) get a NaN distance.
city_center_lat = df.groupby('City')['Latitude'].transform('mean')
city_center_lon = df.groupby('City')['Longitude'].transform('mean')
df['Distance'] = [
    calculate_distance(lat, lon, center_lat, center_lon)
    for lat, lon, center_lat, center_lon in zip(
        df['Latitude'], df['Longitude'], city_center_lat, city_center_lon
    )
]

# Rest of the code...
# ...


In [None]:
# Drop rows with any missing value (including rows whose geocoding failed);
# .copy() gives an independent frame for the column added in Step 4.
# This cell relies on `df` and the imports from the cell that geocodes the data.
df = df.dropna().copy()

# Step 3: Perform clustering based on city (standardised coordinates)
scaler = StandardScaler()
df_scaled_city = scaler.fit_transform(df[['Latitude', 'Longitude']])
# Replace '6' with the desired number of clusters for cities; seeded so labels are reproducible
kmeans_city = KMeans(n_clusters=6, random_state=42)
city_clusters = kmeans_city.fit_predict(df_scaled_city)

# Step 4: Perform clustering based on location within each city
df['CityCluster'] = city_clusters  # Add city cluster labels to the dataframe
df_clusters = []

for city_cluster_id in range(len(kmeans_city.cluster_centers_)):
    # .copy() so the new column is written to an independent frame, not to a slice of df
    df_city_cluster = df[df['CityCluster'] == city_cluster_id].copy()
    # A separate scaler leaves `scaler` fitted on the coordinates
    df_scaled_location = StandardScaler().fit_transform(df_city_cluster[['Distance']])
    # Replace '50' with the desired number of clusters for locations within each city.
    # Capped at the number of distinct distances, since KMeans cannot place
    # more centres than there are distinct points.
    n_location_clusters = max(1, min(50, df_city_cluster['Distance'].nunique()))
    kmeans_location = KMeans(n_clusters=n_location_clusters, random_state=42)
    location_clusters = kmeans_location.fit_predict(df_scaled_location)
    df_city_cluster['LocationCluster'] = location_clusters
    df_clusters.append(df_city_cluster)

# Step 5: Prepare the feature set for regression analysis
# ('Wifi' and 'Wardrobe' are deliberately not listed, so no filtering step is needed.)
features = [
    'No. of Bedrooms', 'Resale', 'MaintenanceStaff', 'Gymnasium', 'SwimmingPool', 'LandscapedGardens',
    'JoggingTrack', 'RainWaterHarvesting', 'IndoorGames', 'ShoppingMall', 'Intercom', 'SportsFacility',
    'ATM', 'ClubHouse', 'School', '24X7Security', 'PowerBackup', 'CarParking'
]

# Step 6: Perform regression analysis for each city cluster
# NOTE(review): 'LocationCluster' is computed above but not used here; one
# linear model is fitted per city cluster.
for df_cluster in df_clusters:
    X = df_cluster[features]
    y = df_cluster['Price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    regression_model = LinearRegression()
    regression_model.fit(X_train, y_train)

    y_pred = regression_model.predict(X_test)
    print(y_pred)  # Raw predictions for the held-out rows of this cluster
    mse = mean_squared_error(y_test, y_pred)

    print(f"Cluster Mean Squared Error: {mse}")