#### Data source

https://www.kaggle.com/datasets/ruchi798/housing-prices-in-metropolitan-areas-of-india

#### As a continuation of:

https://www.kaggle.com/code/mosesgwaza/housing-prices-prediction-in-bangalore-india

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from ipywidgets import IntSlider, interact
from glob import glob
from scipy.stats import pearsonr

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)


#### Building a function to:

1. Read a CSV file
2. Remove every row that contains the value 9 anywhere in the dataset, as 9 does not fit the 0/1 (no/yes) encoding
3. Remove 'Wifi' and 'Wardrobe' columns because they only contain the value 0
4. Remove rows whose 'Price' value is $100,000,000.00 or higher

In [None]:
def wrangle(filename):
    """Load one CSV file and return a cleaned DataFrame.

    Cleaning steps, in order:
      * every 9 is turned into NaN and rows containing NaN are dropped,
        so only the 0/1 (no/yes) flags remain;
      * the 'Wifi' and 'Wardrobe' columns are removed;
      * only rows with 'Price' strictly below 100000000 are kept.

    Parameters
    ----------
    filename : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the cleaned records (original row labels kept).
    """
    raw = pd.read_csv(filename)

    # Swapping 9 for NaN lets a single dropna() discard those rows; rows that
    # already had missing values are discarded along with them.
    cleaned = raw.replace(9, np.nan).dropna()

    # Per the notebook's notes these two columns only ever contain 0.
    cleaned = cleaned.drop(columns=['Wifi', 'Wardrobe'])

    # Keep listings priced below the 100000000 cut-off.
    return cleaned[cleaned["Price"] < 100000000]

Using **Glob** to store all the datasets in one list and then combining them together.

In [None]:
# Collect the path of every entry in the Kaggle dataset directory and display the list.
files = glob("../input/housing-prices-in-metropolitan-areas-of-india/*")
files

In [None]:
# Replace the placeholders with the actual file paths to your datasets
file_paths = [
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Bangalore.csv",
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Chennai.csv",
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Hyderabad.csv",
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Delhi.csv",
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Kolkata.csv",
    r"D:\Housing Prices Prediction\All Metropolitan Cities\Mumbai.csv",
]

# Load every city through wrangle() so the cleaning it implements (rows with
# 9 removed, 'Wifi'/'Wardrobe' dropped, prices of 100000000 or more filtered
# out) is actually applied; a plain pd.read_csv would leave all of that in.
frames = [wrangle(file_path) for file_path in file_paths]

# Preliminary merge; the frames are merged again once the 'City' column exists.
df = pd.concat(frames, ignore_index=True)

In [None]:
from pathlib import PureWindowsPath

# Derive each city name from its file name ("...\Bangalore.csv" -> "Bangalore").
# PureWindowsPath understands both "\" and "/" separators on any platform, so
# this also works for forward-slash paths such as the ones glob() returns.
city_names = [PureWindowsPath(file_path).stem for file_path in file_paths]

# Tag every per-city frame with the city it came from.
for frame, city_name in zip(frames, city_names):
    frame["City"] = city_name


In [None]:
# Combining the per-city datasets into a single dataframe
df = pd.concat(frames, ignore_index=True)

# df.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()
df.head()

In [None]:
# (number of rows, number of columns) of the combined dataframe
df.shape

#### The dataframe properties

After importing and merging the dataframe it is noted that the dataframe has:

1. 38 columns
2. 10063 records

#### Checking the correlation of 'Price' column in comparison with other columns

Using:
    
    1. Statistical representation
    2. Using a heatmap for graphical presentation
    3. Using Pearson correlation of the noted columns
    4. Using a Scatterplot to see the relationship
   

In [None]:
# Using a Statistical representation
# Only numeric columns can be correlated. The dataframe also holds text
# columns (e.g. 'City'), and recent pandas versions raise an error on them
# instead of silently skipping them, so they are filtered out first.
df.select_dtypes(include='number').corr()

In [None]:
# Using a heatmap
# Object (text) columns are left out first, as they cannot be correlated
heatmap_df = df.select_dtypes(exclude='object')
correlations = heatmap_df.corr()

# Large canvas so the annotated cells stay readable
fig, ax = plt.subplots(figsize=(30, 25))
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
# Comparing 'Price' with 'Area' using Pearson correlation because they have a correlation of 0.73
# pearsonr reports the correlation coefficient together with its p-value

pearsonr(df['Price'], df['Area'])

In [None]:
# Scatter plot of 'Area' against 'Price' to see the positive relationship
ax = sns.scatterplot(data=df, x='Price', y='Area')
ax.set_xlabel('Price')
ax.set_ylabel('Area')
ax.set_title('India Metropolitan Areas: Price vs Area');

In [None]:
# Using a regression plot to draw a fitted trend line through the points
sns.regplot(x=df['Price'], y=df['Area'])
plt.ylim(0,)  # start the y-axis at zero
plt.xlabel('Price')
plt.ylabel('Area')
# The dataframe holds every metropolitan area, not only Bangalore, so the
# title names the whole dataset.
plt.title('India Metropolitan Areas: Price vs Area');

#### Time for Modeling

**The Machine Learning workflow**

    1. Data Preparation
        - import
        - explore
        - split
    
    2. Building a Model
        - baseline
        - iterate
        - evaluate

    3. Results communication


#### SPLIT

Splitting the Dataframe into feature matrix and target vector

In [None]:
# 'Price' is what we predict; 'Area' and 'City' are the inputs
target = "Price"
feature = ["Area","City"]

# Feature matrix and target vector taken from the combined dataframe
X_train, y_train = df[feature], df[target]

Creating a baseline (always predicting the mean price)

In [None]:
# Baseline model: predict the mean price for every single record
y_mean = y_train.mean()
y_pred_baseline = [y_mean for _ in range(len(y_train))]


# Sanity check: there must be exactly one baseline prediction per record
len(y_pred_baseline) == len(y_train)

Finding the Mean absolute Error (MAE)

In [None]:
# Average absolute error made when the mean price is predicted everywhere
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

# Report both figures rounded to two decimals
for label, value in (("Mean apt price", y_mean), ("Baseline MAE:", mae_baseline)):
    print(label, round(value, 2))

In [None]:
# Difference between the mean price and the baseline MAE
y_mean - mae_baseline

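The mean apartment price is \\$9,869,901.59 whilst the Baseline MAE is \\$5,936,449.77. This means that by following this Baseline model (always predicting the mean price), we would be off by about \\$5,936,449.77 on average; the difference between the mean price and the Baseline MAE is \\$3,933,451.8151296536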

#### The next steps:
    
    1. Creating the model
    2. Fitting the model
    3. Making predictions with the model

In [None]:
# model = LinearRegression()

In [None]:
import tensorflow as tf
from tensorflow.keras import layers


In [None]:
# Fully connected regression network: 7 input features, five ReLU hidden
# layers (128-256-512-256-128 units) and a single linear output unit.
hidden_units = [256, 512, 256, 128]
model = tf.keras.Sequential(
    [layers.Dense(128, activation='relu', input_shape=(7,))]
    + [layers.Dense(units, activation='relu') for units in hidden_units]
    + [layers.Dense(1)]
)

# Mean squared error loss, optimised with Adam
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the city names; the fitted encoder is kept so the same
# encoding can be applied again later.
city_data = df["City"].values.reshape(-1, 1)
encoder = OneHotEncoder()
city_encoded = encoder.fit_transform(city_data).toarray()

# Feature matrix: the one-hot city columns first, 'Area' as the last column
area_column = df["Area"].values.reshape(-1, 1)
X_train = np.hstack((city_encoded, area_column))

In [None]:
# Train the network for 25 epochs using batches of 32 samples
model.fit(X_train, y_train, epochs=25, batch_size=32)




In [None]:
# Inspect the feature matrix that was fed to the model
print(X_train)

In [None]:
# model.fit(X_train, y_train)

In [None]:
# Predict on the training features and show the first five predictions
y_pred_training = model.predict(X_train)
y_pred_training[:5]

In [None]:
# Mean absolute error of the model's predictions on the training data
mae_training = mean_absolute_error(y_train, y_pred_training)

mae_training_rounded = round(mae_training, 2)
print("Training MAE:", mae_training_rounded)

In [None]:
# Seeing how much the Baseline Model has been beat
# (a positive value means the training MAE is lower than the baseline MAE)
mae_baseline - mae_training

As shown, the performance of the model beat the baseline by 2310636.587893166

#### The formula for the model developed:

In [None]:
# intercept = round(model.intercept_, 2)
# print("Model Intercept:", intercept)

In [None]:
# coefficient = round(model.coef_[0], 2)
# print('Model coefficient for "Area":', coefficient)

In [None]:
# The formula

# print(f"apt_price = {intercept} + {coefficient} * Area")

#### Predictions


In [None]:
def make_prediction(area, city):
    """Predict the price of an apartment from its area and city.

    Relies on two notebook-level objects: ``encoder`` (the fitted
    OneHotEncoder for city names) and ``model`` (the trained network).

    Parameters
    ----------
    area : numeric area of the apartment.
    city : city name; it is passed to ``encoder.transform``.

    Returns
    -------
    str of the form "Predicted apartment price: $<price>", with the price
    rounded to two decimals.
    """
    # One row of one-hot city columns
    city_encoded = encoder.transform([[city]]).toarray()

    # Same column layout as the training matrix: city columns, then area
    data = np.concatenate((city_encoded, [[area]]), axis=1)

    # model.predict returns an array; take the single value out of it so the
    # message reads "$123.45" rather than "$[123.45]".
    prediction = round(float(np.ravel(model.predict(data))[0]), 2)
    return f"Predicted apartment price: ${prediction}"

make_prediction(3402, "Bangalore")



In [None]:
# Testing the model using 3402 as 'Area' and "Bangalore" as the city
make_prediction(3402,"Bangalore")

In [None]:
# Show the first five rows of the feature matrix and its (rows, columns) shape
print(X_train[:5])
print(X_train.shape)


In [None]:
# Column 6 of X_train is used for the slider's range and starting value
area_values = X_train[:, 6]
area_slider = IntSlider(
    min=int(area_values.min()),
    max=int(area_values.max()),
    value=int(area_values.mean()),
)

# Interactive widget: choose a city and an area to get a price prediction
interact(
    make_prediction,
    city=["Bangalore", "Chennai", "Hyderabad", "Delhi", "Kolkata", "Mumbai"],
    area=area_slider,
)
