In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import datetime as dt
import warnings 

warnings.simplefilter(action='ignore')

In [None]:
# Load the Paris housing CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(r'../input/paris-housing-price-prediction/ParisHousing.csv')
# Show the first rows as the cell output.
df.head()

In [None]:
# Work on a copy so the frame loaded into `df` stays unmodified by the later column edits.
data = df.copy()

In [None]:
data.shape

In [None]:
data.isna().sum()

# Describe the data for a better understanding of the columns

In [None]:
data.describe().T

In [None]:
data.dtypes

# Checking the number of unique values in each column for a better visualisation

In [None]:
# Report how many distinct values each column holds (a missing value counts as one value).
cols = data.columns
for column in cols:
    distinct = data[column].nunique(dropna=False)
    print(f"The unique value in the cols {column} are {distinct}")

# Checking the value counts of each col

In [None]:
# Number of distinct non-missing values per column, skipping the first and last entries of `cols`.
for column in cols[1:-1]:
    n_distinct = data[column].nunique()
    print(f'Value counts for {column} is {n_distinct}')

# EDA on the data

In [None]:
# Donut chart of the ten most frequent room counts.
# value_counts() sorts by frequency (descending), so head(10) keeps the ten most
# common values; counting once keeps labels and values guaranteed to line up.
room_counts = data['numberOfRooms'].value_counts().head(10)
labels = room_counts.index
values = room_counts

fig = px.pie(names=labels, values=values, hole=0.5)
fig.update_layout(
    title='Distribution based on Number of Rooms',
    template='plotly_dark',
    hoverlabel=dict(font_size=16, font_family='Helvetica'),
)
# Each slice value is the number of houses that have this many rooms.
fig.update_traces(hovertemplate='No. of Rooms: %{label}<br> Number of houses in dataset: %{value}')
fig.show()

# Function to plot the distribution of any column having < 10 unique values

In [None]:
def pie_chart(col):
    """Show a donut chart of the value distribution of one column of ``data``.

    Reads the notebook-level ``data`` frame, counts how often each distinct
    value of ``data[col]`` occurs and renders the counts with Plotly Express.

    Parameters
    ----------
    col : str
        Name of the column in ``data`` to summarise.

    Returns
    -------
    The return value of ``fig.show()``; the figure is displayed as a side effect.
    """
    # Count once and reuse for both the slice names and the slice sizes.
    counts = data[col].value_counts()

    # The pink/red palette has only two colours; with three or more categories
    # slices would repeat colours, so fall back to Plotly's default sequence.
    palette = ['pink', 'red'] if len(counts) <= 2 else None

    fig = px.pie(names=counts.index, values=counts, hole=0.5, color_discrete_sequence=palette)
    fig.update_layout(
        title=f'Distribution of House based on {col}',
        template='plotly_dark',
        hoverlabel=dict(font_size=16, font_family='Helvetica'),
    )
    fig.update_traces(hovertemplate=f'{col}' + ': %{label}<br>%{value}')
    return fig.show()

In [None]:
pie_chart('hasPool')

In [None]:
# Parse the `made` values with the four-digit-year format '%Y', replacing the column with datetimes.
data['made'] = pd.to_datetime(data['made'], format='%Y')

In [None]:
data.head()

In [None]:
# Store the calendar year of each parsed `made` datetime in a new column.
data['yearBuiltIn'] = data['made'].dt.year

In [None]:
# Remove the columns that are not used from here on.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for columns already gone.
data = data.drop(columns=['cityCode', 'made'], errors='ignore')

In [None]:
data.head()

In [None]:
# Bar chart of how many houses were built in each year.
# Count once and hand the two arrays straight to Plotly Express; the full `data`
# frame is not needed as data_frame because x and y are already array-likes.
year_counts = data['yearBuiltIn'].value_counts()

fig = px.bar(x=year_counts.index, y=year_counts.values)
fig.update_layout(title='Number of Houses built in each Year.', template='plotly_dark')
fig.update_traces(hovertemplate='%{x} : %{y}')
fig.update_xaxes(title="Year", showline=True, linewidth=1, linecolor='white')
fig.update_yaxes(title="Count", showline=True, linewidth=1, linecolor='white', showgrid=False)
fig.show()

In [None]:
# Sum every column per construction year; the year is kept as a regular column.
year_df = data.groupby('yearBuiltIn', as_index=False).sum()

In [None]:
# Line chart of the summed house price for each construction year.
axis_style = dict(showline=True, linewidth=1, linecolor='white', showgrid=False)
hover_style = dict(font_size=18, bgcolor='white')

fig = px.line(year_df, x='yearBuiltIn', y='price')
fig.update_layout(
    title='Sum of price of all houses in each year',
    template='plotly_dark',
    hoverlabel=hover_style,
)
fig.update_traces(hovertemplate='Year %{x}: Price %{y}')
fig.update_xaxes(title="Year", **axis_style)
fig.update_yaxes(title="Price", **axis_style)
fig.show()

In [None]:
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, Normalizer, MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score

In [None]:
# Separate the feature matrix (every column except price) from the regression target (price).
X = data.drop(columns=['price'])
y = data['price']

In [None]:
def normalizer(x_train, x_test):
  """Scale both splits with a single sklearn ``Normalizer``.

  The scaler is fitted on the training split only and then applied to the
  test split, the same fit-on-train / transform-on-test pattern used by the
  other scaling helpers in this notebook.

  Parameters
  ----------
  x_train, x_test : array-like
      Training and test feature matrices.

  Returns
  -------
  tuple
      ``(x_train_scaled, x_test_scaled)``.
  """
  scaler = Normalizer()
  x_train = scaler.fit_transform(x_train)
  # Never fit on the test split; only apply the already-fitted transformer.
  x_test = scaler.transform(x_test)
  return x_train, x_test

In [None]:
def minmax(x_train, x_test):
  """Min-max scale both splits using statistics learned from the training split.

  The scaler is fitted on ``x_train`` only; ``x_test`` is transformed with the
  training minimum/maximum so both splits share one scale and no information
  from the test split leaks into preprocessing. Test values outside the
  training range therefore map outside ``[0, 1]``.

  Parameters
  ----------
  x_train, x_test : array-like
      Training and test feature matrices with the same columns.

  Returns
  -------
  tuple
      ``(x_train_scaled, x_test_scaled)``.
  """
  scaler = MinMaxScaler()
  x_train = scaler.fit_transform(x_train)
  # Re-fitting here would scale the test split by its own min/max.
  x_test = scaler.transform(x_test)
  return x_train, x_test

In [None]:
def stdscaler(x_train, x_test):
  """Standardise both splits using the mean/std learned from the training split.

  The scaler is fitted on ``x_train`` only; ``x_test`` is transformed with the
  training statistics so both splits share one scale and no information from
  the test split leaks into preprocessing.

  Parameters
  ----------
  x_train, x_test : array-like
      Training and test feature matrices with the same columns.

  Returns
  -------
  tuple
      ``(x_train_scaled, x_test_scaled)``.
  """
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  # Re-fitting here would centre/scale the test split by its own statistics.
  x_test = scaler.transform(x_test)
  return x_train, x_test

In [None]:
def best_model(X, y, scaler, algo, random_state=42):
    """Train ``algo`` on a scaled train/test split, report errors and plot predictions.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature table.
    y : array-like or Series
        Target values aligned with ``X``.
    scaler : callable
        ``(x_train, x_test) -> (x_train_scaled, x_test_scaled)``, e.g. ``minmax``.
    algo : callable
        Zero-argument callable returning an estimator with ``fit`` / ``predict``
        (typically an sklearn estimator class).
    random_state : int, default 42
        Seed for the 75/25 split and, when the estimator exposes a
        ``random_state`` parameter, for the estimator as well, so repeated runs
        and scaler-vs-scaler comparisons are reproducible.

    Returns
    -------
    tuple
        ``(predictions, y_test, mae, shown)`` where ``mae`` is the mean absolute
        error on the test split (callers store it in ``*_mse`` variables) and
        ``shown`` is the return value of ``fig.show()``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state, shuffle=True
    )

    # The scaling helpers work on plain arrays.
    X_train, X_test = scaler(np.asarray(X_train), np.asarray(X_test))

    model = algo()
    # Seed stochastic estimators (trees, forests); deterministic ones are untouched.
    if hasattr(model, 'get_params') and 'random_state' in model.get_params():
        model.set_params(random_state=random_state)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # sklearn metrics take (y_true, y_pred); compute each metric once and reuse it.
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    print(f'The MAE is {mae}')
    print(f'The MSE is {mse}')

    fig = px.scatter(x=predictions, y=y_test, template='plotly_dark', title='Actual Price vs Predictions')
    fig.update_traces(hovertemplate='Predicted Value : %{x} <br> Actual Value: %{y}')
    fig.update_layout(hoverlabel=dict(
        font_size=20,
        bgcolor='white',
        font_family='Helvetica'
    ))
    fig.update_xaxes(title='Predicted Values', showgrid=False)
    fig.update_yaxes(title='Actual Values', showgrid=False)

    return predictions, y_test, mae, fig.show()

In [None]:
# Linear regression with the `normalizer` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so l_norm_mse holds an MAE.
lpred, l_y_test,l_norm_mse, plot = best_model(X, y, normalizer, LinearRegression)

In [None]:
# Linear regression with the `minmax` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so l_min_mse holds an MAE.
lpred, l_y_test, l_min_mse,  plot = best_model(X, y, minmax, LinearRegression)

In [None]:
# Linear regression with the `stdscaler` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so l_std_mse holds an MAE.
lpred, l_y_test, l_std_mse, plot = best_model(X, y, stdscaler, LinearRegression)

In [None]:
# Decision tree with the `normalizer` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so d_norm_mse holds an MAE.
dpred, d_y_test, d_norm_mse, plot = best_model(X, y, normalizer, DecisionTreeRegressor)

In [None]:
# Decision tree with the `minmax` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so d_min_mse holds an MAE.
dpred, d_y_test, d_min_mse, plot = best_model(X, y, minmax, DecisionTreeRegressor)

In [None]:
# Decision tree with the `stdscaler` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so d_std_mse holds an MAE.
dpred, d_y_test, d_std_mse, plot = best_model(X, y, stdscaler, DecisionTreeRegressor)

In [None]:
# Random forest with the `normalizer` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so r_norm_mse holds an MAE.
rpred, r_y_test, r_norm_mse, plot = best_model(X, y, normalizer,  RandomForestRegressor)

In [None]:
# Random forest with the `minmax` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so r_min_mse holds an MAE.
rpred, r_y_test, r_min_mse, plot = best_model(X, y, minmax,  RandomForestRegressor)

In [None]:
# Random forest with the `stdscaler` scaling helper.
# NOTE: best_model returns the mean absolute error in its third slot, so r_std_mse holds an MAE.
rpred, r_y_test, r_std_mse, plot = best_model(X, y, stdscaler,  RandomForestRegressor)

# Since Min Max Scaler gives the least amount of error, we use that

In [None]:
# Compare the three algorithms on the min-max-scaled runs.
# The collected values are mean absolute errors (best_model returns the MAE in its
# third slot), so the hover text is labelled MAE to agree with the chart title.
labels = ['LinearRegression', 'DecisionTreeRegressor', 'RandomForestRegressor']
values = [l_min_mse, d_min_mse, r_min_mse]

fig = px.pie(names=labels, values=values, hole=0.6)
fig.update_layout(title='Mean Absolute Error wrt Algos', template='plotly_dark')
fig.update_traces(hovertemplate='%{label} : <br> MAE: %{value}')
fig.show()