## Importing the libraries

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import  ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

import joblib

import warnings
warnings.filterwarnings('ignore')

## Load and Prepare Data

In [None]:
#Read the dataset and print the top 5 elements of the dataset
df = pd.read_csv('housePrice.csv')
df.head()

In [None]:
df.sample(5)

In [None]:
type(df)

# Explore the data (EDA)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df['Area'] = df['Area'].apply(lambda x: re.sub(',', '', x))
df["Area"] = pd.to_numeric(df["Area"] , errors='coerce')

In [None]:
df.info()

In [None]:
df

In [None]:
df.shape

In [None]:
# check for data type
print(df.dtypes)

In [None]:
df.isnull().sum()

In [None]:
# I will drop missing values although it is not necessary because used models can manage missing values.
df.dropna(inplace=True)

In [None]:
df.shape

In [None]:
df['Parking'].value_counts(normalize=True)*100

In [None]:
pd.crosstab( df.Parking, df.Room )

In [None]:
below_85 = df[ df.Area <= 85 ]
len(below_85)

In [None]:
df[ df.Area <= 85 ]["Parking"].value_counts( )

In [None]:
df.Address.unique()

In [None]:
len(df.Address.unique())

In [None]:
round(df.Price.mean())

In [None]:
df.groupby( 'Room' )['Price'].mean()

In [None]:
pd.set_option('display.float_format', lambda x: '%.f' % x)

In [None]:
room_parking_room_mean_df = df.groupby( ['Room','Parking'] )['Price'].mean().reset_index()
room_parking_room_mean_df

In [None]:
df2 = df.copy()

In [None]:
df2.loc[:, "Price"] =df["Price"].map('{:,.0f}'.format)

In [None]:
df2

In [None]:
df.dtypes

# Visualizaion

In [None]:
plt.figure(figsize=(8,5))
sns.displot(df['Price'] , bins=30 , kde=True )

In [None]:
fig, ax = plt.subplots(ncols=3, figsize=(18,6))

colors = [['#ADEFD1FF', '#00203FFF'], ['#97BC62FF', '#2C5F2D'], ['#F5C7B8FF', '#FFA177FF']]
explode = [0, 0.2]
columns = ['Parking', 'Warehouse', 'Elevator']
for i in range(3):
        data = df[columns[i]].value_counts()
        ax[i].pie(data, labels=data.values, explode=explode, colors=colors[i], shadow=True)
        ax[i].legend(labels=data.index, fontsize='large')
        ax[i].set_title('{} distribution'.format(columns[i]))

In [None]:
df3 = df['Address'].value_counts().copy()
df3 = df3[:8]


In [None]:
fig, ax = plt.subplots(figsize=(6,10))
sns.barplot(x=df3.values, y=df3.index,ax=ax)
plt.xlabel('numerical amount of flats')
plt.title('Number of flats in location')

In [None]:
df.columns

In [None]:
df.sort_values('Price',ascending=False)[['Address']].head(20)

In [None]:
df.isnull().sum()

In [None]:
df

# Model Building

In [None]:
df = df.drop(columns = ['Price(USD)'])
boolean_features = ['Parking','Warehouse','Elevator']
df[boolean_features] = df[boolean_features].astype('int64')

df.head()

In [None]:
print(f"Skewness of features:\n{df.skew()}")


In [None]:
plt.figure(figsize = (16,8))

plt.subplot(2,1,1)
sns.boxplot(x = df['Area'])

plt.subplot(2,1,2)
sns.boxplot(x = df['Price'])

In [None]:
def lower_upper(x):
    Q1 = np.percentile(x, 25)
    Q3 = np.percentile(x, 75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    
    return lower, upper

lower_area, upper_area = lower_upper(df['Area'])
lower_price, upper_price = lower_upper(df['Price'])

print(f"Lower limit for area: {lower_area:0.2f}")
print(f"Upper limit for area: {upper_area:0.2f}")
print(f"Lower limit for price: {lower_price:,}")
print(f"Upper limit for price: {upper_price:,}")

In [None]:
area_outliers = np.where(df['Area'] > upper_area)
price_outliers = np.where(df['Price'] > upper_price)
# Return the unique, sorted array of values that are in either of the two input arrays.
total_outliers = np.union1d(area_outliers, price_outliers)

print(f"Number of area outliers: {len(df.iloc[area_outliers])}")
print(f"Number of price outliers: {len(df.iloc[price_outliers])}")
print(f"Number of outliers: {len(df.iloc[total_outliers])}")

In [None]:
total_outliers

In [None]:
address_dummy = pd.get_dummies(df['Address'])
df_final = df.merge(address_dummy, left_index = True, right_index = True)
df_final.drop(columns = 'Address', inplace = True)
df_final.head(3)

In [None]:
df_final

In [None]:
X = df_final.drop(columns = 'Price')
y = df_final['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print(f"shape of x train: {X_train.shape}")
print(f"shape of y train: {y_train.shape}")
print(f"shape of x test: {X_test.shape}")
print(f"shape of y train: {y_test.shape}")

In [None]:
def parameter_finder (model, parameters):
    
    start = time.time()
    
    grid = GridSearchCV(model, 
                        param_grid = parameters, 
                        refit = True, 
                        cv = KFold(shuffle = True, random_state = 1), 
                        n_jobs = -1)
    grid_fit = grid.fit(X_train, y_train)
    y_train_pred = grid_fit.predict(X_train)
    y_pred = grid_fit.predict(X_test)
    
    train_score =grid_fit.score(X_train, y_train)
    test_score = grid_fit.score(X_test, y_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    
    model_name = str(model).split('(')[0]
    
    end = time.time()
    
    print(f"The best parameters for {model_name} model is: {grid_fit.best_params_}")
    print("--" * 10)
    print(f"(R2 score) in the training set is {train_score:0.2%} for {model_name} model.")
    print(f"(R2 score) in the testing set is {test_score:0.2%} for {model_name} model.")
    print(f"RMSE is {RMSE:,} for {model_name} model.")
    print("--" * 10)
    print(f"Runtime of the program is: {end - start:0.2f}")
    
       
    return train_score, test_score, RMSE

# Prediction

In [None]:
lr = LinearRegression().fit(X_train,y_train)

In [None]:
X_cols = ['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Abazar',
        'Abbasabad', 'Absard', 'Abuzar', 'Afsarieh', 'Ahang', 'Air force',
        'Ajudaniye', 'Alborz Complex', 'Aliabad South', 'Amir Bahador',
        'Amirabad', 'Amirieh', 'Andisheh', 'Aqdasieh', 'Araj', 'Argentina',
        'Atabak', 'Azadshahr', 'Azarbaijan', 'Azari', 'Baghestan', 'Bahar',
        'Baqershahr', 'Beryanak', 'Boloorsazi', 'Central Janatabad',
        'Chahardangeh', 'Chardangeh', 'Chardivari', 'Chidz', 'Damavand',
        'Darabad', 'Darakeh', 'Darband', 'Daryan No', 'Dehkade Olampic',
        'Dezashib', 'Dolatabad', 'Dorous', 'East Ferdows Boulevard',
        'East Pars', 'Ekbatan', 'Ekhtiarieh', 'Elahieh', 'Elm-o-Sanat',
        'Enghelab', 'Eram', 'Eskandari', 'Fallah', 'Farmanieh', 'Fatemi',
        'Feiz Garden', 'Firoozkooh', 'Firoozkooh Kuhsar', 'Gandhi',
        'Garden of Saba', 'Gheitarieh', 'Ghiyamdasht', 'Ghoba', 'Gholhak',
        'Gisha', 'Golestan', 'Haft Tir', 'Hakimiyeh', 'Hashemi', 'Hassan Abad',
        'Hekmat', 'Heravi', 'Heshmatieh', 'Hor Square', 'Islamshahr',
        'Islamshahr Elahieh', 'Javadiyeh', 'Jeyhoon', 'Jordan', 'Kahrizak',
        'Kamranieh', 'Karimkhan', 'Karoon', 'Kazemabad', 'Keshavarz Boulevard',
        'Khademabad Garden', 'Khavaran', 'Komeil', 'Koohsar', 'Kook', 'Lavasan',
        'Lavizan', 'Mahallati', 'Mahmoudieh', 'Majidieh', 'Malard', 'Marzdaran',
        'Mehrabad', 'Mehrabad River River', 'Mehran', 'Mirdamad',
        'Mirza Shirazi', 'Moniriyeh', 'Narmak', 'Nasim Shahr', 'Nawab',
        'Naziabad', 'Nezamabad', 'Niavaran', 'North Program Organization',
        'Northern Chitgar', 'Northern Janatabad', 'Northern Suhrawardi',
        'Northren Jamalzadeh', 'Ostad Moein', 'Ozgol', 'Pakdasht',
        'Pakdasht KhatunAbad', 'Parand', 'Parastar', 'Pardis', 'Pasdaran',
        'Persian Gulf Martyrs Lake', 'Pirouzi', 'Pishva', 'Punak', 'Qalandari',
        'Qarchak', 'Qasr-od-Dasht', 'Qazvin Imamzadeh Hassan', 'Railway', 'Ray',
        'Ray - Montazeri', 'Ray - Pilgosh', 'Razi', 'Republic', 'Robat Karim',
        'Rudhen', 'Saadat Abad', 'SabaShahr', 'Sabalan', 'Sadeghieh',
        'Safadasht', 'Salehabad', 'Salsabil', 'Sattarkhan', 'Seyed Khandan',
        'Shadabad', 'Shahedshahr', 'Shahr-e-Ziba', 'ShahrAra',
        'Shahrake Apadana', 'Shahrake Azadi', 'Shahrake Gharb',
        'Shahrake Madaen', 'Shahrake Qods', 'Shahrake Quds',
        'Shahrake Shahid Bagheri', 'Shahrakeh Naft', 'Shahran', 'Shahryar',
        'Shams Abad', 'Shoosh', 'Si Metri Ji', 'Sohanak', 'Southern Chitgar',
        'Southern Janatabad', 'Southern Program Organization',
        'Southern Suhrawardi', 'Tajrish', 'Tarasht', 'Taslihat', 'Tehran Now',
        'Tehransar', 'Telecommunication', 'Tenant', 'Thirteen November',
        'Vahidieh', 'Vahidiyeh', 'Valiasr', 'Vanak', 'Varamin - Beheshti',
        'Velenjak', 'Villa', 'Water Organization', 'Waterfall',
        'West Ferdows Boulevard', 'West Pars', 'Yaftabad', 'Yakhchiabad',
        'Yousef Abad', 'Zafar', 'Zaferanieh', 'Zargandeh', 'Zibadasht']

In [None]:
sample_data = [100, 2, 1, 1, 1] + [False] * (len(data) - 5)
sample_data[X_cols.index('Tajrish')] = True

In [None]:
sample_data = np.array(sample_data).reshape(1, -1)
prediction = lr.predict(sample_data)
print(prediction)


In [None]:
import numpy as np


sample_data = [100, 2, 1, 1, 1] + [False] * (len(data) - 5)
predictions = []

for i in data[data.index('Abazar'):]:
    sample_data[data.index(i)] = True
    sample_data_reshaped = np.array(sample_data).reshape(1, -1)
    prediction = int(lr.predict(sample_data_reshaped)[0])
    predictions.append(prediction)

print(predictions)


In [113]:
# print(min(predictions))
# print(max(predictions))

-50941169026
68491174368


In [None]:
import matplotlib.pyplot as plt

# Assuming you have the 'predictions' list
plt.rcParams.update({'font.size': 2})

x_values = range(len(predictions))

plt.plot(x_values, predictions, marker='o')
plt.xlabel('Area Index',labelpad=10)
plt.ylabel('House Price')
plt.title('Predicted House Prices')
plt.xticks(x_values, data[data.index('Abazar'):], rotation=90)
plt.tight_layout()
plt.show()


In [None]:
lr_train_score, lr_test_score, lr_RMSE = parameter_finder(lr, {})