## Importing Needed Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Read dataset

In [3]:
# Load the raw listings table from the project's data folder.
df = pd.read_csv(filepath_or_buffer='./data/house_price.csv')
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
0,63,1,True,True,True,Shahran,1850000000.0,61666.67
1,60,1,True,True,True,Shahran,1850000000.0,61666.67
2,79,2,True,True,True,Pardis,550000000.0,18333.33
3,95,2,True,True,True,Shahrake Qods,902500000.0,30083.33
4,123,2,True,True,True,Shahrake Gharb,7000000000.0,233333.33


## Data Exploration

#### General Information
##### Only the Address column has missing values (3456 non-null out of 3479 rows), so it is the only column that needs null handling.

In [5]:
# Print each column's dtype and non-null count to spot missing values and wrongly typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3479 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3479 non-null   object 
 1   Room        3479 non-null   int64  
 2   Parking     3479 non-null   bool   
 3   Warehouse   3479 non-null   bool   
 4   Elevator    3479 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3479 non-null   float64
 7   Price(USD)  3479 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 146.2+ KB


#### General Description
##### describe() summarizes only Room, Price and Price(USD), because the other columns are not numeric yet and must be converted first.

In [10]:
# Summary statistics; only numeric columns are included by default, so 'Area' (still object dtype) is absent.
df.describe()

Unnamed: 0,Room,Price,Price(USD)
count,3479.0,3479.0,3479.0
mean,2.079908,5359023000.0,178634.1
std,0.758275,8099935000.0,269997.8
min,0.0,3600000.0,120.0
25%,2.0,1418250000.0,47275.0
50%,2.0,2900000000.0,96666.67
75%,2.0,6000000000.0,200000.0
max,5.0,92400000000.0,3080000.0


#### Columns Name

In [6]:
# List the column labels of the loaded table.
df.columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Address', 'Price',
       'Price(USD)'],
      dtype='object')

#### Shape Dataset

In [7]:
# (number of rows, number of columns) of the table before any cleaning.
df.shape

(3479, 8)

#### Number of unique data

In [8]:
# Count the distinct values in each column.
df.nunique()

Area          243
Room            6
Parking         2
Warehouse       2
Elevator        2
Address       192
Price         934
Price(USD)    932
dtype: int64

### Drop null cells in the address

In [12]:
# True when at least one Address entry is missing.
check_nan = df['Address'].isna().to_numpy().any()
check_nan

True

#### count of null in address

In [14]:
# Number of rows whose Address is missing (each missing flag counts as 1 in the sum).
count_nan = df['Address'].isna().sum()
count_nan

23

#### Drop the null data

In [18]:
# Remove, in place, every row that has a missing value in any column,
# then re-check the non-null counts to confirm nothing is missing any more.
df.dropna(axis='index', how='any', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3456 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3456 non-null   object 
 1   Room        3456 non-null   int64  
 2   Parking     3456 non-null   bool   
 3   Warehouse   3456 non-null   bool   
 4   Elevator    3456 non-null   bool   
 5   Address     3456 non-null   object 
 6   Price       3456 non-null   float64
 7   Price(USD)  3456 non-null   float64
dtypes: bool(3), float64(2), int64(1), object(2)
memory usage: 172.1+ KB


#### Check null in address

#### Next we fix the column types. For example, Area should be numeric but is stored as object, which means some of its cells contain non-numeric text. Other columns, such as Parking, Warehouse and Elevator, also need to be converted to numbers.

#### Casting Area to an integer type directly raises an error because some cells contain non-numeric strings, so we first remove those rows and then change the type.

In [20]:
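# Intentionally left commented out: this cast raises an error while 'Area' still contains non-numeric strings.
# df['Area'] = df['Area'].astype('int64')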

#### The following code finds the rows whose Area value is not purely numeric and displays them.
#### Note that these rows are not just badly formatted strings; their values are also outliers.

In [22]:
# Flag the rows whose Area string is made up only of numeric characters ...
area_is_numeric = df['Area'].str.isnumeric()
# ... and keep the complement: the rows that cannot be cast to an integer as-is.
unreasonable_numeric_area = df.loc[~area_is_numeric]
unreasonable_numeric_area

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,Price(USD)
570,3310000000,2,True,True,True,Ostad Moein,3310000000.0,110333.33
709,16160000000,3,True,True,True,Pasdaran,16160000000.0,538666.67
807,1000,2,True,True,False,Damavand,7000000000.0,233333.33
1604,8400000000,2,True,True,True,Gheitarieh,8700000000.0,290000.0
2171,3600,2,False,False,False,Shahryar,9720000000.0,324000.0
2802,2550000000,2,True,True,True,Central Janatabad,2550000000.0,85000.0


#### Now we drop those rows and then change the type

In [25]:
# Index labels of the rows whose Area is not a purely numeric string.
non_numeric_area_idx = df.index[~df['Area'].str.isnumeric().to_numpy()]
# Drop those rows in place; after that every remaining Area value can be cast to an integer.
df.drop(index=non_numeric_area_idx, inplace=True)
df['Area'] = df['Area'].astype('int64')
# Confirm that Area is now int64 and check how many rows remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3450 entries, 0 to 3478
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3450 non-null   int64  
 1   Room        3450 non-null   int64  
 2   Parking     3450 non-null   bool   
 3   Warehouse   3450 non-null   bool   
 4   Elevator    3450 non-null   bool   
 5   Address     3450 non-null   object 
 6   Price       3450 non-null   float64
 7   Price(USD)  3450 non-null   float64
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 171.8+ KB


#### Convert the boolean columns (Parking, Warehouse, Elevator) and the Address column to numbers

In [None]:
# Remove outliers with a z-score filter on the numeric columns.
#
# Only int/float columns are scored. The boolean flags (Parking, Warehouse,
# Elevator) are deliberately excluded: scored as 0/1, a sufficiently rare flag
# value gets |z| >= 3 and its rows would be discarded as "outliers" even though
# the value is perfectly valid. Mixing bool and numeric columns also makes the
# frame convert to an object-dtype array, which zscore is not meant to handle.
#
# NOTE: this cell rebinds `df`, so re-running it filters the already-filtered
# data again; re-run the notebook from the top instead of repeating this cell.
Z_THRESHOLD = 3  # keep values strictly less than 3 standard deviations from the column mean

numeric_cols = df.select_dtypes(include='number').columns
# Absolute z-score of every numeric cell, computed per column.
z = np.abs(stats.zscore(df[numeric_cols].to_numpy(dtype=float)))
# A row is kept only if all of its numeric values are below the threshold.
# .copy() gives later cells an independent frame they can safely add columns to.
df = df[(z < Z_THRESHOLD).all(axis=1)].copy()