In [1]:
import pandas as pd

In [2]:
!gdown 17avT1w01dj9ELs8UiJS7yOmQkjd06Q3h

Downloading...
From: https://drive.google.com/uc?id=17avT1w01dj9ELs8UiJS7yOmQkjd06Q3h
To: /content/housePrice.csv
  0% 0.00/155k [00:00<?, ?B/s]100% 155k/155k [00:00<00:00, 49.5MB/s]


In [3]:
df = pd.read_csv('housePrice.csv')

In [4]:
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
0,63,1,True,True,True,Shahran,1850000000
1,60,1,True,True,True,Shahran,1850000000
2,79,2,True,True,True,Pardis,550000000
3,95,2,True,True,True,Shahrake Qods,902500000
4,123,2,True,True,True,Shahrake Gharb,7000000000
...,...,...,...,...,...,...,...
3468,86,2,True,True,True,Southern Janatabad,3500000000
3469,83,2,True,True,True,Niavaran,6800000000
3470,75,2,False,False,False,Parand,365000000
3471,105,2,True,True,True,Dorous,5600000000


## Preprocessing

### 1

In [5]:
df.isna().sum()

Area          0
Room          0
Parking       0
Warehouse     0
Elevator      0
Address      23
Price         0
dtype: int64

In [6]:
df.dropna()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
0,63,1,True,True,True,Shahran,1850000000
1,60,1,True,True,True,Shahran,1850000000
2,79,2,True,True,True,Pardis,550000000
3,95,2,True,True,True,Shahrake Qods,902500000
4,123,2,True,True,True,Shahrake Gharb,7000000000
...,...,...,...,...,...,...,...
3468,86,2,True,True,True,Southern Janatabad,3500000000
3469,83,2,True,True,True,Niavaran,6800000000
3470,75,2,False,False,False,Parand,365000000
3471,105,2,True,True,True,Dorous,5600000000


### 2

In [14]:
df['Price'].describe()

count    3.473000e+03
mean     5.354621e+09
std      8.104058e+09
min      3.600000e+06
25%      1.415000e+09
50%      2.880000e+09
75%      6.000000e+09
max      9.240000e+10
Name: Price, dtype: float64

In [15]:
count,mean,std,min,Q1,Q2,Q3,max = df['Price'].describe()

In [18]:
def get_price_level(price, q1, q2, q3):
    if price < q1:
        return 'cheap'
    elif price <= q2:
        return 'underMean'
    elif price <= q3:
        return 'upperMean'
    else:
        return 'expensive'

In [19]:
df['priceLevel'] = df['Price'].apply(get_price_level, q1=Q1, q2=Q2, q3=Q3)

### 3

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
le = LabelEncoder()
df['Parking'] = le.fit_transform(df['Parking'])
df['Warehouse'] = le.fit_transform(df['Warehouse'])
df['Elevator'] = le.fit_transform(df['Elevator'])
df['Address'] = le.fit_transform(df['Address'])
df['priceLevel'] = le.fit_transform(df['priceLevel'])

In [22]:
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,priceLevel
0,63,1,1,1,1,156,1850000000,2
1,60,1,1,1,1,156,1850000000,2
2,79,2,1,1,1,117,550000000,0
3,95,2,1,1,1,152,902500000,0
4,123,2,1,1,1,150,7000000000,1
...,...,...,...,...,...,...,...,...
3468,86,2,1,1,1,163,3500000000,3
3469,83,2,1,1,1,105,6800000000,1
3470,75,2,0,0,0,115,365000000,0
3471,105,2,1,1,1,39,5600000000,3


### 4

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
scaler = MinMaxScaler()
df['Parking'] = scaler.fit_transform(df[['Parking']])
df['Warehouse'] = scaler.fit_transform(df[['Warehouse']])
df['Elevator'] = scaler.fit_transform(df[['Elevator']])
df['Address'] = scaler.fit_transform(df[['Address']])
df['Area'] = scaler.fit_transform(df[['Area']])
df['Room'] = scaler.fit_transform(df[['Room']])
df['Price'] = scaler.fit_transform(df[['Price']])

In [26]:
df

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price,priceLevel
0,0.036707,0.2,1.0,1.0,1.0,0.812500,0.019983,2
1,0.033370,0.2,1.0,1.0,1.0,0.812500,0.019983,2
2,0.054505,0.4,1.0,1.0,1.0,0.609375,0.005914,0
3,0.072303,0.4,1.0,1.0,1.0,0.791667,0.009729,0
4,0.103448,0.4,1.0,1.0,1.0,0.781250,0.075722,1
...,...,...,...,...,...,...,...,...
3468,0.062291,0.4,1.0,1.0,1.0,0.848958,0.037841,3
3469,0.058954,0.4,1.0,1.0,1.0,0.546875,0.073557,1
3470,0.050056,0.4,0.0,0.0,0.0,0.598958,0.003911,0
3471,0.083426,0.4,1.0,1.0,1.0,0.203125,0.060569,3


### 5

In [27]:
from sklearn.model_selection import train_test_split

In [35]:
X = df.drop('priceLevel', axis=1)
y = df['priceLevel']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [37]:
print(X_train)
print(y_train)

          Area  Room  Parking  Warehouse  Elevator   Address     Price
1822  0.150167   0.6      1.0        1.0       1.0  0.703125  0.113602
2335  0.030033   0.2      1.0        1.0       0.0  0.734375  0.009377
3350  0.055617   0.4      1.0        1.0       1.0  0.135417  0.032430
3049  0.133482   0.6      1.0        1.0       1.0  0.619792  0.008241
611   0.100111   0.4      1.0        1.0       1.0  0.791667  0.011325
...        ...   ...      ...        ...       ...       ...       ...
1095  0.083426   0.4      1.0        1.0       1.0  0.760417  0.044335
1130  0.211346   0.6      1.0        1.0       1.0  0.354167  0.166634
1294  0.063404   0.4      1.0        1.0       1.0  0.546875  0.049747
860   0.139043   0.6      1.0        1.0       1.0  0.781250  0.108190
3174  0.100111   0.4      1.0        1.0       1.0  0.661458  0.016585

[2778 rows x 7 columns]
1822    1
2335    0
3350    3
3049    0
611     0
       ..
1095    3
1130    1
1294    3
860     1
3174    2
Name: priceLe

In [38]:
print(X_test)
print(y_test)

          Area  Room  Parking  Warehouse  Elevator   Address     Price
3000  0.127920   0.6      1.0        1.0       1.0  0.791667  0.018793
1681  0.074527   0.4      1.0        1.0       1.0  0.848958  0.041954
812   0.105673   0.6      1.0        1.0       1.0  0.635417  0.068146
3164  0.027809   0.2      1.0        1.0       1.0  0.953125  0.023230
229   0.048943   0.4      1.0        1.0       1.0  0.052083  0.018382
...        ...   ...      ...        ...       ...       ...       ...
547   0.066741   0.4      1.0        1.0       1.0  0.635417  0.045417
764   0.066741   0.4      1.0        1.0       1.0  0.005208  0.043253
3428  0.063404   0.4      1.0        1.0       1.0  1.000000  0.007321
3245  0.038932   0.4      1.0        1.0       1.0  0.812500  0.025395
568   0.281424   0.6      1.0        1.0       1.0  0.984375  0.321402

[695 rows x 7 columns]
3000    2
1681    3
812     1
3164    2
229     2
       ..
547     3
764     3
3428    0
3245    2
568     1
Name: priceLev

In [39]:
print('X_train size:', X_train.shape)
print('y_train size:', y_train.shape)
print('X_test size:', X_test.shape)
print('y_test size:', y_test.shape)

X_train size: (2778, 7)
y_train size: (2778,)
X_test size: (695, 7)
y_test size: (695,)
