In [None]:
import csv
import numpy as np
import statistics
import pandas as pd
# Load the raw property listings; the path is relative to the notebook's working directory.
data = pd.read_csv('property_listing_data_in_Bangladesh.csv')

In [None]:
# NOTE(review): only the last expression of a cell is echoed, so the head(100)
# preview is computed and then discarded; the visible result is the row count.
data.head(100)
len(data)

7557

In [None]:
# Keep every listing whose type is neither 'Duplex' nor 'Building',
# then show the remaining type column.
new_data = data[~data['type'].isin(['Duplex', 'Building'])]
new_data['type']

0       Apartment
1       Apartment
2       Apartment
3       Apartment
4       Apartment
          ...    
7551    Apartment
7553    Apartment
7554    Apartment
7555    Apartment
7556    Apartment
Name: type, Length: 7489, dtype: object

In [None]:
# Remove the descriptive/metadata columns and show what is left.
new_data = new_data.drop(
    columns=[
        'title',
        'adress',
        'type',
        'purpose',
        'flooPlan',
        'url',
        'lastUpdated',
    ]
)
new_data

Unnamed: 0,beds,bath,area,price
0,3,4,"2,200 sqft",50 Thousand
1,3,4,"1,400 sqft",30 Thousand
2,3,4,"1,950 sqft",30 Thousand
3,3,3,"2,000 sqft",35 Thousand
4,3,4,"1,650 sqft",25 Thousand
...,...,...,...,...
7551,3,4,"1,888 sqft",50 Thousand
7553,3,2,900 sqft,19 Thousand
7554,2,2,"1,000 sqft",22 Thousand
7555,3,4,"3,600 sqft",1.75 Lakh


In [None]:
# Convert the textual bath / beds / area columns to numbers.
# Raw-string patterns avoid the invalid "\d" escape-sequence warning, and
# expand=False makes str.extract return a Series for the single capture group.
new_data['bath'] = pd.to_numeric(new_data['bath'].str.extract(r'(\d+)', expand=False))
new_data['beds'] = pd.to_numeric(new_data['beds'].str.extract(r'(\d+)', expand=False))
# Strip the thousands separator first so "2,200 sqft" yields 2200 rather than 2.
new_data['area'] = pd.to_numeric(
    new_data['area']
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+)', expand=False)
)


In [None]:
def _price_to_number(text):
    """Turn a price string such as '1.75 Lakh' or '50 Thousand' into a float."""
    # Strings containing 'Lakh' are scaled by 100,000; everything else by 1,000.
    multiplier = 100000.0 if 'Lakh' in text else 1000
    return float(text.split(' ')[0]) * multiplier


new_data['price'] = new_data['price'].apply(_price_to_number)


In [None]:
# Preview the first ten rows of the frame.
new_data.head(10)

Unnamed: 0,beds,bath,area,price
0,3,4,2200,50000.0
1,3,4,1400,30000.0
2,3,4,1950,30000.0
3,3,3,2000,35000.0
4,3,4,1650,25000.0
5,5,5,3400,110000.0
6,3,3,1600,35000.0
7,3,3,1250,23000.0
8,3,4,2150,40000.0
9,3,3,1250,23000.0


In [None]:
# Rebind new_data from a DataFrame to a plain NumPy array.
# NOTE(review): re-running this cell on its own fails, because an ndarray has no to_numpy().
new_data=new_data.to_numpy()

In [None]:
# Show the type of new_data after the conversion.
type(new_data)

numpy.ndarray

In [None]:
# Display the converted array.
new_data

array([[3.00e+00, 4.00e+00, 2.20e+03, 5.00e+04],
       [3.00e+00, 4.00e+00, 1.40e+03, 3.00e+04],
       [3.00e+00, 4.00e+00, 1.95e+03, 3.00e+04],
       ...,
       [2.00e+00, 2.00e+00, 1.00e+03, 2.20e+04],
       [3.00e+00, 4.00e+00, 3.60e+03, 1.75e+05],
       [4.00e+00, 4.00e+00, 2.60e+03, 9.00e+04]])

In [None]:

from sklearn.model_selection import train_test_split
def z_score_scaling(dataset):
    """Standardize ``dataset`` to zero mean and unit standard deviation along axis 0.

    Parameters
    ----------
    dataset : array-like
        Numeric data; the mean and standard deviation are taken along axis 0
        (i.e. per column for a 2-D input).

    Returns
    -------
    numpy.ndarray
        ``(dataset - mean) / std``. Columns whose standard deviation is zero
        come back as all zeros instead of NaN.
    """
    values = np.asarray(dataset, dtype=float)
    mean_vals = np.mean(values, axis=0)
    stdev_vals = np.std(values, axis=0)
    # A constant column has std == 0, and 0/0 would produce NaN. Its centered
    # values are already 0, so dividing by 1 keeps them at 0.
    safe_stdev = np.where(stdev_vals == 0, 1.0, stdev_vals)
    return (values - mean_vals) / safe_stdev


# Features are every column except the last; the target is the last column.
modified_list = np.array(new_data)
X = modified_list[:, :-1]
Y = modified_list[:, -1]
# NOTE(review): the scaling statistics are computed on the full dataset before the
# split, so validation and test rows influence how the training rows are scaled.
X = z_score_scaling(X)
# 60% train; the remaining 40% is then halved into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.4, random_state=1
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)
print("Train data:", len(X_train))
# The labels used to be swapped: X_valid was reported as "Test" and X_test as "Validation".
print("Test data:", len(X_test))
print("Validation data:", len(X_valid))

Train data: 4493
Test data: 1498
Validation data: 1498


In [None]:
# Starting parameters: a three-element weight vector and a scalar bias.
w_init = np.array([0.2, 0.2, 50])
b_init = 10
def compute_cost(X, y, w, b):
    """Return the halved mean squared error of the linear model ``X @ w + b``.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        One row per example.
    y : array-like, shape (m,)
        Target value for each row of ``X``.
    w : numpy.ndarray, shape (n,)
        Model weights.
    b : scalar
        Model bias.

    Returns
    -------
    float
        ``sum((X @ w + b - y) ** 2) / (2 * m)``.

    Raises
    ------
    ValueError
        If ``X`` has no rows (the mean would be undefined).
    """
    m = X.shape[0]
    if m == 0:
        raise ValueError("compute_cost requires at least one example")
    # Vectorized residuals replace the per-row Python loop.
    residuals = np.dot(X, w) + b - y
    return np.dot(residuals, residuals) / (2 * m)

In [None]:
def compute_gradient(X, y, w, b):
    """Return the gradient of the halved mean squared error for ``X @ w + b``.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n)
        One row per example.
    y : array-like, shape (m,)
        Target value for each row of ``X``.
    w : numpy.ndarray, shape (n,)
        Model weights.
    b : scalar
        Model bias.

    Returns
    -------
    dj_db : float
        Mean prediction error (derivative with respect to ``b``).
    dj_dw : numpy.ndarray, shape (n,)
        Mean of ``error * feature`` per column (derivative with respect to ``w``).

    Raises
    ------
    ValueError
        If ``X`` has no rows.
    """
    m, _ = X.shape  # unpacking also enforces a 2-D X, as before
    if m == 0:
        raise ValueError("compute_gradient requires at least one example")
    # Vectorized form of the former nested loops over rows and features.
    err = np.dot(X, w) + b - y
    dj_dw = np.dot(X.T, err) / m
    dj_db = np.sum(err) / m

    return dj_db, dj_dw
# Evaluate the gradient once on the training split at the initial parameters.
tmp_dj_db, tmp_dj_dw = compute_gradient(X_train, y_train, w_init, b_init)


In [None]:
import copy
import matplotlib.pyplot as plt
def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters,
                     X_val=None, y_val=None, log_every=1):
    """Fit ``w`` and ``b`` by batch gradient descent, logging the loss as it goes.

    Parameters
    ----------
    X, y : training features and targets, passed straight to the callbacks.
    w_in : initial weights (copied, never modified).
    b_in : initial bias.
    cost_function : callable ``(X, y, w, b) -> cost`` used for the logged losses.
    gradient_function : callable ``(X, y, w, b) -> (dj_db, dj_dw)``.
    alpha : learning rate.
    num_iters : number of update steps.
    X_val, y_val : optional validation data for the "Validation Loss" line.
        When omitted, the notebook-level ``X_valid`` / ``y_valid`` are used,
        which is what this function always read before.
    log_every : print the losses every ``log_every`` iterations (default 1,
        i.e. every iteration); ``0`` or ``None`` disables logging.

    Returns
    -------
    (w, b) : the parameters after ``num_iters`` updates.
    """
    w = copy.deepcopy(w_in)  # keep the caller's initial weights untouched
    b = b_in

    for i in range(num_iters):
        dj_db, dj_dw = gradient_function(X, y, w, b)
        w = w - alpha * dj_dw
        b = b - alpha * dj_db

        if log_every and (i + 1) % log_every == 0:
            # Report the loss on the data actually being fitted, using the
            # supplied cost function, instead of notebook globals.
            print(f"Training Loss: {cost_function(X, y, w, b)}")
            # Fall back to the notebook-level validation split only when the
            # caller did not provide one (resolved lazily, just before use).
            val_X = X_valid if X_val is None else X_val
            val_y = y_valid if y_val is None else y_val
            print(f"Validation Loss: {cost_function(val_X, val_y, w, b)}")
    return w, b

In [None]:
# Hyperparameters and starting point for the training run.
alpha = 0.01
iterations = 1500
initial_b = 1
initial_w = np.zeros_like(w_init)
print(initial_w)
# run gradient descent
w_final, b_final = gradient_descent(
    X=X_train,
    y=y_train,
    w_in=initial_w,
    b_in=initial_b,
    cost_function=compute_cost,
    gradient_function=compute_gradient,
    alpha=alpha,
    num_iters=iterations,
)
print(f"b,w found by gradient descent: {b_final:0.2f},{w_final} ")

[0. 0. 0.]
Training Loss: 1223855180.632734
Validation Loss: 1498615749.2918622
Training Loss: 1199025458.5724714
Validation Loss: 1470609018.4702244
Training Loss: 1175087235.9186878
Validation Loss: 1443585497.0537493
Training Loss: 1152002680.7718582
Validation Loss: 1417504059.3460221
Training Loss: 1129735693.0207024
Validation Loss: 1392325442.2883534
Training Loss: 1108251822.3658283
Validation Loss: 1368012157.9403944
Training Loss: 1087518190.2798653
Validation Loss: 1344528410.1427484
Training Loss: 1067503415.7139624
Validation Loss: 1321840015.1600463
Training Loss: 1048177544.3696527
Validation Loss: 1299914326.112999
Training Loss: 1029511981.3638861
Validation Loss: 1278720161.0169783
Training Loss: 1011479427.1234165
Validation Loss: 1258227734.253584
Training Loss: 994053816.3525275
Validation Loss: 1238408591.3099651
Training Loss: 977210259.925892
Validation Loss: 1219235546.6287925
Training Loss: 960924989.565141
Validation Loss: 1200682624.4191697
Training Loss: 94

In [None]:
def predict(X,w,b,test):
    """Predict ``X[i] @ w + b`` for every row, printing it next to ``test[i]``.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_rows, n_features)
    w : weight vector with one entry per feature.
    b : scalar bias.
    test : indexable of the true values, printed beside each prediction.

    Returns
    -------
    list
        The predictions for ``X``, in row order.

    Side effects
    ------------
    Each prediction is also appended to the module-level ``y_pred`` list, as
    before, so existing cells that read ``y_pred`` keep working.
    """
    predictions = []
    for i in range(X.shape[0]):
        value = np.dot(X[i], w) + b
        predictions.append(value)
        y_pred.append(value)
        # Print the value just computed; indexing y_pred[i] showed stale
        # entries whenever y_pred was not empty before the call.
        print(test[i], value)
    return predictions

In [None]:
# Reset the list that predict() appends to, then run predictions on the test split.
y_pred=[]
predict(X_test,w_final,b_final,y_test)

10000.0 9615.010987082063
32000.0 19128.94387587045
29000.0 36126.41349453696
9500.0 22816.529598916706
18000.0 15106.058858115715
20000.0 29327.425647070355
20000.0 36414.505293849914
30000.0 36126.41349453696
20000.0 69101.50455474999
10000.0 3779.363626010221
75000.0 113630.22485799139
250000.0 192147.05230848122
16000.0 15729.449952137147
60000.0 69724.89564877142
30000.0 70121.35273186998
15000.0 36001.91592533707
15000.0 19525.40095896901
12000.0 12618.0478277168
200000.0 131024.15155975646
14000.0 36001.91592533707
17000.0 8930.462104670543
19000.0 9218.553903983498
75000.0 76632.24878002363
14000.0 9326.919187769105
22000.0 22528.43779960375
35000.0 22528.43779960375
12000.0 40210.45630068178
16000.0 26607.830508083713
20000.0 15729.449952137147
28000.0 36126.41349453696
23000.0 49327.932106371605
14000.0 26324.388806435614
10500.0 -979.9278672164
14000.0 36001.91592533707
16000.0 2131.4742572039395
50000.0 39129.4503351717
20000.0 14029.702990270496
11000.0 6215.517063348761
2

In [None]:
from sklearn.metrics import r2_score
# R^2 score of the collected predictions against the true test targets.
r2=r2_score(y_test,y_pred)
print(r2)

0.5947859235472304
