In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df = pd.read_csv('Housing.csv')


In [11]:
import seaborn as sns

In [12]:
%matplotlib inline

In [13]:
# Display basic information about the dataset
print(df.shape)
print(df.head(10))
print(df.describe())


(545, 13)
      price   area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000   7420         4          2        3      yes        no       no   
1  12250000   8960         4          4        4      yes        no       no   
2  12250000   9960         3          2        2      yes        no      yes   
3  12215000   7500         4          2        2      yes        no      yes   
4  11410000   7420         4          1        2      yes       yes      yes   
5  10850000   7500         3          3        1      yes        no      yes   
6  10150000   8580         4          3        4      yes        no       no   
7  10150000  16200         5          3        2      yes        no       no   
8   9870000   8100         4          1        2      yes       yes      yes   
9   9800000   5750         3          2        4      yes       yes       no   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes     

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [15]:
df.keys


<bound method NDFrame.keys of         price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no  

In [16]:
mr = pd.get_dummies(df['mainroad'],drop_first=True)
guest = pd.get_dummies(df['guestroom'],drop_first=True)
base = pd.get_dummies(df['basement'],drop_first=True)
hwh = pd.get_dummies(df['hotwaterheating'],drop_first=True)
ac = pd.get_dummies(df['airconditioning'],drop_first=True)
prefar = pd.get_dummies(df['prefarea'],drop_first=True)

In [17]:
df.drop(['mainroad','guestroom','basement','hotwaterheating', 'airconditioning', 'prefarea'],axis=1,inplace=True)

In [18]:
df['mainroad']=mr
df['guestroom']=guest
df['basement']=base
df['hotwaterheating']=hwh
df['airconditioning']=ac
df['prefarea']=prefar

In [19]:
df['prefarea'].head(10)

0     True
1    False
2     True
3     True
4    False
5     True
6     True
7    False
8     True
9     True
Name: prefarea, dtype: bool

In [20]:
def update(x):
    if x=="furnished":
        return 2
    elif x=="semi-furnished":
        return 1
    elif x=="unfurnished":
        return 0

In [21]:
df['furnishingstatus']=df['furnishingstatus'].apply(update)

In [22]:
#Adding features
def secure(x):
    if x>=4000000:
        return 1
    else:
        return 0

In [23]:
df['security']=df['price'].apply(secure)

In [24]:
def add_gym(x):
    if x<5000:
        return 0
    else:
        return 1

In [25]:
df['gym']=df['area'].apply(add_gym)

In [26]:
def add_club(x):
    if x<8000:
        return 0
    else:
        return 1

In [27]:
df['clubhouse']=df['area'].apply(add_club)

In [28]:
df.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,security,gym,clubhouse
0,13300000,7420,4,2,3,2,2,True,False,False,False,True,True,1,1,0
1,12250000,8960,4,4,4,3,2,True,False,False,False,True,False,1,1,1
2,12250000,9960,3,2,2,2,1,True,False,True,False,False,True,1,1,1
3,12215000,7500,4,2,2,3,2,True,False,True,False,True,True,1,1,0
4,11410000,7420,4,1,2,2,2,True,True,True,False,True,False,1,1,0
5,10850000,7500,3,3,1,2,1,True,False,True,False,True,True,1,1,0
6,10150000,8580,4,3,4,2,1,True,False,False,False,True,True,1,1,1
7,10150000,16200,5,3,2,0,0,True,False,False,False,False,False,1,1,1
8,9870000,8100,4,1,2,2,2,True,True,True,False,True,True,1,1,1
9,9800000,5750,3,2,4,1,0,True,True,False,False,True,True,1,1,0


In [29]:
def prscale(x):
    return int(x/1000)

In [30]:
df['price']=df['price'].apply(prscale)

In [31]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking,furnishingstatus,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea,security,gym,clubhouse
0,13300,7420,4,2,3,2,2,True,False,False,False,True,True,1,1,0
1,12250,8960,4,4,4,3,2,True,False,False,False,True,False,1,1,1
2,12250,9960,3,2,2,2,1,True,False,True,False,False,True,1,1,1
3,12215,7500,4,2,2,3,2,True,False,True,False,True,True,1,1,0
4,11410,7420,4,1,2,2,2,True,True,True,False,True,False,1,1,0


In [32]:
#X = df.drop('price', axis=1)
#y = df['price']
X=df.iloc[:, 1:].values

In [33]:
y=df['price'].values

In [34]:
print(y)

[13300 12250 12250 12215 11410 10850 10150 10150  9870  9800  9800  9681
  9310  9240  9240  9100  9100  8960  8890  8855  8750  8680  8645  8645
  8575  8540  8463  8400  8400  8400  8400  8400  8295  8190  8120  8080
  8043  7980  7962  7910  7875  7840  7700  7700  7560  7560  7525  7490
  7455  7420  7420  7420  7350  7350  7350  7350  7343  7245  7210  7210
  7140  7070  7070  7035  7000  6930  6930  6895  6860  6790  6790  6755
  6720  6685  6650  6650  6650  6650  6650  6650  6629  6615  6615  6580
  6510  6510  6510  6475  6475  6440  6440  6419  6405  6300  6300  6300
  6300  6300  6293  6265  6230  6230  6195  6195  6195  6160  6160  6125
  6107  6090  6090  6090  6083  6083  6020  6020  6020  5950  5950  5950
  5950  5950  5950  5950  5950  5943  5880  5880  5873  5873  5866  5810
  5810  5810  5803  5775  5740  5740  5740  5740  5740  5652  5600  5600
  5600  5600  5600  5600  5600  5600  5600  5565  5565  5530  5530  5530
  5523  5495  5495  5460  5460  5460  5460  5425  5

In [35]:
y=y.reshape(len(y),1)

In [36]:
print(y)

[[13300]
 [12250]
 [12250]
 [12215]
 [11410]
 [10850]
 [10150]
 [10150]
 [ 9870]
 [ 9800]
 [ 9800]
 [ 9681]
 [ 9310]
 [ 9240]
 [ 9240]
 [ 9100]
 [ 9100]
 [ 8960]
 [ 8890]
 [ 8855]
 [ 8750]
 [ 8680]
 [ 8645]
 [ 8645]
 [ 8575]
 [ 8540]
 [ 8463]
 [ 8400]
 [ 8400]
 [ 8400]
 [ 8400]
 [ 8400]
 [ 8295]
 [ 8190]
 [ 8120]
 [ 8080]
 [ 8043]
 [ 7980]
 [ 7962]
 [ 7910]
 [ 7875]
 [ 7840]
 [ 7700]
 [ 7700]
 [ 7560]
 [ 7560]
 [ 7525]
 [ 7490]
 [ 7455]
 [ 7420]
 [ 7420]
 [ 7420]
 [ 7350]
 [ 7350]
 [ 7350]
 [ 7350]
 [ 7343]
 [ 7245]
 [ 7210]
 [ 7210]
 [ 7140]
 [ 7070]
 [ 7070]
 [ 7035]
 [ 7000]
 [ 6930]
 [ 6930]
 [ 6895]
 [ 6860]
 [ 6790]
 [ 6790]
 [ 6755]
 [ 6720]
 [ 6685]
 [ 6650]
 [ 6650]
 [ 6650]
 [ 6650]
 [ 6650]
 [ 6650]
 [ 6629]
 [ 6615]
 [ 6615]
 [ 6580]
 [ 6510]
 [ 6510]
 [ 6510]
 [ 6475]
 [ 6475]
 [ 6440]
 [ 6440]
 [ 6419]
 [ 6405]
 [ 6300]
 [ 6300]
 [ 6300]
 [ 6300]
 [ 6300]
 [ 6293]
 [ 6265]
 [ 6230]
 [ 6230]
 [ 6195]
 [ 6195]
 [ 6195]
 [ 6160]
 [ 6160]
 [ 6125]
 [ 6107]
 [ 6090]
 [ 6090]
 

In [37]:
def update(x):
    if x=="furnished":
        return 2
    elif x=="semi-furnished":
        return 1
    elif x=="unfurnished":
        return 0

In [38]:
#X_encoded = pd.get_dummies(X, drop_first=True)

In [39]:
#X_encoded

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [41]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train[:, 1:7]=sc_X.fit_transform(X_train[:, 1:7])
X_test[:, 1:7]=sc_X.transform(X_test[:, 1:7])
sc_y = StandardScaler()
y_train= sc_y.fit_transform(y_train)
y_test=sc_y.transform(y_test)

In [42]:
rf = RandomForestRegressor(n_estimators=25, random_state=2)


In [43]:
rf.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [44]:
y_pred = rf.predict(X_test)

In [36]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Evaluate the model
mae = mean_absolute_error(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
rmse = mean_squared_error(y_test,y_pred, squared=False)
r2 = r2_score(y_test,y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

Mean Absolute Error: 0.47472720139723334
Mean Squared Error: 0.5104239530800851
Root Mean Squared Error: 0.7144396077206843
R-squared: 0.6886326032679311


In [38]:
#importances = pd.Series(data=rf.feature_importances_, index=X_train.columns)

In [39]:
#importances_sorted = importances.sort_values()


In [104]:
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Features Importances')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

NameError: name 'importances_sorted' is not defined