# Goal : Predict House Price

In [3]:
import pandas as pd
import numpy as np
import seaborn as scs

In [4]:
df = pd.read_csv("../ml_datasets/Housing.csv")

### Pre-proccesing 

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,1,42000.0,5850,3,1,2,yes,no,yes,no,no,1,no
1,2,38500.0,4000,2,1,1,yes,no,no,no,no,0,no
2,3,49500.0,3060,3,1,1,yes,no,no,no,no,0,no
3,4,60500.0,6650,3,1,2,yes,yes,no,no,no,0,no
4,5,61000.0,6360,2,1,1,yes,no,no,no,no,0,no


In [6]:
df.count()

Unnamed: 0    546
price         546
lotsize       546
bedrooms      546
bathrms       546
stories       546
driveway      546
recroom       546
fullbase      546
gashw         546
airco         546
garagepl      546
prefarea      546
dtype: int64

In [7]:
df.isna().sum()

Unnamed: 0    0
price         0
lotsize       0
bedrooms      0
bathrms       0
stories       0
driveway      0
recroom       0
fullbase      0
gashw         0
airco         0
garagepl      0
prefarea      0
dtype: int64

In [8]:
df.shape

(546, 13)

#### Conslusion : No NaN values required

### We've categorical features which we have to convert into number representation

In [9]:
df.columns

Index(['Unnamed: 0', 'price', 'lotsize', 'bedrooms', 'bathrms', 'stories',
       'driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'garagepl',
       'prefarea'],
      dtype='object')

In [10]:
l1 = ['driveway', 'recroom', 'fullbase', 'gashw', 'airco', 'prefarea']

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for x in l1:
    df[x] = le.fit_transform(df[x])

In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,1,42000.0,5850,3,1,2,1,0,1,0,0,1,0
1,2,38500.0,4000,2,1,1,1,0,0,0,0,0,0
2,3,49500.0,3060,3,1,1,1,0,0,0,0,0,0
3,4,60500.0,6650,3,1,2,1,1,0,0,0,0,0
4,5,61000.0,6360,2,1,1,1,0,0,0,0,0,0


In [13]:
df.count().unique()

array([546])

### Now scaledown the lotsize

In [14]:
from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler()
df['lotsize'] = mmscaler.fit_transform(df[['lotsize']])
df.head()

Unnamed: 0.1,Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,1,42000.0,0.28866,3,1,2,1,0,1,0,0,1,0
1,2,38500.0,0.161512,2,1,1,1,0,0,0,0,0,0
2,3,49500.0,0.096907,3,1,1,1,0,0,0,0,0,0
3,4,60500.0,0.343643,3,1,2,1,1,0,0,0,0,0
4,5,61000.0,0.323711,2,1,1,1,0,0,0,0,0,0


In [15]:
df.corr

<bound method DataFrame.corr of      Unnamed: 0     price   lotsize  bedrooms  bathrms  stories  driveway   
0             1   42000.0  0.288660         3        1        2         1  \
1             2   38500.0  0.161512         2        1        1         1   
2             3   49500.0  0.096907         3        1        1         1   
3             4   60500.0  0.343643         3        1        2         1   
4             5   61000.0  0.323711         2        1        1         1   
..          ...       ...       ...       ...      ...      ...       ...   
541         542   91500.0  0.216495         3        2        4         1   
542         543   94000.0  0.298969         3        2        4         1   
543         544  103000.0  0.298969         3        2        4         1   
544         545  105000.0  0.298969         3        2        2         1   
545         546  105000.0  0.298969         3        1        2         1   

     recroom  fullbase  gashw  airco  garag

### Getting the K best x variable

In [16]:
from sklearn.feature_selection import SelectKBest, f_regression
x = df.iloc[:, 2:]  # Selecting all the columns except 'price'
y = df.price
sk = SelectKBest(f_regression,k=3)
sk.fit_transform(x,y)   
# get_feature_names_out(sk)

array([[0.28865979, 1.        , 0.        ],
       [0.16151203, 1.        , 0.        ],
       [0.09690722, 1.        , 0.        ],
       ...,
       [0.29896907, 2.        , 1.        ],
       [0.29896907, 2.        , 1.        ],
       [0.29896907, 1.        , 1.        ]])

### x = df['lotsize'] and y = df['price']

In [17]:
x = df[['lotsize', 'bathrms']]
y = df[['price']]

In [18]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.20, shuffle=True, random_state=1)

In [19]:
xtrain.shape, xtest.shape

((436, 2), (110, 2))

## Machine Learning Model

In [20]:
from sklearn.linear_model import LinearRegression
slr = LinearRegression()
slr.fit(xtrain, ytrain)

### y = ax1 + bx2 + c

In [21]:
slr.coef_, slr.intercept_

(array([[79291.79543446, 22425.40855407]]), array([19550.90830052]))

## Evaluating the Model

In [22]:
y_pred = slr.predict(xtest)

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
mean_absolute_error(ytest, y_pred), mean_squared_error(ytest, y_pred)

(14361.44899252585, 405694575.781917)

In [24]:
from sklearn.metrics import r2_score
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  44.27038114759079 %


## Predication

In [25]:
x_new = [[1500,2]]
x_scaled = mmscaler.fit_transform(x_new)
y_pred = slr.predict(x_scaled)
y_pred



array([[19550.90830052]])

# Step-Wise Regression

In [26]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs1 = SequentialFeatureSelector(slr, k_features=4,forward=True,scoring='r2')
feat=sfs1.fit(x,y)
names=list(feat.k_feature_names_)
names

AttributeError: k_features must be between 1 and X.shape[1].

In [None]:
x=df[['lotsize','bathrms','airco','prefarea']]
y=df['price']

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=1)

In [None]:
slr.fit(xtrain, ytrain)

In [None]:
y_pred = slr.predict(xtest)
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  53.488591564355346 %


# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# x = df[['lotsize', 'bathrms']]
x = df[['lotsize']]
y = df['price']

In [None]:
poly = PolynomialFeatures(degree=2)     # decides the curve
xpoly = poly.fit_transform(x)
xpoly

array([[1.        , 0.28865979, 0.08332448],
       [1.        , 0.16151203, 0.02608614],
       [1.        , 0.09690722, 0.00939101],
       ...,
       [1.        , 0.29896907, 0.08938251],
       [1.        , 0.29896907, 0.08938251],
       [1.        , 0.29896907, 0.08938251]])

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(xpoly, y, test_size=0.2, shuffle=True, random_state=1)

In [None]:
slr.fit(xtrain,ytrain)

In [None]:
y_pred = slr.predict(xtest)

In [None]:
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  30.402229668329227 %


### How to find a model is overfitting or not?

In [None]:
# If the model performs good for the training data and bad for the test data.
# Then we say that the model is overfitting.
# The overfitting model cannot be used for better predictions

### Training Accuracy:  data is compared with y_train

In [None]:
train_pred = slr.predict(xtrain)    
rscore = r2_score(ytrain, train_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  46.77324932447093 %


### Testing Accuracy

In [None]:
test_pred = slr.predict(xtest)
rscore = r2_score(ytest, test_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  44.27038114759079 %


In [None]:
'''
## Shrinkage Methods (Regularization)
1. Lasso Regression(L1 Regularization)
2. Ridge Regression(L2 Regularization)
3. ElasticNet Regression(L1 + L2 Regularization)
'''

'\n## Shrinkage Methods (Regularization)\n1. Lasso Regression(L1 Regularization)\n2. Ridge Regression(L2 Regularization)\n3. ElasticNet Regression(L1 + L2 Regularization)\n'

## Lasso Regression

In [27]:
from sklearn.linear_model import Lasso
la = Lasso()
la.fit(xtrain, ytrain)

In [29]:
y_pred = la.predict(xtest)

In [30]:
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  44.26739426561054 %


## Ridge Regression

In [34]:
from sklearn.linear_model import Ridge
la = Ridge()
la.fit(xtrain, ytrain)

In [35]:
y_pred = la.predict(xtest)

In [36]:
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  43.64852404648606 %


## ElasticNet Regression

In [37]:
from sklearn.linear_model import ElasticNet
la = ElasticNet()
la.fit(xtrain, ytrain)

In [38]:
y_pred = la.predict(xtest)

In [39]:
rscore = r2_score(ytest, y_pred)
print("Accuracy: ", rscore*100, "%")

Accuracy:  13.03515555077891 %
