# Polynomial Regression

In [44]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [4]:
# Location of the house-rent CSV. The original local path remains the default
# so existing setups keep working; set the HOUSE_RENT_CSV environment variable
# to point at the file on any other machine instead of editing this cell.
filepath = os.environ.get(
    "HOUSE_RENT_CSV",
    r"D:\ML_Tutorials\house_rent_ds\House_Rent_Dataset.csv",
)
data = pd.read_csv(filepath)

In [5]:
# Summarise the raw frame: column names, dtypes, and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [6]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


## Glossary

BHK: Number of Bedrooms, Hall, Kitchen.

Rent: Rent of the Houses/Apartments/Flats.

Size: Size of the Houses/Apartments/Flats in Square Feet.

Floor: The floor on which the House/Apartment/Flat is situated and the total number of floors in the building (Example: Ground out of 2, 3 out of 5, etc.)

Area Type: Size of the Houses/Apartments/Flats calculated on either Super Area or Carpet Area or Build Area.

Area Locality: Locality of the Houses/Apartments/Flats.

City: City where the Houses/Apartments/Flats are Located.

Furnishing Status: Furnishing Status of the Houses/Apartments/Flats: Furnished, Semi-Furnished, or Unfurnished.

Tenant Preferred: Type of Tenant Preferred by the Owner or Agent.

Bathroom: Number of Bathrooms.

Point of Contact: Whom to contact for more information regarding the Houses/Apartments/Flats.

In [12]:
# Optional exploratory pair plot of `data`; left commented out, so this cell
# produces no output. Uncomment the line below to render it.
# sns.pairplot(data)

In [31]:
# Columns kept for modelling, in the order they will appear in the frame.
# 'Rent' is the prediction target and is listed last.
req_features = [
    "BHK",
    "Size",
    "Furnishing Status",
    "Bathroom",
    "City",
    "Rent",
]

In [32]:
# Selecting relevant columns and removing outliers (Rent above 1 lakh)
# Remove high-rent outliers: keep only listings whose Rent is strictly below
# 1 lakh (100,000). Column selection is not done here.
df_filtered = data.loc[data["Rent"].lt(100_000)]

In [33]:
# Narrow the filtered frame to the modelling columns listed in `req_features`
# (the predictors plus the 'Rent' target), preserving that list's order.
df_filtered = df_filtered.loc[:, req_features]

In [34]:
# Preview the filtered frame restricted to the modelling columns.
df_filtered.head()

Unnamed: 0,BHK,Size,Furnishing Status,Bathroom,City,Rent
0,2,1100,Unfurnished,2,Kolkata,10000
1,2,800,Semi-Furnished,1,Kolkata,20000
2,2,1000,Semi-Furnished,1,Kolkata,17000
3,2,800,Unfurnished,1,Kolkata,10000
4,2,850,Unfurnished,1,Kolkata,7500


In [35]:
# One-hot encoding categorical variables
# Preprocessing step: one-hot encode the two categorical columns (dropping the
# first level of each) and pass every other column through unchanged.
encoder = ColumnTransformer(
    transformers=[
        (
            "onehot",
            OneHotEncoder(drop="first"),
            ["Furnishing Status", "City"],
        ),
    ],
    remainder="passthrough",
)

In [36]:
# Display the configured transformer.
encoder

In [37]:
# Splitting data into training and testing sets
# Separate the 'Rent' target from the predictor columns. No train/test split
# is performed in this cell.
y = df_filtered["Rent"]
X = df_filtered.drop(columns="Rent")

In [39]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Polynomial-regression pipeline, applied in order:
#   1. `encoder` one-hot encodes the categorical columns,
#   2. PolynomialFeatures expands the encoded matrix with degree=3 terms
#      (include_bias=False, so no constant column is added),
#   3. LinearRegression is fitted on the expanded features.
# NOTE(review): this comment previously said degree=2 while the code uses
# degree=3 — confirm which degree is intended.
poly_pipeline = make_pipeline(
    encoder,
    PolynomialFeatures(degree=3, include_bias=False),
    LinearRegression()
)

In [47]:
# Training the model
# Fit every pipeline step on the training split only; the test split is not
# passed to fit.
poly_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [48]:
# Predictions
# Predict rents for the training and test splits with the fitted pipeline.
y_train_pred, y_test_pred = (
    poly_pipeline.predict(features) for features in (X_train, X_test)
)

In [49]:
# Evaluating the improved model
# Score the fitted pipeline: R² on both splits and RMSE on the test split.
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

(r2_train, r2_test, rmse_test)

(0.729843534799085, 0.7045404959738741, np.float64(10201.824906462489))