In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
import pickle

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,MinMaxScaler,StandardScaler
from scipy.stats import shapiro, normaltest, kstest, zscore
import statsmodels.api as sm
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import set_config
set_config(display="diagram")

In [2]:
df = pd.read_csv('Pune_House_Data.csv')

In [3]:
##droping bcoz more than 30% data is missing
df.drop("society",axis=1,inplace=True) 

In [4]:
df[["First","Second"]] =df["size"].str.split(expand=True)

In [5]:
df.drop("size",axis=1,inplace=True) 

In [6]:
def clean(s):
    pattern = re.compile(r"[\d.]+")
    try:
        s = float(s)
    except (ValueError):
        if s.find("-") !=-1:
            s=np.mean([float(i) for i in s.split("-")])
        elif s.endswith('Sq. Meter'):
            s = float(pattern.search(s)[0])*10.7639
        elif s.endswith('Perch'):
            s = float(pattern.search(s)[0])*0.00367309
        elif s.endswith('Sq. Yards'):
            s = float(pattern.search(s)[0])*9
        elif s.endswith('Guntha'):
            s = float(pattern.search(s)[0])*1089
        elif s.endswith('Acres'):
            s = float(pattern.search(s)[0])*43560
        elif s.endswith('ents'):
            s = float(pattern.search(s)[0])*435.56
        elif s.endswith('Grounds'):
            s = float(pattern.search(s)[0]) *2400 
    return s

In [7]:
df["total_sqft"] = df["total_sqft"].apply(clean).astype('int')

In [8]:
df['availability'] = df['availability'].apply(lambda x: x + '-2022')

In [9]:
current_date =datetime.datetime(2022, 1, 1).date()
df['availability'].replace({'Ready To Move-2022':'1-Jan-2022','Immediate Possession-2022':'1-Jan-2022'},inplace=True)

In [10]:
df["availability"] =df["availability"].apply(lambda x :abs((datetime.datetime.strptime(x,"%d-%b-%Y").date())-current_date).days)

In [11]:
df["availability"] = df["availability"]/30

In [12]:
outlier_sqft = df[abs(zscore(df["total_sqft"]))>3].index
df.drop(index = outlier_sqft,inplace=True)

In [13]:
df.drop("Second",axis=1,inplace=True)

In [14]:
df = df.rename(columns={'First':'Bedrooms'})

In [15]:
df["Bedrooms"] = df["Bedrooms"].astype('float')

In [16]:
df = df[["total_sqft","availability","bath","balcony","Bedrooms","area_type","site_location","price"]]
df

Unnamed: 0,total_sqft,availability,bath,balcony,Bedrooms,area_type,site_location,price
0,1056,11.733333,2.0,1.0,2.0,Super built-up Area,Alandi Road,39.07
1,2600,0.000000,5.0,3.0,4.0,Plot Area,Ambegaon Budruk,120.00
2,1440,0.000000,2.0,3.0,3.0,Built-up Area,Anandnagar,62.00
3,1521,0.000000,3.0,1.0,3.0,Super built-up Area,Aundh,95.00
4,1200,0.000000,2.0,1.0,2.0,Super built-up Area,Aundh Road,51.00
...,...,...,...,...,...,...,...,...
13315,3453,0.000000,4.0,0.0,5.0,Built-up Area,Pashan,231.00
13316,3600,0.000000,5.0,,4.0,Super built-up Area,Paud Road,400.00
13317,1141,0.000000,2.0,1.0,2.0,Built-up Area,Pirangut,60.00
13318,4689,5.600000,4.0,1.0,4.0,Super built-up Area,Prabhat Road,488.00


# Transformation

In [17]:
X = df.drop('price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### 2. First Method

pipe01 = Pipeline([("Median_Fillna",SimpleImputer(strategy='median')),("SC",StandardScaler())])
pipe23 = Pipeline([("Mode0_Fillna",SimpleImputer(strategy='most_frequent')),("SC",StandardScaler())])
pipe4 = Pipeline([("Mode1_Fillna",SimpleImputer(strategy='most_frequent')),("SC",StandardScaler())])
pipe56=Pipeline([("Mode2_Fillna",SimpleImputer(strategy='most_frequent')),("AT_OHE",OneHotEncoder(handle_unknown='ignore',sparse=False))])

CT = ColumnTransformer([('pipe1',pipe01,[0,1]),('pipe2',pipe23,[2,3]),('pipe4',pipe4,[4]),('pipe56',pipe56,[5,6])])

pipe_lr = Pipeline([('Col_Trans',CT),('model_lr',LinearRegression())])

### 2. Second Method

In [18]:
#Missing Value Transformer
MVT = ColumnTransformer([("Median_Fillna",SimpleImputer(strategy='median'),[0,1]),
                        ("Mode1_Fillna",SimpleImputer(strategy='most_frequent'),[2,3]),
                        ("0_Fillna",SimpleImputer(fill_value=0,strategy = "constant"),[4]),
                        ("Mode2_Fillna",SimpleImputer(strategy='most_frequent'),[5,6])],remainder="passthrough")

In [19]:
#Scaling Transformer
ST = ColumnTransformer([("SC",StandardScaler(),[0,1,2,3,4])],remainder='passthrough')

In [20]:
#One Hot Encoding Transformer
OHT = ColumnTransformer([("AT_OHE",OneHotEncoder(sparse=False,drop="first",handle_unknown='ignore'),[5,6])], remainder='passthrough')
model = LinearRegression()

In [21]:
pipe_lr = Pipeline([('Missing_Value', MVT),
                    ('Scaling', ST),
                    ("OHE", OHT),
                    ("LR",model)
                   ])

### Model Training

In [22]:
pipe_lr.fit(X_train,y_train)

ValueError: `handle_unknown` must be 'error' when the drop parameter is specified, as both would create categories that are all zero.

In [None]:
y_pred_test = pipe_lr.predict(X_test)
y_pred_train = pipe_lr.predict(X_train)

In [None]:
pipe_lr.predict([[1000,0.0,2.0,1.0,5.0,"Super built-up Area","Ganesh Peth"]])

In [None]:
pickle.dump(pipe_lr,open("hpp_lr_pipe.pkl",'wb'))

In [None]:
pickle.load(open("hpp_lr_pipe.pkl",'rb')).predict([[1000,0.0,2.0,1.0,5.0,"Super built-up Area","Ganesh Peth"]])

In [None]:
# p = 13311
# X_train.loc[p] = [1056,11.733333,2.0,1.0,2.0,"Super built-up Area","Alandi Road"]

In [None]:
X_train