In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re
import pickle

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,MinMaxScaler,StandardScaler
from scipy.stats import shapiro, normaltest, kstest, zscore
import statsmodels.api as sm
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import set_config
set_config(display="diagram")

# 1 Problem Statement

'Predict Price of flats using different Independent Variables'

# 2 Data Gathering

In [50]:
df = pd.read_csv('Pune_House_Data.csv')

# 3 EDA: Exploratory Data Analysis

### 3.1 Getting idea about how many null values are present in each

In [51]:
df.drop("society",axis=1,inplace=True)  ##droping bcoz more than 30% data is missing

### 3.2 area_type

In [52]:
df["area_type"].value_counts()

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64

### 3.4 size

In [53]:
df[["First","Second"]] =df["size"].str.split(expand=True)

In [54]:
df.drop("size",axis=1,inplace=True) 

### 3.5 total_sqft

In [55]:

def clean(s):
    pattern = re.compile(r"[\d.]+")
    try:
        s = float(s)
        
    except (ValueError):
        if s.find("-") !=-1:
            s=np.mean([float(i) for i in s.split("-")])
        elif s.endswith('Sq. Meter'):
            s = float(pattern.search(s)[0])*10.7639
        elif s.endswith('Perch'):
            s = float(pattern.search(s)[0])*0.00367309
        elif s.endswith('Sq. Yards'):
            s = float(pattern.search(s)[0])*9
        elif s.endswith('Guntha'):
            s = float(pattern.search(s)[0])*1089
        elif s.endswith('Acres'):
            s = float(pattern.search(s)[0])*43560
        elif s.endswith('ents'):
            s = float(pattern.search(s)[0])*435.56
        elif s.endswith('Grounds'):
            s = float(pattern.search(s)[0]) *2400 
    return s
            

In [56]:
df["total_sqft"] = df["total_sqft"].apply(clean).astype('int')

In [57]:
df['availability'] = df['availability'].apply(lambda x: x + '-2022')

In [58]:
current_date =datetime.datetime(2022, 1, 1).date()
df['availability'].replace({'Ready To Move-2022':'1-Jan-2022','Immediate Possession-2022':'1-Jan-2022'},inplace=True)

In [59]:
df["availability"] =df["availability"].apply(lambda x :abs((datetime.datetime.strptime(x,"%d-%b-%Y").date())-current_date).days)

In [60]:
df["availability"] = df["availability"]/30

In [61]:
outlier_sqft = df[abs(zscore(df["total_sqft"]))>3].index
df.drop(index = outlier_sqft,inplace=True)

In [62]:
df.drop("Second",axis=1,inplace=True)

In [63]:
df = df.rename(columns={'First':'Bedrooms'})

In [68]:
df["Bedrooms"] = df["Bedrooms"].astype('float')

In [72]:
df = df[["total_sqft","bath","availability","balcony","Bedrooms","area_type","site_location","price"]]
df

Unnamed: 0,total_sqft,bath,availability,balcony,Bedrooms,area_type,site_location,price
0,1056,2.0,11.733333,1.0,2.0,Super built-up Area,Alandi Road,39.07
1,2600,5.0,0.000000,3.0,4.0,Plot Area,Ambegaon Budruk,120.00
2,1440,2.0,0.000000,3.0,3.0,Built-up Area,Anandnagar,62.00
3,1521,3.0,0.000000,1.0,3.0,Super built-up Area,Aundh,95.00
4,1200,2.0,0.000000,1.0,2.0,Super built-up Area,Aundh Road,51.00
...,...,...,...,...,...,...,...,...
13315,3453,4.0,0.000000,0.0,5.0,Built-up Area,Pashan,231.00
13316,3600,5.0,0.000000,,4.0,Super built-up Area,Paud Road,400.00
13317,1141,2.0,0.000000,1.0,2.0,Built-up Area,Pirangut,60.00
13318,4689,4.0,5.600000,1.0,4.0,Super built-up Area,Prabhat Road,488.00


# Transformation

In [71]:
X = df.drop('price',axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
#Missing Value Transformer
MVT = ColumnTransformer(transformers=[("Median_Fillna",SimpleImputer(strategy='median'),[0,1,2]),
                                      ("Mode_Fillna",SimpleImputer(strategy='most_frequent'),[3]),
                                      ("0_Fillna",SimpleImputer(fill_value=0,strategy = "constant"),[4])
                                      ("Mode_Fillna",SimpleImputer(strategy='most_frequent'),[5,6])])

In [20]:
#Scaling Transformer [0123456] -> [1234605]
ST = ColumnTransformer(transformers=[("SC",StandardScaler(),[1,2,3,4,6])],remainder='passthrough')

In [21]:
#One Hot Encoding Transformer [1234605] -> [0512346]
OHT = ColumnTransformer(transformers=[("AT_OHE",OneHotEncoder(sparse=False,drop='first'),[5]),
                  ("SL_OHE",OneHotEncoder(sparse=False,drop='first'),[6])], remainder='passthrough')
OHT.fit_transform

<bound method ColumnTransformer.fit_transform of ColumnTransformer(remainder='passthrough',
                  transformers=[('AT_OHE',
                                 OneHotEncoder(drop='first', sparse=False),
                                 [5]),
                                ('SL_OHE',
                                 OneHotEncoder(drop='first', sparse=False),
                                 [6])])>

In [22]:
pipe_lr = Pipeline(steps = [('Missing_Value', MVT), ('Scaling', ST), ("OHE", OHT), ("LR",LinearRegression())])
pipe_lr.fit(X_train,y_train)
y_pred_test = pipe_lr.predict(X_test)
y_pred_train = pipe_lr.predict(X_train)

In [29]:
pickle.dump(pipe_lr,open("hpp_lr_pipe.pkl",'wb'))

In [38]:
pickle.load(open("hpp_lr_pipe.pkl",'rb')).predict(df_temp)

ValueError: Found unknown categories ['Super built-up Area'] in column 0 during transform

In [37]:
df_temp = pd.DataFrame([["Super built-up Area",11.733333,1056,2.0,1.0,"Alandi Road",2.0]],columns=X_train.columns)