In [1]:
#loading libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline,make_pipeline

In [2]:
housing = pd.read_csv("Housing2.csv")

In [3]:
housing.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [4]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
housing['furnishingstatus'].value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [6]:
housing.drop(columns=["hotwaterheating","airconditioning"],inplace=True)

In [7]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(housing.drop(columns=["price"]),
                                              housing["price"],
                                              test_size=0.2,
                                              random_state=42)

In [8]:
x_train.head()

Unnamed: 0,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,parking,prefarea,furnishingstatus
46,6000,3,2,4,yes,no,no,1,no,furnished
93,7200,3,2,1,yes,no,yes,3,no,semi-furnished
335,3816,2,1,1,yes,no,yes,2,no,furnished
412,2610,3,1,2,yes,no,yes,0,yes,unfurnished
471,3750,3,1,2,yes,no,no,0,no,unfurnished


In [9]:
y_train.sample()

494    2730000
Name: price, dtype: int64

In [10]:
# one hot encoder
trf1= ColumnTransformer([
    ("ohe_mainroad_guestroom_basement_prefarea_furnishingstatus",OneHotEncoder(sparse= False,handle_unknown = "ignore"),[4,5,6,8,9])
],remainder="passthrough")

In [11]:
#scaling 
trf2 = ColumnTransformer([
    ("scale", MinMaxScaler(),slice(0,15))
])

In [12]:
trf3 = SelectKBest(score_func=chi2,k=10)

In [13]:
trf4= DecisionTreeClassifier()

## Creating pipeline

In [14]:
pipe = Pipeline([
    ("trf1",trf1),
    ("trf2",trf2),
    ("trf3",trf3),
    ("trf4",trf4)
])

In [15]:
pipe.fit(x_train,y_train)



In [16]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_mainroad_guestroom_basement_prefarea_furnishingstatus',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [4, 5, 6, 8, 9])]),
 'trf2': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 15, None))]),
 'trf3': SelectKBest(score_func=<function chi2 at 0x000002301BFF9080>),
 'trf4': DecisionTreeClassifier()}

In [17]:
y_pred= pipe.predict(x_test)

In [18]:
y_pred

array([3150000, 8043000, 2660000, 3500000, 4270000, 3430000, 4270000,
       5390000, 2520000, 2835000, 6300000, 3430000, 2520000, 3080000,
       4200000, 4200000, 3500000, 3360000, 4200000, 2660000, 7420000,
       5229000, 3430000, 3990000, 4123000, 8400000, 4620000, 8645000,
       3290000, 2380000, 8400000, 3360000, 6195000, 3500000, 3360000,
       5600000, 3885000, 4200000, 2520000, 3885000, 2660000, 3430000,
       8043000, 2380000, 2660000, 3430000, 6300000, 3500000, 3430000,
       3129000, 5775000, 3430000, 4270000, 3080000, 4200000, 2835000,
       7962500, 3360000, 3500000, 3430000, 3640000, 4200000, 2660000,
       3129000, 3500000, 3500000, 7840000, 4200000, 6160000, 5040000,
       3150000, 3640000, 4270000, 4025000, 3430000, 5600000, 3500000,
       3118850, 4270000, 2520000, 8043000, 3500000, 5390000, 6125000,
       3360000, 3640000, 3430000, 3500000, 8043000, 4690000, 3080000,
       3290000, 4270000, 2485000, 4970000, 5873000, 4235000, 5775000,
       4200000, 5950

In [19]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.009174311926605505

In [20]:
import pickle
pickle.dump(pipe,open("pipe.pkl","wb"))

In [21]:
#launching the code
pipe= pickle.load(open("pipe.pkl","rb"))

In [22]:
#assuming user input
test_input = np.array([8960,4,4,4,"no","no","no",3,"no","furnished"],dtype= object).reshape(1,10)

In [23]:
pipe.predict(test_input)



array([4200000], dtype=int64)