### Importing relevant libraries 

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import warnings
import missingno as msno
from pycaret.regression import RegressionExperiment
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
# importing datase5 for training
train = pd.read_csv("../data/Housing_dataset_train.csv")
test = pd.read_csv('../data/Housing_dataset_test.csv')

In [4]:
# check data shapes
train.shape, test.shape

((14000, 7), (6000, 6))

In [5]:
# Preview train
train.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
0,3583,Katsina,Semi-detached duplex,2.0,2.0,1.0,1149999.565
1,2748,Ondo,Apartment,,2.0,4.0,1672416.689
2,9261,Ekiti,,7.0,5.0,,3364799.814
3,2224,Anambra,Detached duplex,5.0,2.0,4.0,2410306.756
4,10300,Kogi,Terrace duplex,,5.0,6.0,2600700.898


- From the preview above we can see we have NAN values we would need to treat this as we go further in our training

In [6]:
# Preview train
test.head()

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space
0,845,Kano,Penthouse,4,1,2
1,1924,Adamawa,Apartment,2,2,4
2,10718,Adamawa,Bungalow,2,7,2
3,12076,Lagos,Mansion,9,5,2
4,12254,Gombe,Semi-detached duplex,5,6,1


In [7]:
# Describe the data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14000 entries, 0 to 13999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             14000 non-null  int64  
 1   loc            12187 non-null  object 
 2   title          12278 non-null  object 
 3   bedroom        12201 non-null  float64
 4   bathroom       12195 non-null  float64
 5   parking_space  12189 non-null  float64
 6   price          14000 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 765.8+ KB


In [8]:
# percentage of missing values each feature has..
miss_percentage_train = train.isnull().sum()/train.shape[0]
miss_percentage_train

ID               0.000000
loc              0.129500
title            0.123000
bedroom          0.128500
bathroom         0.128929
parking_space    0.129357
price            0.000000
dtype: float64

In [9]:
# percentage of missing values each feature has..
miss_percentag_test = test.isnull().sum()/test.shape[0]
miss_percentag_test

ID               0.0
loc              0.0
title            0.0
bedroom          0.0
bathroom         0.0
parking_space    0.0
dtype: float64

In [10]:
# which features are categorical?
cat_col = train.select_dtypes(include=[np.object]).columns
# which features are numerical?
num_col = train.select_dtypes(include=[np.number]).columns
print("categorical values:",cat_col )
print("Numerical values:",num_col )

categorical values: Index(['loc', 'title'], dtype='object')
Numerical values: Index(['ID', 'bedroom', 'bathroom', 'parking_space', 'price'], dtype='object')


- We would need to make 'loc', 'title' more descriptive and remove 'ID'

In [12]:
# Checking for unique values 
col=['loc', 'title', 'bedroom', 'bathroom', 'parking_space', 'price']
# Remaining columns are having values like 'yes' and 'no'
for i in col:
    print(i)
    print(train[i].unique())
    print(train[i].nunique())
    print()

loc
['Katsina' 'Ondo' 'Ekiti' 'Anambra' 'Kogi' 'Borno' 'Kwara' 'Osun' 'Kaduna'
 'Ogun' 'Bayelsa' nan 'Abia' 'Rivers' 'Taraba' 'Ebonyi' 'Kebbi' 'Enugu'
 'Edo' 'Nasarawa' 'Delta' 'Kano' 'Yobe' 'Benue' 'Bauchi' 'Cross River'
 'Niger' 'Adamawa' 'Plateau' 'Imo' 'Oyo' 'Zamfara' 'Sokoto' 'Jigawa'
 'Gombe' 'Akwa Ibom' 'Lagos']
36

title
['Semi-detached duplex' 'Apartment' nan 'Detached duplex' 'Terrace duplex'
 'Mansion' 'Bungalow' 'Penthouse' 'Townhouse' 'Flat' 'Cottage']
10

bedroom
[ 2. nan  7.  5.  3.  1.  4.  6.  9.  8.]
9

bathroom
[ 2.  5.  1.  4.  3. nan  7.  6.]
7

parking_space
[ 1.  4. nan  6.  3.  5.  2.]
6

price
[1149999.565 1672416.689 3364799.814 ... 1508351.845 2458443.818
 3348918.718]
10727



### DATA PREPROCESSING...

In [13]:
# droping ID columns from the dataframe
train.drop(["ID"], 1, inplace = True)
test.drop(["ID"], 1, inplace = True)
print("columns after dropping ID (Train dataset)",train.columns)
print("columns after dropping ID (Test dataset)",test.columns)

columns after dropping ID (Train dataset) Index(['loc', 'title', 'bedroom', 'bathroom', 'parking_space', 'price'], dtype='object')
columns after dropping ID (Test dataset) Index(['loc', 'title', 'bedroom', 'bathroom', 'parking_space'], dtype='object')


In [14]:
# making 'loc' and 'title' columns more descriptive
train = train.rename(columns={'loc': 'location', 'title': 'type'})
test= test.rename(columns={'loc': 'location', 'title': 'type'})
print("train",train.columns)
print("test",test.columns)

train Index(['location', 'type', 'bedroom', 'bathroom', 'parking_space', 'price'], dtype='object')
test Index(['location', 'type', 'bedroom', 'bathroom', 'parking_space'], dtype='object')


In [15]:
# Using pandas DataFrame `fillna()` function to fill NaN values

# Filling categorical variables with 'Unknown'
train['location'] = train['location'].fillna('Unknown')
train['type'] = train['type'].fillna('Unknown')

# Filling numerical variables with median
for column in ['bedroom', 'bathroom', 'parking_space']:
    median_value = train[column].median()
    train[column] = train[column].fillna(median_value)

train.head()

Unnamed: 0,location,type,bedroom,bathroom,parking_space,price
0,Katsina,Semi-detached duplex,2.0,2.0,1.0,1149999.565
1,Ondo,Apartment,4.0,2.0,4.0,1672416.689
2,Ekiti,Unknown,7.0,5.0,3.0,3364799.814
3,Anambra,Detached duplex,5.0,2.0,4.0,2410306.756
4,Kogi,Terrace duplex,4.0,5.0,6.0,2600700.898


In [16]:
# encoding categorically data with one-hot encoding 
train_encoded = pd.get_dummies(train, columns=['location', 'type'])
test_encoded = pd.get_dummies(test, columns=['location', 'type'])
train_encoded.head()

Unnamed: 0,bedroom,bathroom,parking_space,price,location_Abia,location_Adamawa,location_Akwa Ibom,location_Anambra,location_Bauchi,location_Bayelsa,...,type_Bungalow,type_Cottage,type_Detached duplex,type_Flat,type_Mansion,type_Penthouse,type_Semi-detached duplex,type_Terrace duplex,type_Townhouse,type_Unknown
0,2.0,2.0,1.0,1149999.565,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,4.0,2.0,4.0,1672416.689,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7.0,5.0,3.0,3364799.814,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,5.0,2.0,4.0,2410306.756,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4,4.0,5.0,6.0,2600700.898,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Model training

In [29]:
baseline_model = RegressionExperiment()
# # initialize setup
baseline_model.setup( data = train, 
                           target = 'price',
                           session_id=2023,
                           fold_strategy = 'stratifiedkfold', # shows worse on validation on this dataset  
                           remove_outliers=True, 
                           transform_target = True,
                           normalize_method='minmax',                
                           # feature_selection = True ,
                           # n_features_to_select = 0.7,
                           # feature_selection_method='univariate', # classic univariate
                           # feature_selection_estimator = 'rf',                         
                           fold = 5,                        
                           use_gpu=True,
                        #    log_experiment= 'dagshub',
                        #    log_data=True,
                        #    log_plots=True,
                        #    experiment_name ='DNS-Hackathon-2023'
                           )

Unnamed: 0,Description,Value
0,Session id,2023
1,Target,price
2,Target type,Regression
3,Original data shape,"(14000, 6)"
4,Transformed data shape,"(13510, 16)"
5,Transformed train set shape,"(9310, 16)"
6,Transformed test set shape,"(4200, 16)"
7,Numeric features,3
8,Categorical features,2
9,Preprocess,True


<pycaret.regression.oop.RegressionExperiment at 0x1740f77e640>

In [30]:
best_models= baseline_model.compare_models(   n_select = 1,
                                                    fold =5,
                                                    sort="RMSE", 
                                                    # turbo = True,
                                                    include=["lightgbm"]
                                                )#include=["lightgbm"]

Processing:   0%|          | 0/9 [00:00<?, ?it/s]

In [28]:
best_models.get_metrics()

AttributeError: 'list' object has no attribute 'get_metrics'