In [1]:
import pandas as pd
from google.colab import files

file = files.upload()
data = pd.read_csv('House Price Prediction Dataset.csv')

Saving House Price Prediction Dataset.csv to House Price Prediction Dataset.csv


**Exploratory Data Analysis**

In [3]:
data.describe()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Price
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,2786.2095,3.0035,2.5525,1.9935,1961.446,537676.855
std,577.494589,1295.146799,1.424606,1.10899,0.809188,35.926695,276428.845719
min,1.0,501.0,1.0,1.0,1.0,1900.0,50005.0
25%,500.75,1653.0,2.0,2.0,1.0,1930.0,300098.0
50%,1000.5,2833.0,3.0,3.0,2.0,1961.0,539254.0
75%,1500.25,3887.5,4.0,4.0,3.0,1993.0,780086.0
max,2000.0,4999.0,5.0,4.0,3.0,2023.0,999656.0


In [2]:
data.head()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
0,1,1360,5,4,3,1970,Downtown,Excellent,No,149919
1,2,4272,5,4,3,1958,Downtown,Excellent,No,424998
2,3,3592,2,2,3,1938,Downtown,Good,No,266746
3,4,966,4,2,2,1902,Suburban,Fair,Yes,244020
4,5,4926,1,4,2,1975,Downtown,Fair,Yes,636056


In [15]:
data.tail()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Location,Condition,Garage,Price
1995,1996,4994,5,4,3,1923,Suburban,Poor,No,295620
1996,1997,3046,5,2,1,2019,Suburban,Poor,Yes,580929
1997,1998,1062,5,1,2,1903,Rural,Poor,No,476925
1998,1999,4062,3,1,2,1936,Urban,Excellent,Yes,161119
1999,2000,2989,5,1,3,1903,Suburban,Fair,No,482525


In [6]:
kolom = data.shape[1]
baris = data.shape[0]

kolom,baris

(10, 2000)

In [9]:
data.columns

Index(['Id', 'Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt',
       'Location', 'Condition', 'Garage', 'Price'],
      dtype='object')

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Id         2000 non-null   int64 
 1   Area       2000 non-null   int64 
 2   Bedrooms   2000 non-null   int64 
 3   Bathrooms  2000 non-null   int64 
 4   Floors     2000 non-null   int64 
 5   YearBuilt  2000 non-null   int64 
 6   Location   2000 non-null   object
 7   Condition  2000 non-null   object
 8   Garage     2000 non-null   object
 9   Price      2000 non-null   int64 
dtypes: int64(7), object(3)
memory usage: 156.4+ KB


**Missing Values**

In [11]:
missing_columns = data.isnull().sum()
missing_columns

Unnamed: 0,0
Id,0
Area,0
Bedrooms,0
Bathrooms,0
Floors,0
YearBuilt,0
Location,0
Condition,0
Garage,0
Price,0


**There are no missing values in this data**

**But we have to delete the Id column because the value of this column is unique and can cause data leakage**

In [40]:
drop_columns = ['Id','Price']

**PREPROCESSING PIPELINE**

In [49]:
from seaborn import categorical
from sklearn.model_selection import train_test_split

X = data.drop(columns=drop_columns)
y = data.Price


TrainX,ValX,TrainY,ValY = train_test_split(X,y,
                                           test_size=0.3,
                                           random_state= 42)


numeric_features = TrainX.select_dtypes(include=['int64']).columns
categorical_features = TrainX.select_dtypes(include=['object']).columns


TrainX.shape,ValX.shape

((1400, 8), (600, 8))

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

numeric_transform = Pipeline(steps=[
    ('scaler',StandardScaler())
])

categorical_transform = Pipeline(steps=[
    ('OneHot',OneHotEncoder(handle_unknown='ignore',sparse_output=False))
])

In [52]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers = [
        ('num',numeric_transform,numeric_features),
        ('cat',categorical_transform,categorical_features)
    ]
)

In [54]:
CekTrainX = preprocessor.fit_transform(TrainX)
CekValX = preprocessor.transform(ValX)

**MUTUAL INFORMATION**

In [57]:
TrainXMI = preprocessor.fit_transform(TrainX)

feature_name_numeric = numeric_features
feature_name_category =(
    preprocessor
    .named_transformers_['cat']
    .named_steps['OneHot']
    .get_feature_names_out(categorical_features)
)

features_names = list(feature_name_numeric) + list(feature_name_category)

In [60]:
from sklearn.feature_selection import mutual_info_regression
import pandas as pd

mi_score = mutual_info_regression(
    TrainXMI,TrainY,random_state=42)

mi_series = pd.Series(mi_score,
                      index=features_names)

mi_series = mi_series.sort_values(ascending=False)

mi_series

Unnamed: 0,0
Garage_No,0.013885
Garage_Yes,0.013885
Location_Downtown,0.005743
Condition_Poor,0.004388
Condition_Fair,0.003677
Location_Rural,0.00282
Bathrooms,0.002301
Condition_Excellent,0.001257
YearBuilt,0.0
Area,0.0
