# Flat Price Classification

## Raw data

In [1]:
import pandas as pd

df_raw = pd.read_csv('train_data.csv')
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4124 entries, 0 to 4123
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   SalePrice                  4124 non-null   int64  
 1   YearBuilt                  4124 non-null   int64  
 2   Size(sqf)                  4124 non-null   int64  
 3   Floor                      4124 non-null   int64  
 4   HallwayType                4124 non-null   object 
 5   HeatingType                4124 non-null   object 
 6   AptManageType              4124 non-null   object 
 7   N_Parkinglot(Ground)       4124 non-null   float64
 8   N_Parkinglot(Basement)     4124 non-null   float64
 9   TimeToBusStop              4124 non-null   object 
 10  TimeToSubway               4124 non-null   object 
 11  N_manager                  4124 non-null   float64
 12  N_elevators                4124 non-null   float64
 13  SubwayStation              4124 non-null   objec

In [2]:
df_raw.head()

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,TimeToSubway,N_manager,N_elevators,SubwayStation,N_FacilitiesInApt,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total)
0,141592,2006,814,3,terraced,individual_heating,management_in_trust,111.0,184.0,5min~10min,10min~15min,3.0,0.0,Kyungbuk_uni_hospital,5,6.0,9.0
1,51327,1985,587,8,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
2,48672,1985,587,6,corridor,individual_heating,self_management,80.0,76.0,0~5min,5min~10min,2.0,2.0,Daegu,3,12.0,4.0
3,380530,2006,2056,8,terraced,individual_heating,management_in_trust,249.0,536.0,0~5min,0-5min,5.0,11.0,Sin-nam,5,3.0,7.0
4,78318,1992,644,2,mixed,individual_heating,self_management,142.0,79.0,5min~10min,15min~20min,4.0,8.0,Myung-duk,3,9.0,14.0


In [3]:
df_raw.isnull().sum()

SalePrice                    0
YearBuilt                    0
Size(sqf)                    0
Floor                        0
HallwayType                  0
HeatingType                  0
AptManageType                0
N_Parkinglot(Ground)         0
N_Parkinglot(Basement)       0
TimeToBusStop                0
TimeToSubway                 0
N_manager                    0
N_elevators                  0
SubwayStation                0
N_FacilitiesInApt            0
N_FacilitiesNearBy(Total)    0
N_SchoolNearBy(Total)        0
dtype: int64

## Data preprocessing

In [27]:
X = df_raw.drop(columns=['SalePrice'])
X.columns = X.columns.str.strip().str.replace(' ', '_')

y = df_raw['SalePrice']

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f'{X_train.shape=}')
print(f'{X_val.shape=}')
print(f'{X_test.shape=}')

X_train.shape=(3299, 16)
X_val.shape=(412, 16)
X_test.shape=(413, 16)


In [30]:
print("Unique values:\n")
for column in X_train.select_dtypes(include='object').columns:
    print(f'{column}: {X[column].unique().tolist()}')

Unique values:

HallwayType: ['terraced', 'corridor', 'mixed']
HeatingType: ['individual_heating', 'central_heating']
AptManageType: ['management_in_trust', 'self_management']
TimeToBusStop: ['5min~10min', '0~5min', '10min~15min']
TimeToSubway: ['10min~15min', '5min~10min', '0-5min', '15min~20min', 'no_bus_stop_nearby']
SubwayStation: ['Kyungbuk_uni_hospital', 'Daegu', 'Sin-nam', 'Myung-duk', 'Chil-sung-market', 'Bangoge', 'Banwoldang', 'no_subway_nearby']


### One-hot encoding

The given features do not have a fixed order or other such dependencies. That is why I encode them using one-hot encoding.

In [None]:
print("Unique values:\n")
for column in X_train.select_dtypes(include='object').columns:
    print(f'{column}: {X[column].unique().tolist()}')

### One-hot encoding

The given features do not have a fixed order or other such dependencies. That is why I encode them using one-hot encoding.

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')

features = ['HallwayType', 'HeatingType', 'AptManageType']


# Transformacja danych treningowych
X_train_encode = encoder.fit_transform(X_train[features])
X_val_encode = encoder.fit_transform(X_train[features])
X_train_encode = encoder.fit_transform(X_train[features])

# # Transformacja nowych danych używając tego samego enkodera
# X_test_encoded_array = encoder.transform(X_test[['kategoria1', 'kategoria2']])

# Konwersja z powrotem do DataFrame, jeśli potrzebne
feature_names = encoder.get_feature_names_out(features)
X_train_encoded = pd.DataFrame(X_train_encoded_array, columns=feature_names)
# X_test_encoded = pd.DataFrame(X_test_encoded_array, columns=feature_names)

X_train_encoded

Unnamed: 0,HallwayType_mixed,HallwayType_terraced,HeatingType_individual_heating,AptManageType_self_management
0,0.0,1.0,1.0,0.0
1,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0
3,0.0,1.0,1.0,0.0
4,1.0,0.0,1.0,1.0
...,...,...,...,...
4119,0.0,1.0,1.0,0.0
4120,0.0,1.0,1.0,0.0
4121,0.0,1.0,1.0,0.0
4122,0.0,0.0,1.0,1.0
