In [44]:
import kagglehub

# 데이터셋 다운로드
path = kagglehub.dataset_download("martinfrederiksen/danish-residential-housing-prices-1992-2024")

# 경로 출력
print("Path to dataset files:", path)

Path to dataset files: /Users/josuhyeon/.cache/kagglehub/datasets/martinfrederiksen/danish-residential-housing-prices-1992-2024/versions/3


In [47]:
import pandas as pd
import os

# 정확한 파일 경로와 파일명 지정
file_path = "/Users/josuhyeon/.cache/kagglehub/datasets/martinfrederiksen/danish-residential-housing-prices-1992-2024/versions/3/DKHousingPricesSample100k.csv"

# 데이터 불러오기
df = pd.read_csv(file_path)

# 데이터 확인
print("데이터셋 상위 5개 행:")
print(df.head())
print("결측값 확인:")
print(df.isnull().sum())

데이터셋 상위 5개 행:
         date quarter  house_id house_type    sales_type  year_build  \
0  2024-10-24  2024Q4        16      Villa  regular_sale        1997   
1  2024-10-24  2024Q4        13  Apartment  regular_sale        1885   
2  2024-10-23  2024Q4        60      Villa  regular_sale        1949   
3  2024-10-23  2024Q4        29      Villa  regular_sale        2001   
4  2024-10-22  2024Q4        92  Apartment  regular_sale        1965   

   purchase_price  %_change_between_offer_and_purchase  no_rooms    sqm  \
0         6500000                                 -3.0         5  142.0   
1         3400000                                  0.0         2   46.0   
2         4550000                                 -4.0         4  112.0   
3         1630000                                -12.0         4  186.0   
4         1975000                                -10.0         3   82.0   

   sqm_price                          address  zip_code             city  \
0  45774.650              

In [49]:
# 데이터셋 열 이름 확인
print("열 이름 확인:")
print(df.columns)

열 이름 확인:
Index(['date', 'quarter', 'house_id', 'house_type', 'sales_type', 'year_build',
       'purchase_price', '%_change_between_offer_and_purchase', 'no_rooms',
       'sqm', 'sqm_price', 'address', 'zip_code', 'city', 'area', 'region',
       'nom_interest_rate%', 'dk_ann_infl_rate%',
       'yield_on_mortgage_credit_bonds%'],
      dtype='object')


In [50]:
# 타겟 변수 설정
target_column = 'purchase_price'

# 특성과 타겟 분리
X = df.drop(columns=[target_column])  # 타겟 컬럼 제외
y = df[target_column]

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# 결측값 처리
df['city'] = df['city'].fillna(df['city'].mode()[0])
df['dk_ann_infl_rate%'].fillna(df['dk_ann_infl_rate%'].mean())
df['yield_on_mortgage_credit_bonds%'].fillna(df['yield_on_mortgage_credit_bonds%'].mean())

# 범주형 및 수치형 컬럼 분리
categorical_columns = ['house_type', 'sales_type', 'city', 'region']
numerical_columns = ['year_build', '%_change_between_offer_and_purchase', 
                     'no_rooms', 'sqm', 'sqm_price', 'nom_interest_rate%', 
                     'dk_ann_infl_rate%', 'yield_on_mortgage_credit_bonds%']

# 데이터 전처리 파이프라인
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

# 훈련/테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 변환
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("데이터 전처리 완료!")

데이터 전처리 완료!


In [15]:
# 불필요한 컬럼 제거
df = df.drop(['date', 'address', 'zip_code'], axis=1)

# 최종 데이터프레임 확인
print("최종 데이터프레임 컬럼:")
print(df.columns)

최종 데이터프레임 컬럼:
Index(['quarter', 'house_id', 'purchase_price',
       '%_change_between_offer_and_purchase', 'no_rooms', 'sqm', 'sqm_price',
       'city', 'nom_interest_rate%', 'dk_ann_infl_rate%',
       'yield_on_mortgage_credit_bonds%', 'house_age', 'house_type_Apartment',
       'house_type_Farm', 'house_type_Summerhouse', 'house_type_Townhouse',
       'house_type_Villa', 'sales_type_auction', 'sales_type_family_sale',
       'sales_type_other_sale', 'sales_type_regular_sale', 'area_Bornholm',
       'area_Capital, Copenhagen', 'area_East & mid jutland',
       'area_Fyn & islands', 'area_North Zealand', 'area_North jutland',
       'area_Other islands', 'area_South jutland', 'region_Bornholm',
       'region_Fyn & islands', 'region_Jutland', 'region_Zealand'],
      dtype='object')


In [16]:
# 입력(X)과 타겟(y) 설정
X = df.drop('purchase_price', axis=1)  # 입력 데이터
y = df['purchase_price']              # 타겟 (집값)

# 학습 데이터와 테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 분할 확인
print("학습 데이터 크기:", X_train.shape)
print("테스트 데이터 크기:", X_test.shape)

학습 데이터 크기: (80000, 32)
테스트 데이터 크기: (20000, 32)


In [18]:
from sklearn.preprocessing import StandardScaler

# 숫자형 데이터만 선택 (dtype이 float 또는 int인 컬럼만 선택)
numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns

# 스케일러 생성
scaler = StandardScaler()

# 숫자형 데이터에만 스케일링 적용
X_train_scaled = scaler.fit_transform(X_train[numerical_features])
X_test_scaled = scaler.transform(X_test[numerical_features])

# 스케일링 후 데이터프레임으로 변환 (옵션)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=numerical_features, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=numerical_features, index=X_test.index)

# 확인
print("스케일링 후 X_train:")
print(X_train_scaled.head())

스케일링 후 X_train:
       house_id  %_change_between_offer_and_purchase  no_rooms       sqm  \
75220  0.860906                             0.430126  2.788892  1.133532   
48955 -0.055991                             0.017005 -2.037224 -0.672477   
44966 -0.199174                            -1.015797 -0.830695 -0.269193   
13568 -1.288765                             0.430126  1.582363  0.274363   
92727  1.471703                             0.430126 -1.433960 -1.356305   

       sqm_price  nom_interest_rate%  dk_ann_infl_rate%  \
75220  -0.923619            0.154945          -0.064722   
48955   0.214443           -0.820209          -0.902112   
44966  -0.687336           -0.820209          -1.024359   
13568   0.414095           -0.820209           3.529331   
92727  -0.668836            0.764416           0.155322   

       yield_on_mortgage_credit_bonds%  house_age  
75220                         0.113857  -0.325901  
48955                        -0.605416   0.375295  
44966           

In [32]:
# 'quarter' 컬럼 분리: 연도와 분기 숫자 추출
X_train['year'] = X_train['quarter'].str[:4].astype(int)  # 연도 부분 추출
X_train['quarter_num'] = X_train['quarter'].str[-1].astype(int)  # 분기 숫자 부분 추출

X_test['year'] = X_test['quarter'].str[:4].astype(int)
X_test['quarter_num'] = X_test['quarter'].str[-1].astype(int)

# 원래 'quarter' 컬럼 삭제
X_train = X_train.drop(columns=['quarter'])
X_test = X_test.drop(columns=['quarter'])

# 변환 후 확인
print(X_train.head())
print(X_test.head())

       house_id  purchase_price  %_change_between_offer_and_purchase  \
75220   1127663          735000                                  0.0   
48955    728835         1760000                                 -2.0   
44966    666554          800000                                 -7.0   
13568    192608         3200000                                  0.0   
92727   1393345          378058                                  0.0   

       no_rooms    sqm  sqm_price           city  nom_interest_rate%  \
75220         9  194.0   3788.660         Struer                2.00   
48955         1   91.0  19340.660       Aarhus C                0.00   
44966         3  114.0   7017.544           Ejby                0.00   
13568         7  145.0  22068.965  Frederikshavn                0.00   
92727         2   52.0   7270.346        Thisted                3.25   

       dk_ann_infl_rate%  yield_on_mortgage_credit_bonds%  ...  \
75220               1.82                             4.36  ...   
48

In [41]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 범주형 컬럼 정의
categorical_columns = ['city', 'house_type', 'sales_type', 'region']  # 예시 범주형 컬럼

# ColumnTransformer 설정
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'  # 나머지 컬럼은 그대로 유지
)

# 훈련 데이터에 fit_transform 적용
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# OneHotEncoder가 추가한 열 이름 가져오기
ohe_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_columns)
new_feature_names = list(ohe_feature_names) + list(X_train.drop(columns=categorical_columns).columns)

# 스케일링 및 데이터 프레임화
X_train_final = pd.DataFrame(X_train_transformed, columns=new_feature_names)
X_test_final = pd.DataFrame(X_test_transformed, columns=new_feature_names)

# 확인
print(X_train_final.head())
print(X_test_final.head())

ValueError: A given column is not a column of the dataframe

In [42]:
print(X_train.dtypes)
print(X_train.head())

house_id                                 int64
purchase_price                           int64
%_change_between_offer_and_purchase    float64
no_rooms                                 int64
sqm                                    float64
                                        ...   
city_Ãrre                                bool
city_Ãrslev                              bool
city_Ãrøskøbing                          bool
city_Årre                                 bool
city_Ølstykke                             bool
Length: 764, dtype: object
       house_id  purchase_price  %_change_between_offer_and_purchase  \
75220   1127663          735000                                  0.0   
48955    728835         1760000                                 -2.0   
44966    666554          800000                                 -7.0   
13568    192608         3200000                                  0.0   
92727   1393345          378058                                  0.0   

       no_rooms    sqm 

In [43]:
from sklearn.linear_model import LogisticRegression

# 모델 생성 및 학습
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# 예측
y_pred = log_reg.predict(X_test)

# 결과 출력
print("모델 예측 완료!")

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- city_Bjørnø
Feature names seen at fit time, yet now missing:
- city_Barsø
- city_Lyø
- city_Odense NØ
- city_Randers NØ
- city_Redsted M
- ...
