# 기본 설정

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.options.mode.copy_on_write = True
%precision 6
pd.set_option('display.precision', 6)
data_url = 'https://raw.githubusercontent.com/codingalzi/DataSci/refs/heads/master/data/'

# 10장 요약

In [91]:
# Read the California housing CSV located under `data_url` and display it.
housing_csv_url = data_url + "california_housing.csv"
housing = pd.read_csv(housing_csv_url)
housing

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [92]:
# Keep only the two columns used in the rest of this section.
# The columns are selected by label rather than by position, so the cell
# does not depend on the column order of the CSV and can be re-run safely
# (a positional `iloc[:, [7, 8]]` fails once the frame has been reduced).
housing = housing[['median_income', 'median_house_value']]
housing

Unnamed: 0,median_income,median_house_value
0,8.3252,452600.0
1,8.3014,358500.0
2,7.2574,352100.0
3,5.6431,341300.0
4,3.8462,342200.0
...,...,...
20635,1.5603,78100.0
20636,2.5568,77100.0
20637,1.7000,92300.0
20638,1.8672,84700.0


In [93]:
# Remove every row whose median_house_value equals the column maximum,
# then renumber the rows from 0 and label the index axis 'index'.
# NOTE(review): the maximum is presumably a capped value in this dataset —
# confirm against the data description.
hv_max = housing['median_house_value'].max()
mask = housing['median_house_value'] >= hv_max
housing = (
    housing.loc[~mask]          # rows strictly below the maximum (NaN kept)
    .reset_index(drop=True)     # discard the old row labels
    .rename_axis('index')       # name the new RangeIndex
)
housing

Unnamed: 0_level_0,median_income,median_house_value
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8.3252,452600.0
1,8.3014,358500.0
2,7.2574,352100.0
3,5.6431,341300.0
4,3.8462,342200.0
...,...,...
19670,1.5603,78100.0
19671,2.5568,77100.0
19672,1.7000,92300.0
19673,1.8672,84700.0


In [94]:
# Bucket median_income into five ordered categories labelled 1..5.
# Bin edges: (0, 1.5], (1.5, 3], (3, 4.5], (4.5, 6], (6, inf) — pd.cut uses
# right-closed intervals by default.
income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bins,
    labels=income_labels,
)
housing

Unnamed: 0_level_0,median_income,median_house_value,income_cat
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8.3252,452600.0,5
1,8.3014,358500.0,5
2,7.2574,352100.0,5
3,5.6431,341300.0,4
4,3.8462,342200.0,3
...,...,...,...
19670,1.5603,78100.0,2
19671,2.5568,77100.0,2
19672,1.7000,92300.0,2
19673,1.8672,84700.0,2


In [108]:
# Count the non-null entries of every column per income category over the
# full `housing` frame. observed=True limits the result to categories that
# actually occur in the data.
income_groups = housing.groupby(by='income_cat', observed=True)
stratified_count = income_groups.count()
stratified_count

Unnamed: 0_level_0,median_income,median_house_value
income_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,814,814
2,6552,6552
3,7103,7103
4,3502,3502
5,1704,1704


In [101]:
# Draw a simple random sample of 10% of the rows. The fixed seed makes the
# sample identical on every run.
sample_fraction = 0.1
sample_seed = 42
random_sampling = housing.sample(
    frac=sample_fraction,
    random_state=sample_seed,
)
random_sampling

Unnamed: 0_level_0,median_income,median_house_value,income_cat
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14447,1.8357,104200.0,2
13921,4.2109,171200.0,3
12981,4.0481,97300.0,3
2579,3.5380,102700.0,3
12162,2.2000,116500.0,2
...,...,...,...
19377,6.6246,284200.0,5
485,2.9405,289500.0,2
5018,1.6027,97300.0,2
967,5.5000,247200.0,4


In [107]:
# Count the non-null entries of every column per income category within the
# random sample. observed=True limits the result to categories that actually
# occur in the sample.
sample_groups = random_sampling.groupby(by='income_cat', observed=True)
random_sampling_count = sample_groups.count()
random_sampling_count

Unnamed: 0_level_0,median_income,median_house_value
income_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,87,87
2,644,644
3,702,702
4,347,347
5,188,188


In [106]:
# Turn the per-category counts of the random sample into proportions:
# each column is divided by its own total, so every column sums to 1.
sample_totals = random_sampling_count.sum()
random_sampling_ratio = random_sampling_count.div(sample_totals)
random_sampling_ratio

Unnamed: 0_level_0,median_income,median_house_value
income_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.044207,0.044207
2,0.327236,0.327236
3,0.356707,0.356707
4,0.176321,0.176321
5,0.095528,0.095528


In [109]:
# Turn the per-category counts of the full dataset into proportions:
# each column is divided by its own total, so every column sums to 1.
overall_totals = stratified_count.sum()
stratified_ratio = stratified_count.div(overall_totals)
stratified_ratio

Unnamed: 0_level_0,median_income,median_house_value
income_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.041372,0.041372
2,0.333011,0.333011
3,0.361017,0.361017
4,0.177992,0.177992
5,0.086607,0.086607


In [124]:
# Side-by-side comparison of income-category proportions: full dataset
# versus the random sample. The second column (position 1) of each ratio
# table is used; it is kept as a one-column DataFrame for the concat.
overall_share = stratified_ratio.iloc[:, [1]]
sample_share = random_sampling_ratio.iloc[:, [1]]
proportions = (
    pd.concat([overall_share, sample_share], axis=1)
    .set_axis(['전체', '무작위'], axis=1)   # column labels
    .rename_axis('소득구간')                # index (row axis) name
)
proportions

Unnamed: 0_level_0,전체,무작위
소득구간,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.041372,0.044207
2,0.333011,0.327236
3,0.361017,0.356707
4,0.177992,0.176321
5,0.086607,0.095528


# 11.1 확률분포