<a href="https://colab.research.google.com/github/sac-1999/Pandas-Cheat-Sheet/blob/main/05_Stratified_Shuffle_Split.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
import os
import tarfile
import urllib.request

In [75]:
# Local directory (relative to the working directory) that holds the
# downloaded archive and the extracted CSV.
HOUSING_PATH = os.path.join('datasets', 'housing')
# Remote location of the gzipped housing tarball.
house_url = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.tgz'

In [76]:
def fetch_housing_data(url = house_url, housing_path = HOUSING_PATH):
  """Download the housing tarball from `url` and unpack it into `housing_path`.

  Args:
    url: Address of the ``housing.tgz`` archive to download.
    housing_path: Directory that receives the downloaded archive and its
      extracted contents; it is created if it does not exist yet.

  Any error raised by the download or by the tar extraction propagates to
  the caller.
  """
  os.makedirs(housing_path, exist_ok = True)
  tgz_path = os.path.join(housing_path, 'housing.tgz')
  # Honour the `url` argument: the previous version always fetched the global
  # `house_url`, silently ignoring whatever the caller passed in.
  urllib.request.urlretrieve(url, tgz_path)
  # The context manager closes the archive even if extraction raises.
  with tarfile.open(tgz_path) as housing_file:
    housing_file.extractall(housing_path)

In [77]:
# Download and extract the dataset with the default URL and target directory.
# NOTE: this runs the download again every time the cell is executed.
fetch_housing_data()

In [78]:
import pandas as pd

In [79]:
def load_housing_csv(housing_path = HOUSING_PATH):
  """Read ``housing.csv`` from `housing_path` and return it as a DataFrame.

  Args:
    housing_path: Directory containing ``housing.csv``.

  Returns:
    The DataFrame produced by ``pd.read_csv`` for that file.
  """
  return pd.read_csv(os.path.join(housing_path, 'housing.csv'))

In [80]:
# Load the extracted CSV from the default HOUSING_PATH; `df` is the working
# frame used by every cell below.
df = load_housing_csv()

In [81]:
# Preview the first five rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [82]:
# Row count per ocean_proximity label; note how rare ISLAND is (5 rows).
df.ocean_proximity.value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [83]:
import matplotlib.pyplot as plt

In [84]:
#df.hist(bins = 40,figsize=(20,15))

In [85]:
import numpy as np
# Demo of a random ordering of the integers 0..99.
# NOTE: no seed is set here, so the displayed order changes on every run.
np.random.permutation(100)

array([54, 88, 32, 66, 34, 40, 42, 74,  8, 86, 92, 69, 97,  4,  9, 41, 37,
       70, 29, 26, 95, 62, 16, 20, 30,  6, 46, 96, 65, 11, 76, 50, 99, 60,
       78,  0, 84, 59, 14, 17, 90, 72, 98, 43,  3, 36, 93, 77, 85, 22, 82,
       18, 38, 15,  7, 24, 28, 89, 12, 94, 35, 47, 44, 79, 31, 53, 21, 27,
        1, 58, 23, 63, 71, 55, 64, 45,  5, 13, 87, 57, 19, 56, 33, 81, 48,
       49, 75,  2, 83, 52, 73, 39, 67, 10, 61, 91, 80, 68, 51, 25])

In [86]:
# Bare `df` renders pandas' truncated view: the first and last rows with an
# ellipsis row in between.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [87]:
# Bucket median_income into five ordinal strata labelled 1..5. With pd.cut's
# default right-closed intervals the buckets are (0, 1.5], (1.5, 3], (3, 4.5],
# (4.5, 6] and (6, inf). The new column is added to `df` in place and is used
# below as the stratification key.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
df['median_income_category'] = pd.cut(
  df['median_income'], bins = income_bin_edges, labels = income_bin_labels)

In [88]:
# Confirm that the median_income_category column now appears on the frame.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,median_income_category
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3


# Stratified Shuffle Split

In [121]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single seeded shuffle split that holds out 20% of the rows for testing.
split = StratifiedShuffleSplit(test_size = 0.2, n_splits = 1, random_state = 0)
# Stratify on the income bucket. `final` is the iterable of
# (train indices, test indices) pairs consumed by the next cell.
income_strata = df['median_income_category']
final = split.split(df, income_strata)

In [122]:
# Materialise the stratified train/test frames.
# - Iterate split.split(...) directly instead of a previously created iterator,
#   so this cell can be re-run on its own; random_state on `split` is fixed,
#   so the indices are the same on every run.
# - The splitter yields positional row indices, so select with .iloc; .loc is
#   label-based and only matches while df keeps its default RangeIndex.
for train_idx, test_idx in split.split(df, df['median_income_category']):
  print(train_idx.shape, test_idx.shape)
  strat_train_set = df.iloc[train_idx]
  strat_test_set = df.iloc[test_idx]


(16512,) (4128,)


In [138]:
# Summary statistics for the test split. total_bedrooms reports a lower count
# than the other columns, i.e. it contains missing values.
strat_test_set.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,4128.0,4128.0,4128.0,4128.0,4089.0,4128.0,4128.0,4128.0,4128.0
mean,-119.598559,35.66766,28.760901,2656.423692,540.876498,1421.747093,501.770833,3.872602,207568.918605
std,2.000873,2.129195,12.723133,2242.601782,430.847035,1102.5665,390.241035,1.896624,115158.01745
min,-124.26,32.56,2.0,22.0,3.0,9.0,2.0,0.4999,14999.0
25%,-121.83,33.94,18.0,1443.5,292.0,777.0,278.0,2.5665,119950.0
50%,-118.56,34.29,29.0,2112.0,431.0,1163.0,409.0,3.53775,181650.0
75%,-118.02,37.72,37.0,3172.25,648.0,1718.5,604.25,4.7286,264750.0
max,-114.31,41.86,52.0,39320.0,6210.0,16305.0,5358.0,15.0001,500001.0


In [140]:
# Summary statistics for the training split, for comparison with the test
# split. total_bedrooms again reports a lower count (missing values).
strat_train_set.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,16512.0,16512.0,16512.0,16512.0,16344.0,16512.0,16512.0,16512.0,16512.0
mean,-119.562491,35.622912,28.609133,2630.597929,537.118514,1426.409157,498.981892,3.870188,206677.541485
std,2.004192,2.137609,12.551133,2166.138917,418.994461,1139.844349,380.336179,1.900677,115457.732747
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.7825,33.93,18.0,1448.75,297.0,789.0,280.0,2.5625,119600.0
50%,-118.49,34.25,29.0,2128.5,436.0,1167.0,410.0,3.5341,179200.0
75%,-118.0,37.71,37.0,3142.0,646.0,1726.0,605.0,4.75,264725.0
max,-114.47,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0
