In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/Rstam59/TaskDataRepoForStudents/refs/heads/main/housing.csv')

In [3]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [4]:
if df['median_house_value'].isna().sum() > 0:
    df.dropna(subset='median_house_value')

In [5]:
X = df.drop(['median_house_value'], axis=1)
y = df['median_house_value'].copy()

In [6]:
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,INLAND


In [7]:
y

Unnamed: 0,median_house_value
0,452600.0
1,358500.0
2,352100.0
3,341300.0
4,342200.0
...,...
20635,78100.0
20636,77100.0
20637,92300.0
20638,84700.0


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 9), (4128, 9), (16512,), (4128,))

In [11]:
len(X_train.columns), len(X_test.columns)

(9, 9)

In [12]:
num_features = X_train.select_dtypes(include=[np.number]).columns

In [13]:
categ_features = X_train.select_dtypes(include='object').columns
# X_train.select_dtypes(exclude=[np.number]).columns

In [14]:
#from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [15]:
num_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

categ_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

transformer = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('object', categ_pipeline, categ_features)
], remainder='passthrough')

In [17]:
X_train_transformed = transformer.fit_transform(X_train)
col_names = transformer.get_feature_names_out()
#X_train_transformed_df = pd.DataFrame(data=X_train_transformed, columns=col_names)

#or

#transformer.fit(X_train)
#then, make X_train_transformed being just transformer.transform()

In [18]:
X_train_transformed = pd.DataFrame(data = transformer.fit_transform(X_train), columns=col_names)

In [19]:
X_train_transformed

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,object__ocean_proximity_<1H OCEAN,object__ocean_proximity_INLAND,object__ocean_proximity_ISLAND,object__ocean_proximity_NEAR BAY,object__ocean_proximity_NEAR OCEAN
0,1.172993,-1.350415,0.428537,1.570557,1.376799,1.081011,1.507507,0.379698,0.0,0.0,0.0,0.0,1.0
1,1.268028,-1.378536,-1.473509,-0.809439,-0.900718,-0.643842,-0.878707,0.420068,0.0,0.0,0.0,0.0,1.0
2,-1.352939,0.988349,-0.046974,1.994289,2.441082,1.363196,2.593828,-0.092320,0.0,0.0,0.0,1.0,0.0
3,-1.127856,0.758691,-0.284730,0.646558,0.230833,0.661262,0.394820,0.682999,1.0,0.0,0.0,0.0,0.0
4,1.793222,-1.083261,-1.632013,-1.117906,-1.181804,-1.203802,-1.255755,-1.255560,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-1.402957,1.082087,1.617317,-0.777706,-0.742156,-0.731143,-0.804879,-1.335305,0.0,0.0,0.0,1.0,0.0
16508,0.592779,-0.816108,0.507789,-0.400173,-0.499510,-0.613860,-0.496385,1.421304,1.0,0.0,0.0,0.0,0.0
16509,0.117604,0.304062,-0.997998,-0.005374,-0.026228,-0.309630,0.052048,-0.911522,0.0,1.0,0.0,0.0,0.0
16510,1.187999,-0.727057,-0.522486,-0.078641,0.041040,0.122465,-0.016506,-0.634382,0.0,1.0,0.0,0.0,0.0


In [20]:
X_test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
20046,-122.38,40.67,10.0,2281.0,444.0,1274.0,438.0,2.2120,INLAND
3024,-118.37,33.83,35.0,1207.0,207.0,601.0,213.0,4.7308,<1H OCEAN
15663,-117.24,32.72,39.0,3089.0,431.0,1175.0,432.0,7.5925,NEAR OCEAN
20484,-118.44,34.05,18.0,4780.0,1192.0,1886.0,1036.0,4.4674,<1H OCEAN
9814,-118.44,34.18,33.0,2127.0,414.0,1056.0,391.0,4.3750,<1H OCEAN
...,...,...,...,...,...,...,...,...,...
15362,-121.92,40.52,13.0,4581.0,881.0,1799.0,734.0,2.2993,INLAND
16623,-122.08,37.68,26.0,2607.0,682.0,1401.0,607.0,2.6563,NEAR BAY
18086,-119.00,35.39,42.0,2839.0,516.0,1203.0,487.0,3.7708,INLAND
2144,-117.92,33.63,39.0,1469.0,226.0,553.0,225.0,7.8496,<1H OCEAN


In [21]:
X_test_transformed = pd.DataFrame(transformer.transform(X_test), columns=col_names)

In [22]:
X_test_transformed

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,object__ocean_proximity_<1H OCEAN,object__ocean_proximity_INLAND,object__ocean_proximity_ISLAND,object__ocean_proximity_NEAR BAY,object__ocean_proximity_NEAR OCEAN
0,-1.407959,2.361612,-1.473509,-0.165440,-0.225631,-0.135910,-0.164161,-0.865909,0.0,1.0,0.0,0.0,0.0
1,0.597781,-0.844229,0.507789,-0.666640,-0.795010,-0.729379,-0.757419,0.454671,1.0,0.0,0.0,0.0,0.0
2,1.162989,-1.364476,0.824797,0.211626,-0.256863,-0.223211,-0.179981,1.955030,0.0,0.0,0.0,0.0,1.0
3,0.562768,-0.741118,-0.839494,1.000758,1.571397,0.403768,1.412586,0.316574,1.0,0.0,0.0,0.0,0.0
4,0.562768,-0.680188,0.349286,-0.237307,-0.297704,-0.328148,-0.288086,0.268129,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-1.177874,2.291308,-1.235754,0.907891,0.824237,0.327049,0.616302,-0.820138,0.0,1.0,0.0,0.0,0.0
4124,-1.257904,0.960228,-0.205478,-0.013307,0.346151,-0.023918,0.281441,-0.632967,0.0,0.0,0.0,1.0,0.0
4125,0.282665,-0.113073,1.062553,0.094959,-0.052655,-0.198520,-0.034963,-0.048646,0.0,1.0,0.0,0.0,0.0
4126,0.822864,-0.937967,0.824797,-0.544373,-0.749364,-0.771707,-0.725778,2.089825,1.0,0.0,0.0,0.0,0.0


In [23]:
from sklearn.ensemble import RandomForestRegressor

In [24]:
rfr = RandomForestRegressor()

In [25]:
rfr.fit(X_train_transformed, y_train)

In [26]:
print(rfr.score(X_train_transformed, y_train))
print(rfr.score(X_test_transformed, y_test))

0.9750373888876412
0.8224281922300049


In [27]:
# Feature Engineering
df['bedroom_ratio'] = df['total_bedrooms'] / df['total_rooms']
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['population_per_household'] = df['population'] / df['households']

In [28]:
!unzip /content/Sleep_Efficiency.csv

unzip:  cannot find or open /content/Sleep_Efficiency.csv, /content/Sleep_Efficiency.csv.zip or /content/Sleep_Efficiency.csv.ZIP.


In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 9), (4128, 9), (16512,), (4128,))

In [47]:
len(X_train.columns), len(X_test.columns)

(9, 9)

In [48]:
num_features = X_train.select_dtypes(include=[np.number]).columns

In [49]:
cat_features = X_train.select_dtypes(include='object').columns

In [50]:
num_pipline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler())
])

cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

transformer = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('object', cat_pipeline, cat_features)
], remainder='passthrough')

In [52]:
transformer.fit_transform(X_train)

array([[ 1.17299302, -1.35041487,  0.42853749, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.26802809, -1.37853628, -1.47350948, ...,  0.        ,
         0.        ,  1.        ],
       [-1.3529389 ,  0.98834939, -0.04697426, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.11760365,  0.30406165, -0.99799774, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.18799856, -0.72705686, -0.522486  , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.35269038, -0.66144022, -0.522486  , ...,  0.        ,
         0.        ,  0.        ]])

In [53]:
get_col_names = transformer.get_feature_names_out()
get_col_names

array(['num__longitude', 'num__latitude', 'num__housing_median_age',
       'num__total_rooms', 'num__total_bedrooms', 'num__population',
       'num__households', 'num__median_income',
       'object__ocean_proximity_<1H OCEAN',
       'object__ocean_proximity_INLAND', 'object__ocean_proximity_ISLAND',
       'object__ocean_proximity_NEAR BAY',
       'object__ocean_proximity_NEAR OCEAN'], dtype=object)

In [54]:
X_train_transformed = pd.DataFrame(data=transformer.transform(X_train), columns=get_col_names)

In [55]:
X_train_transformed

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,object__ocean_proximity_<1H OCEAN,object__ocean_proximity_INLAND,object__ocean_proximity_ISLAND,object__ocean_proximity_NEAR BAY,object__ocean_proximity_NEAR OCEAN
0,1.172993,-1.350415,0.428537,1.570557,1.376799,1.081011,1.507507,0.379698,0.0,0.0,0.0,0.0,1.0
1,1.268028,-1.378536,-1.473509,-0.809439,-0.900718,-0.643842,-0.878707,0.420068,0.0,0.0,0.0,0.0,1.0
2,-1.352939,0.988349,-0.046974,1.994289,2.441082,1.363196,2.593828,-0.092320,0.0,0.0,0.0,1.0,0.0
3,-1.127856,0.758691,-0.284730,0.646558,0.230833,0.661262,0.394820,0.682999,1.0,0.0,0.0,0.0,0.0
4,1.793222,-1.083261,-1.632013,-1.117906,-1.181804,-1.203802,-1.255755,-1.255560,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,-1.402957,1.082087,1.617317,-0.777706,-0.742156,-0.731143,-0.804879,-1.335305,0.0,0.0,0.0,1.0,0.0
16508,0.592779,-0.816108,0.507789,-0.400173,-0.499510,-0.613860,-0.496385,1.421304,1.0,0.0,0.0,0.0,0.0
16509,0.117604,0.304062,-0.997998,-0.005374,-0.026228,-0.309630,0.052048,-0.911522,0.0,1.0,0.0,0.0,0.0
16510,1.187999,-0.727057,-0.522486,-0.078641,0.041040,0.122465,-0.016506,-0.634382,0.0,1.0,0.0,0.0,0.0


In [56]:
X_transformed_test = pd.DataFrame(transformer.transform(X_test), columns=get_col_names)

In [57]:
X_transformed_test

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,object__ocean_proximity_<1H OCEAN,object__ocean_proximity_INLAND,object__ocean_proximity_ISLAND,object__ocean_proximity_NEAR BAY,object__ocean_proximity_NEAR OCEAN
0,-1.407959,2.361612,-1.473509,-0.165440,-0.225631,-0.135910,-0.164161,-0.865909,0.0,1.0,0.0,0.0,0.0
1,0.597781,-0.844229,0.507789,-0.666640,-0.795010,-0.729379,-0.757419,0.454671,1.0,0.0,0.0,0.0,0.0
2,1.162989,-1.364476,0.824797,0.211626,-0.256863,-0.223211,-0.179981,1.955030,0.0,0.0,0.0,0.0,1.0
3,0.562768,-0.741118,-0.839494,1.000758,1.571397,0.403768,1.412586,0.316574,1.0,0.0,0.0,0.0,0.0
4,0.562768,-0.680188,0.349286,-0.237307,-0.297704,-0.328148,-0.288086,0.268129,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4123,-1.177874,2.291308,-1.235754,0.907891,0.824237,0.327049,0.616302,-0.820138,0.0,1.0,0.0,0.0,0.0
4124,-1.257904,0.960228,-0.205478,-0.013307,0.346151,-0.023918,0.281441,-0.632967,0.0,0.0,0.0,1.0,0.0
4125,0.282665,-0.113073,1.062553,0.094959,-0.052655,-0.198520,-0.034963,-0.048646,0.0,1.0,0.0,0.0,0.0
4126,0.822864,-0.937967,0.824797,-0.544373,-0.749364,-0.771707,-0.725778,2.089825,1.0,0.0,0.0,0.0,0.0


In [58]:
rfr = RandomForestRegressor()

In [59]:
rfr.fit(X_train_transformed, y_train)

In [60]:
print(rfr.score(X_train_transformed, y_train))
print(rfr.score(X_transformed_test, y_test))

0.9751533479196444
0.8235210812996699


In [67]:
sleep_imputer = SimpleImputer(strategy='median')