In [48]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt

In [73]:
# Load the California housing dataset as a pandas-backed dataset object.
california_data = fetch_california_housing(as_frame=True)
# Work on a copy: derived columns are added to `data` further down, and taking
# a copy keeps `california_data.frame` untouched so re-running cells stays safe.
data = california_data.frame.copy()

In [74]:
# Step 2: Reference coordinates of the major cities, stored per city name as a
# (latitude, longitude) tuple in degrees.
cities = {
    name: (lat, lon)
    for name, lat, lon in [
        ("San Diego", 32.7157, -117.1611),
        ("San Francisco", 37.7749, -122.4194),
        ("Los Angeles", 34.0522, -118.2437),
    ]
}

In [75]:
# Haversine formula used to calculate great-circle distances
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points
    on the Earth's surface using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or array-like
        Distance in kilometers. All arithmetic goes through NumPy ufuncs, so
        array inputs are processed element-wise.
    """
    R = 6371.0  # Earth radius in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt(1 - a) NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

In [80]:
# Step 4: Helper that measures how far one row lies from its closest major city
def calculate_min_distance(row):
    """
    Return the smallest haversine distance from `row` to any entry of `cities`.

    `row` is indexed by the 'Latitude' and 'Longitude' keys; each city in the
    module-level `cities` mapping supplies a (latitude, longitude) pair. The
    result is in the units returned by `haversine` (kilometers).
    """
    row_lat = row['Latitude']
    row_lon = row['Longitude']
    return min(
        haversine(row_lat, row_lon, city_lat, city_lon)
        for city_lat, city_lon in cities.values()
    )

In [77]:
# Step 5: Compute the distance to the nearest major city for every row.
# `haversine` is built from NumPy ufuncs, so each city is evaluated against the
# whole latitude/longitude columns at once instead of looping row by row with
# DataFrame.apply(axis=1); the element-wise minimum across cities is then kept.
row_latitudes = data['Latitude'].to_numpy()
row_longitudes = data['Longitude'].to_numpy()
data['min_distance_to_city'] = np.min(
    [
        haversine(row_latitudes, row_longitudes, city_lat, city_lon)
        for city_lat, city_lon in cities.values()
    ],
    axis=0,
)

In [78]:
# Step 6: Persist the enhanced dataset (original columns plus the distance
# feature) to a CSV file, leaving the DataFrame index out of the file.
output_csv = 'california_housing_with_distances.csv'
data.to_csv(output_csv, index=False)

In [79]:
# Step 7: Print a preview of the dataset
# Show the 40 rows at positions 2000 through 2039 (the iloc end bound is exclusive).
print(data.iloc[2000:2040])

      MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
2000  2.5045      15.0  4.103933   1.165730       924.0  2.595506     36.74   
2001  1.2375      35.0  4.459916   1.067511      2050.0  4.324895     36.75   
2002  1.2813      31.0  3.627907   0.979328      1515.0  3.914729     36.75   
2003  1.0513      35.0  2.951557   1.024221      1228.0  4.249135     36.74   
2004  2.1094      52.0  2.059524   1.035714       401.0  4.773810     36.74   
2005  0.7990      25.0  3.645435   1.150743      1343.0  2.851380     36.74   
2006  1.3527      41.0  3.711409   1.042506      1391.0  3.111857     36.75   
2007  1.1230      33.0  3.736407   1.104019      3530.0  4.172577     36.75   
2008  1.8967      52.0  5.000000   1.049180       406.0  3.327869     36.74   
2009  1.6435      25.0  5.070727   1.037328      1642.0  3.225933     36.72   
2010  2.6176      17.0  5.361345   1.016807       927.0  3.894958     36.72   
2011  2.4137      21.0  5.386076   1.132911      134

In [56]:
# Split the frame into the prediction target (y) and the feature matrix (X)
target_column = "MedHouseVal"  # name of the column to predict
y = data[target_column]  # target values
X = data.drop(columns=[target_column])  # every remaining column is a feature

In [66]:
# Define the pipeline: standardise the features, then fit a k-nearest-neighbours
# regressor. The step names ("scale", "model") are what parameter grids refer to.
scale_step = ("scale", StandardScaler())
model_step = ("model", KNeighborsRegressor())
pipe = Pipeline(steps=[scale_step, model_step])

In [67]:
# Use GridSearchCV to tune the number of neighbours of the "model" step with
# 3-fold cross-validation.
# NOTE(review): the recorded results rank n_neighbors=5, the upper edge of this
# grid, as best; consider widening the grid.
neighbor_grid = {'model__n_neighbors': [1, 2, 3, 4, 5]}
mod = GridSearchCV(estimator=pipe, param_grid=neighbor_grid, cv=3)

In [68]:
# Run the grid search: cross-validate the pipeline on (X, y) for every
# n_neighbors candidate configured on `mod`.
mod.fit(X, y)

In [69]:
# Tabulate the cross-validation results of the fitted search, one row per
# parameter candidate, and display the table as the cell's output.
cv_results_table = pd.DataFrame(data=mod.cv_results_)
cv_results_table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.021873,0.007462,0.432273,0.060939,1,{'model__n_neighbors': 1},0.168591,0.360018,0.295997,0.274869,0.079565,5
1,0.016436,0.000113,0.476765,0.057837,2,{'model__n_neighbors': 2},0.316701,0.506395,0.376559,0.399885,0.079179,4
2,0.015427,0.000178,0.418895,0.061979,3,{'model__n_neighbors': 3},0.376404,0.543473,0.40607,0.441982,0.07278,3
3,0.016254,0.000932,0.474143,0.077947,4,{'model__n_neighbors': 4},0.406501,0.568286,0.425274,0.466687,0.072249,2
4,0.016259,0.000178,0.557001,0.064574,5,{'model__n_neighbors': 5},0.420102,0.579128,0.434854,0.478028,0.071742,1
