In [1]:
import pathlib
from glob import glob

import geopandas as gpd
import joblib
import numpy as np
import pandas as pd

from gwlearn.ensemble import GWRandomForestClassifier
from gwlearn.linear_model import GWLogisticRegression
from gwlearn.search import BandwidthSearch

In [2]:
import gwlearn

gwlearn.__version__

'0.1.dev69+gd54e68e.d20250703'

In [3]:
storage_options = {
    "client_kwargs": {
        "endpoint_url": "https://s3.cl4.du.cesnet.cz",
    },
    "profile": "uscuni"
}


pcas = gpd.read_parquet("s3://uscuni-restricted/05_pcs/pcs_29.parquet", storage_options=storage_options)
clusters = pd.read_csv(
    "s3://uscuni-restricted/04_spatial_census/cluster_assignment_v10.csv",
    dtype={"kod_nadzsj_d": str},
    storage_options=storage_options
)
cluster_mapping = pd.read_parquet(
    "s3://uscuni-ulce/processed_data/clusters/cluster_mapping_v10.pq", storage_options=storage_options
)
data = pcas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")
variables = data.columns.drop(["geometry", "kod_nadzsj_d", "final_without_noise"])


In [4]:
data_sample = data.sample(5000)

mapped = data_sample["final_without_noise"].map(cluster_mapping[3])
mapped.value_counts()

final_without_noise
4    2145
5    1013
3     544
7     406
1     361
8     326
2     119
6      86
Name: count, dtype: int64

In [5]:
y = mapped == 6
y.sum() / y.shape[0]

np.float64(0.0172)

In [14]:
search = BandwidthSearch(
    GWRandomForestClassifier,
    geometry=data_sample.representative_point(),
    fixed=False,
    n_jobs=-1,
    search_method="interval",
    min_bandwidth=100,
    max_bandwidth=400,
    interval=50,
    criterion="aicc",
    metrics=['oob_score', 'prediction_rate'],
    verbose=2,
    # batch_size=1000,
    min_proportion=0.1,
    class_weight="balanced",
    undersample=True,
)
search.fit(
    data_sample[variables],
    y,

)

0.00s: Building weights
0.22s: Weights ready
0.22s: Fitting the models
10.92s: Models fitted
11.04s: Measuring focal performance
11.04s: Computing global likelihood
11.04s: Computing information criteria
11.05s: Measuring pooled performance
11.05s: Measuring local pooled performance
11.07s: Finished
Bandwidth: 100.00, aicc: nan
0.00s: Building weights
0.30s: Weights ready
0.30s: Fitting the models
7.36s: Models fitted
7.47s: Measuring focal performance
7.47s: Computing global likelihood
7.47s: Computing information criteria
7.48s: Measuring pooled performance
7.48s: Measuring local pooled performance
7.50s: Finished
Bandwidth: 150.00, aicc: nan
0.00s: Building weights
0.38s: Weights ready
0.38s: Fitting the models
5.82s: Models fitted
5.93s: Measuring focal performance
5.93s: Computing global likelihood
5.93s: Computing information criteria
5.93s: Measuring pooled performance
5.94s: Measuring local pooled performance
5.96s: Finished
Bandwidth: 200.00, aicc: nan
0.00s: Building weights


  ).fit(X=X, y=y)


0.85s: Weights ready
0.85s: Fitting the models
2.98s: Models fitted
3.09s: Measuring focal performance
3.09s: Computing global likelihood
3.09s: Computing information criteria
3.10s: Measuring pooled performance
Bandwidth: 350.00, aicc: nan
0.00s: Building weights


  ).fit(X=X, y=y)


0.99s: Weights ready
0.99s: Fitting the models
3.51s: Models fitted
3.62s: Measuring focal performance
3.62s: Computing global likelihood
3.62s: Computing information criteria
3.63s: Measuring pooled performance
Bandwidth: 400.00, aicc: nan


  ).fit(X=X, y=y)
  self.optimal_bandwidth_ = self.scores_.idxmin()


<gwlearn.search.BandwidthSearch at 0x176827b10>

In [15]:
search.metrics_

Unnamed: 0,oob_score,prediction_rate
100,0.818542,0.0586
150,0.830415,0.0612
200,0.867398,0.0464
250,0.875467,0.0484
300,,
350,,
400,,


In [16]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier

In [17]:
rus = RandomUnderSampler()
data_us, y_us = rus.fit_resample(    data_sample[variables],
    y,)

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    data_us, y_us, test_size=0.33, random_state=42)


In [24]:
gm = RandomForestClassifier(n_jobs=-1, oob_score=True).fit(data_us, y_us)

In [25]:
gm.oob_score_

0.8255813953488372

In [57]:
from sklearn.metrics import accuracy_score

In [58]:
accuracy_score(y_test, gm.predict(X_test))

0.6431254695717505

In [73]:
l = []

In [74]:
if l:
    print(1)