## Module imports

In [None]:
import geopandas as gpd
import numpy as np
import sklearn
gpd.options.io_engine = "pyogrio"
%matplotlib inline

### Import data

In [None]:
# Load the training set as a GeoDataFrame.
# NOTE: the former `index_col=0` keyword was dropped. It is a pandas.read_csv
# option that geopandas.read_file does not define; it was only forwarded to the
# I/O engine as an unrecognised driver open option and did not set an index.
# NOTE(review): if the file carries its own index column, set it explicitly
# with `train_df.set_index(...)` — confirm against the data.
train_df = gpd.read_file("./data/train.geojson")

## <center> <font color='#0590C0'> Geo data processing </font> </center>

In [None]:
# Tools to compute the main angle of a polygon, in degrees

from shapely import LineString

def segments(curve):
    """Split a curve into its consecutive two-point segments.

    Args:
        curve: Object exposing a sliceable ``coords`` sequence of points.

    Returns:
        A list with one ``LineString`` per pair of successive points, in
        order; empty when the curve has fewer than two points.
    """
    points = curve.coords
    # Pair every point with its successor: (p0, p1), (p1, p2), ...
    return [LineString(pair) for pair in zip(points[:-1], points[1:])]

def main_angle(polygon) :
    curve = polygon.boundary
    seg = segments(curve)
    max_length = 0
    angle = 0
    for s in seg :
        coords = s.coords.xy
        real = coords[0][1] - coords[0][0]
        img = coords[1][1] - coords[1][0]
        z = real + img*1j
        l = np.sqrt(real**2 + img**2)
        if l > max_length :
            max_length = l
            angle = np.angle(z, deg = True)
    return angle % 180

In [None]:
# Geometry column of the training set, as a plain array of geometry objects
geo = train_df["geometry"]
array = np.asarray(geo)

# One row per geometry: its bounds followed by its main angle (5 features)
m = [[*shape.bounds, main_angle(shape)] for shape in array]
geo_data = np.array(m)

In [None]:
# PCA over the geometric features
from sklearn.decomposition import PCA

# A fractional n_components keeps enough components to explain 80% of the variance
pca = PCA(n_components=0.8).fit(geo_data)

# Project the geometric features onto the retained components
reduced_geo_data = pca.transform(geo_data)
reduced_geo_data.shape

## <center> <font color='#0590C0'> Color data processing </font> </center>

In [None]:
from sklearn.decomposition import PCA

# Feature columns: every column whose name starts with "img".
# Open question: should mean and standard-deviation columns be handled separately?
cols = [name for name in train_df if name.startswith("img")]

# Dense array of the selected columns
color = np.asarray(train_df[cols])
print(color)

# Keep only the rows whose values are all finite (drops rows holding NaN or +/-inf).
# NOTE(review): rows are removed here, so the result is no longer row-aligned
# with train_df — confirm that downstream code accounts for it.
row_is_finite = np.isfinite(color).all(axis=1)
finite_color = color[row_is_finite]

# A fractional n_components keeps enough components to explain 80% of the variance
pca = PCA(n_components=0.8).fit(finite_color)

# Project the color features onto the retained components
reduced_color = pca.transform(finite_color)
reduced_color.shape