In [None]:
import pathlib

import polars as pl
from tqdm import tqdm

from lmf import db

# Shortcut to the `av_fq_isog` attribute of `lmf.db`; it is queried through `.search(...)`.
# NOTE(review): presumably the LMFDB table of isogeny classes of abelian
# varieties over finite fields — confirm against the `lmf` package.
av_db = db.av_fq_isog


In [None]:
def query_av(q, g, limit=None):
    """
    Search `av_db` for the records matching `q` and `g` and return them as a
    polars DataFrame.

    The `poly` list of every record is expanded into the columns
    `a0` ... `a{2g}`; the other requested columns are copied through as-is.
    Only records with `poly[2] > 0` are kept.

    Parameters
    ----------
    q : value the `q` field of a record must equal.
    g : value the `g` field of a record must equal; it also fixes how many
        entries of `poly` (`2*g + 1`) are turned into columns.
    limit : forwarded unchanged to `av_db.search` (default None).

    Returns
    -------
    pl.DataFrame with the columns `label`, `a0` ... `a{2g}`, `p_rank`,
    `is_simple`, `has_principal_polarization`, `has_jacobian`, in that order.
    """
    coeff_names = [f"a{i}" for i in range(2 * g + 1)]
    flag_names = ["p_rank", "is_simple", "has_principal_polarization", "has_jacobian"]
    cols = ["label", "poly"] + flag_names

    result = []
    for r in tqdm(av_db.search({"q": q, "g": g}, cols, limit=limit)):
        poly = r["poly"]
        # *** Filter out some points for accurate results ***
        # Checked first so that rows which get dropped are never built.
        if poly[2] > 0:
            rdict = {"label": r["label"]}
            rdict.update((name, poly[i]) for i, name in enumerate(coeff_names))
            rdict.update((name, r[name]) for name in flag_names)
            result.append(rdict)

    # NOTE(review): every non-label column is declared Int64; if a field such as
    # `is_simple` arrives as a boolean this relies on polars accepting it for an
    # Int64 column — confirm against the table's column types.
    schema = [("label", pl.String)]
    schema += [(name, pl.Int64) for name in coeff_names]
    schema += [(name, pl.Int64) for name in flag_names]
    return pl.DataFrame(result, schema=schema)

def X_y(df, label, numc):
    """
    Split a polars dataframe into a feature frame and a target frame.

    Parameters
    ----------
    df : dataframe containing the columns `a0`, `a1`, ..., `a{numc-1}` and
        the column(s) named by `label`.
    label : column selection passed to `df.select` to obtain the target.
    numc : number of leading `a{k}` columns (k = 0 .. numc-1) used as features.

    Returns
    -------
    (X, y) where X is `df.select` of the `a{k}` columns and y is
    `df.select(label)`; both are whatever `df.select` returns.
    """
    feature_names = [f"a{k}" for k in range(numc)]
    return df.select(feature_names), df.select(label)

In [None]:
# Fetch the q=17, g=2 records; `limit=None` is forwarded to the search unchanged.
data = query_av(q=17, g=2, limit=None)

# Show the resulting column names as a quick sanity check.
print(data.columns)

In [None]:
from itertools import combinations_with_replacement
# Restrict to the rows whose `has_jacobian` value is -1 and keep only `a1` and `a2`.
# NOTE(review): presumably -1 encodes "known not to be a Jacobian" in the
# source table — confirm against the schema.
x = data.filter(pl.col('has_jacobian') == -1).select(['a1', 'a2'])
# Module-level list that `poly_feature` appends every generated feature name to.
labels = []
def poly_feature(df, deg):
    """
    Build monomial features of degree 1..deg from the columns of a polars dataframe.

    For every multiset of column names of size d (1 <= d <= deg) the
    element-wise product of those columns is stored under the name obtained by
    joining the column names with "_" (for columns a1, a2 and deg=2:
    "a1", "a2", "a1_a1", "a1_a2", "a2_a2").

    Side effect: every generated feature name is also appended to the
    module-level list `labels`, so repeated calls keep accumulating names there.

    Parameters
    ----------
    df : polars dataframe whose columns are the base features.
    deg : highest total degree of the generated monomials.

    Returns
    -------
    pl.DataFrame with one column per monomial, in generation order.
    """
    # Extract each base column once as a 1-D array.  `df.select(col).to_numpy()`
    # would yield an (n, 1) matrix, and 2-D values in the dict handed to
    # `pl.DataFrame` do not end up as plain flat numeric columns.
    base = {name: df.get_column(name).to_numpy() for name in df.columns}

    result = dict()
    for d in range(1, deg + 1):
        for comb in combinations_with_replacement(df.columns, d):
            name = "_".join(comb)
            labels.append(name)
            product = base[comb[0]]
            for col in comb[1:]:
                # Out-of-place multiply: the cached base arrays must stay untouched.
                product = product * base[col]
            result[name] = product
    return pl.DataFrame(result)

# Expand the two selected columns into all monomials of degree 1 and 2.
X_poly = poly_feature(x, deg=2)
# Show the generated feature names.
print(X_poly.columns)

In [None]:
# Run PCA on the polynomial features and inspect its last component.
from sklearn.decomposition import PCA

import numpy as np
import matplotlib.pyplot as plt

# Request as many components as there are feature columns.
pca = PCA(n_components=len(X_poly.columns))
X_pca = pca.fit_transform(X_poly.to_numpy())

print("Explained variance ratio:", pca.explained_variance_ratio_)
#print("Components:", pca.components_)

# 1. Histogram of the projections onto the last PCA component
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(X_pca[:, -1], bins=30, alpha=0.7, color='blue', density=True)
ax.set_title('Histogram of Last PCA Component')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
plt.show()

# Print the last PCA component as a symbolic linear expression in the features,
# in the form  c1 * f1 + c2 * f2 + ... + ck * fk = C (Constant)
print("Last PCA component expression:")
print(
    " + ".join(
        f"{coef:.4f} * {name}"
        for coef, name in zip(pca.components_[-1], X_poly.columns)
    )
    + " = C"
)