In [None]:
import polars as pl
import numpy as np
#from lmf import db

# The following two imports are required when running in a plain Python (non-Sage) environment.
# Comment them out when running inside Sage.
from sage.arith.misc import primes_first_n 
from sage.all import libgap


In [None]:
def get_data(d, target, target_args=None):
  """Load the degree-``d`` Artin representation table and build (X, y).

  Reads ``artin_reps_d{d}_v2.csv`` (path relative to the working directory).

  Parameters
  ----------
  d : degree of the Artin representations; selects which CSV file is read.
  target : one of 'Is_Even', 'Proj', or 'Image'.
  target_args : additional argument for the target.
      - 'Is_Even': not used.
      - 'Proj': GAP id as a pair (id1, id2), compared with the ``GAP_1`` and
        ``GAP_2`` columns.
      - 'Image': group name (3T2, 4T1, etc.), compared with the ``Image``
        column.

  Returns
  -------
  X : numpy array holding the ``a_{p:03d}`` columns for every p in
      ``primes_first_n(168)``.
  y : flattened numpy array: the ``Is_Even`` column, or the row-wise result of
      the 'Proj' / 'Image' comparison.

  Raises
  ------
  ValueError : if ``target`` is not recognised, or if ``target_args`` is
      missing or malformed for a target that needs it.
  """
  if target not in ['Is_Even', 'Proj', 'Image']:
    raise ValueError("target must be one of 'Is_Even', 'Proj', or 'Image'")

  # Validate target_args before reading the CSV so a bad call fails fast with
  # a message that names the actual problem.
  if target == 'Proj':
    try:
      id1, id2 = target_args
    except (TypeError, ValueError):
      raise ValueError(
        "target_args must be a GAP id pair (id1, id2) when target is 'Proj'"
      ) from None
  elif target == 'Image' and target_args is None:
    raise ValueError(
      "target_args must be a group name (e.g. '3T2') when target is 'Image'"
    )

  data = pl.read_csv(f"artin_reps_d{d}_v2.csv", schema_overrides={"Conductor": pl.Int128})

  columns_ = [f'a_{p:03d}' for p in primes_first_n(168)]
  X = data.select(columns_).to_numpy()

  if target == 'Is_Even':
    y = data.select("Is_Even").to_numpy().ravel()
  elif target == 'Proj':
    y = data.select((pl.col('GAP_1') == id1) & (pl.col('GAP_2') == id2)).to_numpy().ravel()
  else:  # target == 'Image'
    y = data.select(pl.col('Image') == target_args).to_numpy().ravel()

  return X, y

In [None]:
def ap_onehot(x, use_existing_values=False):
  """One-hot encode every a_p column of ``x`` over one shared list of values.

  Parameters
  ----------
  x : 2-D numpy array whose first columns correspond, in order, to the primes
      in ``primes_first_n(168)``; any further columns are ignored.
  use_existing_values : if true, the encoded values are exactly the distinct
      entries of ``x``. If false, they are the integers from -max to max with
      max = max(abs(x)); when max >= 3 the values +-(max - 1) are left out,
      i.e. -max, -max+2, -max+3, ..., max-3, max-2, max.

  Returns
  -------
  x_one_hot : int array of shape (rows of x, n_primes * n_values). Column
      ``i * n_values + j`` is 1 where the i-th prime's entry equals the j-th
      value and 0 elsewhere.
  feature_names : list of ``"a_{p}:{v}"`` strings in the same column order.
  """
  if use_existing_values:
    x_vals = np.unique(x)  # np.unique already returns the values sorted
  else:
    max_val = np.abs(x).max()
    x_vals = list(range(-max_val, max_val + 1))
    # Only drop +-(max_val - 1) when max_val >= 3. Popping by position for
    # max_val == 1 removed 0 and +1 (leaving just [-1]) and raised IndexError
    # for max_val == 0, so small ranges are now kept whole.
    if max_val > 2:
      x_vals.remove(max_val - 1)
      x_vals.remove(-(max_val - 1))

  primes = primes_first_n(168)  # computed once instead of once per use
  n_primes, n_vals = len(primes), len(x_vals)

  x_one_hot = np.zeros((x.shape[0], n_primes * n_vals), dtype=int)
  ap_columns = x[:, :n_primes]
  for j, v in enumerate(x_vals):
    # Columns j, j + n_vals, j + 2 * n_vals, ... are the indicator of value v
    # for primes 0, 1, 2, ...; fill them all with one vectorised comparison.
    x_one_hot[:, j::n_vals] = (ap_columns == v)

  feature_names = [f"a_{p}:{v}" for p in primes for v in x_vals]
  return x_one_hot, feature_names

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [None]:
def draw_distribution(x, y, labels=None):
  """Plot, for each a_p value, the per-row count of that value, split by class.

  For every plotted value v, each row of ``x`` contributes the number of its
  entries equal to v. The counts are drawn as two overlaid density histograms:
  blue for rows with ``y == 1`` and red for rows with ``y == 0``.

  Layout (3 rows of panels):
    - first row  : positive values 1, 2, ..., max_val - 2, max_val
    - second row : 0 (only the first panel is used)
    - third row  : the negatives of the first row
  where max_val = max(abs(x)). When max_val <= 2 no value is skipped.

  Parameters
  ----------
  x : 2-D numpy array of a_p values, one row per sample.
  y : 1-D array aligned with the rows of ``x``; 1/True marks the first class,
      0/False the second.
  labels : optional sequence; ``labels[0]`` is the legend entry for ``y == 1``
      and ``labels[1]`` the entry for ``y == 0``. No legend is drawn when None.
  """
  max_val = int(abs(x).max())

  if max_val > 2:
    pos_vals = list(range(1, max_val - 1)) + [max_val]
  else:
    # Previously max_val == 2 crashed (single-column axes were squeezed to 1-D)
    # and max_val <= 1 requested zero columns; show every value instead.
    pos_vals = list(range(1, max_val + 1))
  n_cols = max(len(pos_vals), 1)  # the a_p=0 panel always needs one column

  # squeeze=False keeps `axes` 2-D even when there is only one column.
  fig, axes = plt.subplots(3, n_cols, figsize=(n_cols * 4, 9), squeeze=False)

  x_pos = x[y == 1]
  x_neg = x[y == 0]

  # label=None is hist's default, so one code path serves both cases.
  pos_label = labels[0] if labels is not None else None
  neg_label = labels[1] if labels is not None else None

  def _hist_pair(ax, value):
    # Overlay both classes' per-row counts of `value` on one panel.
    ax.hist((x_pos == value).sum(axis=1), bins='auto', color='blue', alpha=0.5, density=True, label=pos_label)
    ax.hist((x_neg == value).sum(axis=1), bins='auto', color='red', alpha=0.5, density=True, label=neg_label)
    ax.set_title(f'Count of a_p={value}')

  _hist_pair(axes[1, 0], 0)
  for col, v in enumerate(pos_vals):
    _hist_pair(axes[0, col], v)
    _hist_pair(axes[2, col], -v)

  # Only the first panel of the middle row carries data; hide the empty ones.
  for unused_ax in axes[1, 1:]:
    unused_ax.set_visible(False)

  # Legend at the top of the whole figure (skipped when no labels were given).
  # A separate name is used so the `labels` parameter is not shadowed.
  handles, legend_labels = axes[1, 0].get_legend_handles_labels()
  if handles:
    fig.legend(handles, legend_labels, loc='upper center', ncol=2)

In [None]:
# Legend labels, ordered (y == 1, y == 0) as draw_distribution reads them.
# See analyse_count.ipynb for existing labels
class_names = ['Is 4T5', "Isn't 4T5"]

# Degree-3 representations; y flags the rows whose Image column equals '4T5'.
x, y = get_data(d=3, target='Image', target_args='4T5')

# Split of the raw features; the one-hot matrix is split in a later cell.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# One-hot features over the full symmetric value range, not just observed values.
x_one_hot, feature_names = ap_onehot(x, use_existing_values=False)

draw_distribution(x, y, labels=class_names)

In [None]:
# Decision tree with raw x, y
# fit() returns the estimator itself, so construction and training are chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Each heading is followed by its report on the next line.
print(
    "Decision Tree Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

In [None]:
# plot the tree
# plot_tree expects class names in the order of clf.classes_ (ascending), but
# class_names is ordered (positive, negative) the way draw_distribution reads
# it. Passing it unchanged labels every node with the opposite class, so the
# list is rebuilt from clf.classes_ here.
plt.figure(figsize=(20,10))
plot_tree(
    clf,
    filled=True,
    feature_names=[f'a_{p:03d}' for p in primes_first_n(168)],
    class_names=[class_names[0] if cls else class_names[1] for cls in clf.classes_],
);

In [None]:
# Logistic Regression with raw x, y
# max_iter is raised to 1000; fit() returns the estimator, so the call is chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_logreg = log_reg.predict(X_test)

print(
    "Logistic Regression Classification Report:",
    classification_report(y_test, y_pred_logreg),
    sep="\n",
)
print(
    "Logistic Regression Confusion Matrix:",
    confusion_matrix(y_test, y_pred_logreg),
    sep="\n",
)

In [None]:
# Print the coefficients, ordered by abs value
coefficients = log_reg.coef_[0]  # first (only) row of the coefficient matrix

coef_df = (
    pl.DataFrame({
        'Feature': [f'a_{p:03d}' for p in primes_first_n(168)],
        'Coefficient': coefficients,
    })
    # Temporary sort key: largest-magnitude coefficients first.
    .with_columns(pl.col('Coefficient').abs().alias('Abs_Coefficient'))
    .sort('Abs_Coefficient', descending=True)
    .drop('Abs_Coefficient')
)

print(coef_df)

In [None]:
# Decision tree with one-hot encoded x, y
# Same test_size and random_state as the raw-feature split.
x_train, x_test, y_train, y_test = train_test_split(
    x_one_hot,
    y,
    test_size=0.3,
    random_state=42,
)

# Note: this rebinds clf to the tree trained on the one-hot features.
clf = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred = clf.predict(x_test)

print(
    "Decision Tree (One-Hot Encoded) Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

In [None]:
# plot the tree
# plot_tree expects class names in the order of clf.classes_ (ascending), but
# class_names is ordered (positive, negative) the way draw_distribution reads
# it. Passing it unchanged labels every node with the opposite class, so the
# list is rebuilt from clf.classes_ here.

plt.figure(figsize=(20,10))
plot_tree(
    clf,
    filled=True,
    feature_names=feature_names,
    class_names=[class_names[0] if cls else class_names[1] for cls in clf.classes_],
);


In [None]:
# Logistic Regression with one-hot encoded x
# Note: this rebinds log_reg to the model trained on the one-hot features.
log_reg = LogisticRegression(max_iter=1000).fit(x_train, y_train)
y_pred_logreg = log_reg.predict(x_test)

print(
    "Logistic Regression (One-Hot Encoded) Classification Report:",
    classification_report(y_test, y_pred_logreg),
    sep="\n",
)
print(
    "Logistic Regression (One-Hot Encoded) Confusion Matrix:",
    confusion_matrix(y_test, y_pred_logreg),
    sep="\n",
)

In [None]:
# print the coefficients, ordered by abs value
coefficients = log_reg.coef_[0]  # first (only) row of the coefficient matrix

coef_df = (
    pl.DataFrame({
        'Feature': feature_names,
        'Coefficient': coefficients,
    })
    # Temporary sort key: largest-magnitude coefficients first.
    .with_columns(pl.col('Coefficient').abs().alias('Abs_Coefficient'))
    .sort('Abs_Coefficient', descending=True)
    .drop('Abs_Coefficient')
)
print(coef_df)