In [1]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE99nnn/GSE99039/matrix/GSE99039_series_matrix.txt.gz

--2026-02-04 08:49:15--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE99nnn/GSE99039/matrix/GSE99039_series_matrix.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.31, 130.14.250.7, 130.14.250.10, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.31|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63827685 (61M) [application/x-gzip]
Saving to: ‘GSE99039_series_matrix.txt.gz’


2026-02-04 08:49:24 (8.28 MB/s) - ‘GSE99039_series_matrix.txt.gz’ saved [63827685/63827685]



In [26]:
import pandas as pd
import numpy as np
import gzip
from io import StringIO


In [27]:
file_path = "/workspaces/Reasearch_Genomic/GSE99039_series_matrix.txt.gz"

with gzip.open(file_path, "rt") as f:
    lines = f.readlines()


In [28]:
start = lines.index("!series_matrix_table_begin\n") + 1
end = lines.index("!series_matrix_table_end\n")

data = "".join(lines[start:end])
df = pd.read_csv(StringIO(data), sep="\t")


In [29]:
df = df.set_index(df.columns[0])  # Gene IDs
X = df.T                          # samples as rows


In [30]:
X.shape


(558, 54675)

In [31]:
labels = []
for line in lines:
    if line.startswith("!Sample_characteristics_ch1"):
        labels = line.strip().split("\t")[1:]


In [32]:
print("Number of samples:", X.shape[0])
print("Number of labels:", len(labels))
print("Unique labels:")
print(set(labels))


Number of samples: 558
Number of labels: 558
Unique labels:
{'""', '"moca score: 0"'}


In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [34]:
y_proxy = X_scaled.mean(axis=1)


In [35]:
from sklearn.linear_model import Lasso

lasso = Lasso(
    alpha=0.001,      # small alpha → avoids all-zero coefficients
    max_iter=10000
)

lasso.fit(X_scaled, y_proxy)


0,1,2
,"alpha  alpha: float, default=1.0 Constant that multiplies the L1 term, controlling regularization strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. When `alpha = 0`, the objective is equivalent to ordinary least squares, solved by the :class:`LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Lasso` object is not advised. Instead, you should use the :class:`LinearRegression` object.",0.001
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"precompute  precompute: bool or array-like of shape (n_features, n_features), default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity.",False
,"copy_X  copy_X: bool, default=True If ``True``, X will be copied; else, it may be overwritten.",True
,"max_iter  max_iter: int, default=1000 The maximum number of iterations.",10000
,"tol  tol: float, default=1e-4 The tolerance for the optimization: if the updates are smaller or equal to ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller or equal to ``tol``, see Notes below.",0.0001
,"warm_start  warm_start: bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `.",False
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive.",False
,"random_state  random_state: int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.",
,"selection  selection: {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4.",'cyclic'


In [36]:
coef_df = pd.DataFrame({
    "Gene": X.columns,
    "Coefficient": lasso.coef_
})


In [37]:
coef_df = coef_df[coef_df["Coefficient"] != 0]


In [38]:
top32 = coef_df.reindex(
    coef_df.Coefficient.abs().sort_values(ascending=False).index
).head(32)

top32


Unnamed: 0,Gene,Coefficient
54620,AFFX-CreX-3_at,-0.011403
35272,226016_at,-0.006884
54628,AFFX-hum_alu_at,0.004393
15214,205767_at,0.004125
12311,202863_at,-0.004046
24354,215060_at,0.003882
31774,222495_at,-0.00354
15877,206431_x_at,0.003163
50036,240787_at,0.002742
6242,1561786_at,0.002481


In [39]:
top32.to_csv("GSE99039_Top32_Lasso_Genes.csv", index=False)


do the ann and accuracy and loss garph
