In [1]:
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE99nnn/GSE99039/matrix/GSE99039_series_matrix.txt.gz

--2026-02-06 08:14:52--  https://ftp.ncbi.nlm.nih.gov/geo/series/GSE99nnn/GSE99039/matrix/GSE99039_series_matrix.txt.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.7, 130.14.250.10, 130.14.250.11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63827685 (61M) [application/x-gzip]
Saving to: ‘GSE99039_series_matrix.txt.gz.1’


2026-02-06 08:15:00 (9.17 MB/s) - ‘GSE99039_series_matrix.txt.gz.1’ saved [63827685/63827685]



In [2]:
import pandas as pd
import numpy as np
import gzip
from io import StringIO


In [3]:
def load_geo_series_matrix(path):
    """Read the data table embedded in a gzipped GEO series-matrix file.

    The file is scanned line by line. Only the lines between the
    ``!series_matrix_table_begin`` and ``!series_matrix_table_end`` markers are
    kept, and reading stops at the end marker, so the header lines before the
    table are never held in memory together with the table.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip-compressed series-matrix text file.

    Returns
    -------
    pandas.DataFrame
        The tab-separated table between the two markers, with its first
        column used as the index.

    Raises
    ------
    ValueError
        If either marker is missing, or the end marker appears before the
        begin marker.
    """
    table_lines = []
    seen_begin = False
    seen_end = False

    with gzip.open(path, "rt") as handle:
        for line in handle:
            if line.startswith("!series_matrix_table_begin"):
                # A (repeated) begin marker restarts collection from the next line.
                seen_begin = True
                table_lines = []
            elif line.startswith("!series_matrix_table_end"):
                seen_end = True
                break  # nothing after the end marker is needed
            elif seen_begin:
                table_lines.append(line)

    if not (seen_begin and seen_end):
        raise ValueError("Series matrix table not found")

    return pd.read_csv(StringIO("".join(table_lines)), sep="\t", index_col=0)


In [4]:
# Gzipped GEO series-matrix file to analyse.
# NOTE(review): this is an absolute path, while the download cell saves into the
# current working directory; the two only agree when the notebook is run from
# /workspaces/Reasearch_Genomic. Confirm before running elsewhere.
path = "/workspaces/Reasearch_Genomic/GSE99039_series_matrix.txt.gz"

# Parse the embedded data table and report its dimensions as loaded.
df = load_geo_series_matrix(path)
n_rows, n_cols = df.shape
print("Original shape:", (n_rows, n_cols))


Original shape: (54675, 558)


In [5]:
# Swap the axes so that the columns of `df` become the rows of the feature matrix.
X = df.transpose()
print("After transpose:", X.shape)


After transpose: (558, 54675)


In [6]:
# FIXME(review): these labels are assigned purely by row position -- the first
# half of the rows get 1.0 and the rest 0.0. Nothing here reads the sample
# annotations in the series-matrix file, so unless the rows happen to be ordered
# by class, every downstream model and metric is fitted to arbitrary targets.
# Replace with labels derived from the sample metadata before trusting results.
n_samples = X.shape[0]
y = (np.arange(n_samples) < n_samples // 2).astype(float)


In [7]:
# Force every column to a numeric dtype: values that cannot be parsed become NaN
# (errors="coerce"), and every NaN is then replaced by 0.
X = X.apply(pd.to_numeric, errors="coerce").fillna(0)

# Sanity check: list the distinct column dtypes left after coercion.
print(X.dtypes.unique())


[dtype('float64')]


In [8]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the scaler statistics are learned from all
# rows of X and then applied to those same rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Feature selection must not see the samples that are later held out for
# validation; otherwise the labels of the validation rows leak into the choice of
# genes and the reported validation accuracy is optimistically biased.
#
# train_test_split is deterministic for a given number of samples, stratification
# labels, test_size and random_state, so splitting the row positions with the same
# settings as the ANN train/validation split further down (test_size=0.2,
# random_state=42, stratify=y) yields the same partition. Keep the two in sync.
# NOTE(review): X_scaled was standardised with statistics from all rows, so a
# small label-free leakage through the scaler remains.
fs_train_idx, fs_val_idx = train_test_split(
    np.arange(X_scaled.shape[0]),
    test_size=0.2,
    random_state=42,
    stratify=y,
)

lasso = LogisticRegression(
    penalty="l1",
    solver="saga",
    max_iter=5000,
    C=1.0,
    random_state=42,  # saga shuffles the data; seed it so the selected genes are reproducible
)

# Fit the L1-penalised model on the training rows only.
lasso.fit(X_scaled[fs_train_idx], y[fs_train_idx])




In [None]:
# One coefficient per feature column of X; its absolute value is used as the
# importance score.
genes = X.columns.to_numpy()
coef = lasso.coef_[0]
importance = np.abs(coef)

lasso_df = pd.DataFrame(
    {"Gene": genes, "Coefficient": coef, "Importance": importance}
)

# Keep only features with a non-zero coefficient, then take the 32 with the
# largest absolute coefficient.
nonzero = lasso_df.loc[lasso_df["Importance"].gt(0)]
top_32 = nonzero.sort_values(by="Importance", ascending=False).head(32)

top_32


In [None]:
# Write the selected-feature table to disk without the DataFrame index column.
top_32.to_csv(path_or_buf="GSE99039_Top32_LASSO.csv", index=False)


In [None]:
# Restrict the (unscaled) feature matrix to the columns chosen by the LASSO step.
selected_genes = top_32["Gene"].to_numpy()
X_lasso = X.loc[:, selected_genes]


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation, keeping the class proportions of y
# in both parts; the fixed random_state makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_lasso, y, **split_options)


In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both the training and the validation rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


In [None]:
from tensorflow.keras import Input
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and dropout masks -- and therefore the reported metrics -- are
# repeatable between runs.
set_random_seed(42)

# Small fully connected binary classifier: two hidden ReLU layers with dropout
# and a single sigmoid output unit.
model = Sequential([
    # Declare the input width with an explicit Input layer rather than the
    # `input_shape=` argument on the first Dense layer, which newer Keras
    # releases warn about.
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation="relu"),
    Dropout(0.3),

    Dense(32, activation="relu"),
    Dropout(0.3),

    Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [None]:
# Train for 50 epochs in mini-batches of 16, scoring the validation split after
# every epoch; the returned History object feeds the curves plotted below.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=50,
    verbose=1,
    validation_data=(X_val, y_val),
)


In [None]:
# Score the trained network on the validation split; the result is unpacked as
# the loss followed by the accuracy metric.
val_metrics = model.evaluate(X_val, y_val, verbose=0)
loss, accuracy = val_metrics

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation Loss: {loss:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots()
for key, label in (("loss", "Training Loss"), ("val_loss", "Validation Loss")):
    ax.plot(history.history[key], label=label)
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.set_title("ANN Loss Curve (GSE99039)")
ax.legend()
plt.show()


In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots()
for key, label in (
    ("accuracy", "Training Accuracy"),
    ("val_accuracy", "Validation Accuracy"),
):
    ax.plot(history.history[key], label=label)
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("ANN Accuracy Curve (GSE99039)")
ax.legend()
plt.show()
