# Simple models

In [1]:
import os.path
import yaml

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import sleep_models
import sleep_models.models.torch.utils.data as data_utils
from sleep_models.models.torch.variables import ModelProperties

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [5]:
BACKGROUND = "glia"
CLUSTER = "EGN++"
MODEL_PROPERTIES = ModelProperties(target="Condition", task="CLASSIFIER", encoding="ONE_HOT")
SEED = 1000
FRACTION = 1.0
TRIM=False

In [6]:
def load_data(background, cluster, model_properties, seed, fraction, trim=False):
    h5ad_input = f"../results/{BACKGROUND}-data/{BACKGROUND}-no-marker-genes.h5ad"
    output_root = f"../results/{BACKGROUND}-models/RF/"

    output = os.path.join(output_root, f"random_state_{seed}_fraction_{fraction}")
    label_mapping_file = "../data/templates/simple_condition_mapping.yaml"

    with open(label_mapping_file, "r") as filehandle:
        label_mapping = yaml.load(filehandle, yaml.SafeLoader)
    
    data = data_utils.load_data(
        h5ad_input,
        output = output,
        cluster=cluster,
        seed=seed,
        highly_variable_genes=False,
        model_properties = MODEL_PROPERTIES,
        label_mapping = label_mapping,
        trim=trim,
    )

    return data

In [7]:
data = load_data(BACKGROUND, CLUSTER, MODEL_PROPERTIES, SEED, FRACTION, trim=TRIM)
encoding = data["encoding"]

  dispersion = np.log(dispersion)


y has 4 columns


In [8]:
encoding

{0: 'SD', 1: 'SD++', 2: 'drug', 3: 'sleep'}

In [9]:
data["datasets"][2].std(0).shape

(952,)

In [10]:
def reencode_y(X_train, y_train, X_test, y_test):
    y_train = y_train[:,:-1]
    y_test = y_test[:,:-1]
    return (X_train, y_train, X_test, y_test)
    
    

In [11]:
def random_forest_pipeline(X_train, y_train, X_test, y_test):
    random_forest = RandomForestClassifier(class_weight="balanced", n_estimators=X_train.shape[1], n_jobs=10)
    random_forest.fit(X_train, y_train)
    _, weights = random_forest._validate_y_class_weight(y_train)
    
    test_accuracy = random_forest.score(X_test, y_test)
    print(f"Test accuracy: {test_accuracy:.3f}")
    predictions = random_forest.predict(X_test).argmax(1)
    return predictions

In [12]:
def svm_pipeline(X_train, y_train, X_test, y_test):
    
    
    # https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
    # Make scale invariant
    scale = X_train.std(axis=0)
    X_train /= scale
    X_test /= scale
    # increase RAM to 2GB
    CACHE_SIZE=2000
    # increase regularization
    C=.001

    svm = SVC(decision_function_shape='ovo', cache_size=CACHE_SIZE, C=C)
    svm.fit(X_train, y_train.argmax(1))
    test_accuracy = svm.score(X_test, y_test.argmax(1))
    print(f"Test accuracy: {test_accuracy:.3f}")
    predictions = svm.predict(X_test)
    return predictions
    

In [13]:
def nearest_neighbors_pipeline(X_train, y_train, X_test, y_test):
    
    nearest_neighbors = KNeighborsClassifier()
    nearest_neighbors.fit(X_train, y_train.argmax(1))
    
    test_accuracy = nearest_neighbors.score(X_test, y_test.argmax(1))
    print(f"Test accuracy: {test_accuracy:.3f}")
    predictions = nearest_neighbors.predict(X_test)
    return predictions
      

In [168]:
# datasets=reencode_y(*data["datasets"])

In [130]:
predictions = random_forest_pipeline(*data["datasets"])

Test accuracy: 0.402


array([3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 0, 0, 3, 0, 3, 3, 3, 3, 3, 0, 0,
       3, 0, 0, 3, 3, 3, 3, 0, 0, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3, 0,
       3, 3, 3, 0, 3, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 3, 3, 3, 3,
       0, 3, 3, 0, 3, 3, 3, 0, 3, 3, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 3, 3, 3, 0, 0, 3])

In [172]:
predictions = svm_pipeline(*data["datasets"])

Test accuracy: 0.504


array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3])

In [14]:
predictions = nearest_neighbors_pipeline(*data["datasets"])

Test accuracy: 0.444


array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3,
       3, 3, 1, 3, 3, 1, 3, 1, 3, 0, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 1, 1, 3, 3, 3, 3, 3, 0,
       3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1, 0, 3,
       3, 3, 3, 1, 3, 0, 3])

In [175]:
encoding

{0: 'SD', 1: 'SD++', 2: 'drug', 3: 'sleep'}