In [1]:
import pandas as pd
import numpy as np

In [2]:
# Embedding matrix, read from a whitespace-delimited text file.
# Presumably row-aligned with `filtered_dataset` (later cells split the two together) -- confirm.
filtered_embeddings = np.loadtxt("../data/embeds/filtered_emb_0.txt")

# Tabular dataset stored in Feather format; later cells read its "sequence"
# column and the columns at positions 11-14.
filtered_dataset = pd.read_feather("../data/filtered_dataset.ftr")

In [37]:
def construct_ngram_features(sequences, n_gram=2):
    """Split every sequence into its overlapping n-grams.

    Parameters
    ----------
    sequences : iterable
        Sequences to tokenise. Each item must support ``len`` and slicing, and
        its slices must be joinable with ``"".join`` (e.g. ``str`` or a list
        of ``str``).
    n_gram : int, default 2
        Window length.

    Returns
    -------
    list of list of str
        One inner list per input sequence, holding the windows
        ``sequence[i:i + n_gram]`` for every start position ``i`` in order.
        A sequence shorter than ``n_gram`` yields an empty list.
    """
    return [
        ["".join(seq[start:start + n_gram]) for start in range(len(seq) - n_gram + 1)]
        for seq in sequences
    ]

def construct_ngram_frequency_features(sequences, n_gram=2):
    """Count n-gram occurrences per sequence and pack them into a dense matrix.

    Parameters
    ----------
    sequences : iterable
        Sequences accepted by ``construct_ngram_features``.
    n_gram : int, default 2
        Length of the n-grams to count.

    Returns
    -------
    feature_matrix : numpy.ndarray
        Float array with one row per sequence and one column per n-gram.
        It has ``4 ** n_gram`` columns (the number of possible n-grams over a
        4-letter alphabet), or more if the data contain more distinct n-grams
        than that, e.g. because of an extra symbol. Columns that never receive
        an n-gram stay zero.
        Columns are assigned in order of first appearance while scanning the
        sequences, so the column order depends on the data: matrices built
        from different inputs are not column-aligned.
    n_gram_frequency_features : list of dict
        For each sequence, a mapping ``n-gram -> count``.
    """
    n_gram_features = construct_ngram_features(sequences, n_gram)

    # First pass: per-sequence counts, and a global n-gram -> column index map.
    n_gram_frequency_features = []
    ngram_ft_to_idx = {}
    for n_gram_feature in n_gram_features:
        counts = {}
        for feature in n_gram_feature:
            counts[feature] = counts.get(feature, 0) + 1
            if feature not in ngram_ft_to_idx:
                ngram_ft_to_idx[feature] = len(ngram_ft_to_idx)
        n_gram_frequency_features.append(counts)

    # Keep the historical 4 ** n_gram width, but never allocate fewer columns
    # than there are distinct n-grams; a fixed width raised IndexError as soon
    # as a fifth symbol appeared in the data.
    n_columns = max(4 ** n_gram, len(ngram_ft_to_idx))
    feature_matrix = np.zeros((len(n_gram_frequency_features), n_columns))

    # Second pass: scatter the counts into their columns.
    for row, counts in enumerate(n_gram_frequency_features):
        for ngram, freq in counts.items():
            feature_matrix[row, ngram_ft_to_idx[ngram]] = freq

    return feature_matrix, n_gram_frequency_features

In [39]:
# Bigram count matrix plus the per-sequence count dictionaries; the trailing
# expression displays the matrix shape as the cell output.
ngram_ft, d = construct_ngram_frequency_features(
    filtered_dataset["sequence"].values, n_gram=2
)
ngram_ft.shape

(47872, 16)

In [53]:
# Integer class label per row: position (0-3) of the largest value among the
# columns at positions 11-14.
# Presumably these are one-hot / indicator columns, one per cell type -- confirm.
Y = np.argmax(np.array(filtered_dataset.iloc[:, 11:15]), axis=1)

In [54]:
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.metrics import classification_report

# Candidate values for LogisticRegression's `C`. Despite the `alpha` naming,
# `C` is the INVERSE regularisation strength: larger values = weaker penalty.
alphas = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# Unigram (n_gram=1) count features, one row per sequence.
ngram_ft, _ = construct_ngram_frequency_features(filtered_dataset["sequence"].values, 1)

# NOTE: the classifier is trained on the raw counts. No Normalizer is applied:
# a normalised copy that was never fed to the split or the model would only
# suggest otherwise, so none is computed here.
X_train, X_test, y_train, y_test = train_test_split(ngram_ft, Y, test_size=0.2, random_state=42)

# Select C by mean 5-fold cross-validated micro-F1, using the training split only.
scores = [
    cross_val_score(
        LogisticRegression(C=alpha, max_iter=1000), X_train, y_train, cv=5, scoring='f1_micro'
    ).mean()
    for alpha in alphas
]
alpha = alphas[np.argmax(scores)]

# Refit on the whole training split with the selected C; report on the held-out 20%.
y_test_pred = LogisticRegression(C=alpha, max_iter=1000).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.34      0.50      0.41      2413
           1       0.31      0.11      0.16      2406
           2       0.21      0.09      0.13      2361
           3       0.32      0.55      0.40      2395

    accuracy                           0.31      9575
   macro avg       0.29      0.31      0.27      9575
weighted avg       0.30      0.31      0.27      9575



In [55]:
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.metrics import classification_report

# Candidate values for LogisticRegression's `C`. Despite the `alpha` naming,
# `C` is the INVERSE regularisation strength: larger values = weaker penalty.
alphas = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# Bigram (n_gram=2) count features, one row per sequence.
ngram_ft, _ = construct_ngram_frequency_features(filtered_dataset["sequence"].values, 2)

# NOTE: the classifier is trained on the raw counts. No Normalizer is applied:
# a normalised copy that was never fed to the split or the model would only
# suggest otherwise, so none is computed here.
X_train, X_test, y_train, y_test = train_test_split(ngram_ft, Y, test_size=0.2, random_state=42)

# Select C by mean 5-fold cross-validated micro-F1, using the training split only.
scores = [
    cross_val_score(
        LogisticRegression(C=alpha, max_iter=1000), X_train, y_train, cv=5, scoring='f1_micro'
    ).mean()
    for alpha in alphas
]
alpha = alphas[np.argmax(scores)]

# Refit on the whole training split with the selected C; report on the held-out 20%.
y_test_pred = LogisticRegression(C=alpha, max_iter=1000).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.42      0.43      0.43      2413
           1       0.44      0.50      0.47      2406
           2       0.36      0.26      0.30      2361
           3       0.41      0.45      0.43      2395

    accuracy                           0.41      9575
   macro avg       0.41      0.41      0.41      9575
weighted avg       0.41      0.41      0.41      9575



In [56]:
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.metrics import classification_report

# Candidate values for LogisticRegression's `C`. Despite the `alpha` naming,
# `C` is the INVERSE regularisation strength: larger values = weaker penalty.
alphas = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# Trigram (n_gram=3) count features, one row per sequence.
ngram_ft, _ = construct_ngram_frequency_features(filtered_dataset["sequence"].values, 3)

# NOTE: the classifier is trained on the raw counts. No Normalizer is applied:
# a normalised copy that was never fed to the split or the model would only
# suggest otherwise, so none is computed here.
X_train, X_test, y_train, y_test = train_test_split(ngram_ft, Y, test_size=0.2, random_state=42)

# Select C by mean 5-fold cross-validated micro-F1, using the training split only.
scores = [
    cross_val_score(
        LogisticRegression(C=alpha, max_iter=1000), X_train, y_train, cv=5, scoring='f1_micro'
    ).mean()
    for alpha in alphas
]
alpha = alphas[np.argmax(scores)]

# Refit on the whole training split with the selected C; report on the held-out 20%.
y_test_pred = LogisticRegression(C=alpha, max_iter=1000).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.51      0.52      0.52      2413
           1       0.54      0.55      0.55      2406
           2       0.48      0.45      0.46      2361
           3       0.52      0.53      0.53      2395

    accuracy                           0.51      9575
   macro avg       0.51      0.51      0.51      9575
weighted avg       0.51      0.51      0.51      9575



In [57]:
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.metrics import classification_report

# Candidate values for LogisticRegression's `C`. Despite the `alpha` naming,
# `C` is the INVERSE regularisation strength: larger values = weaker penalty.
alphas = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# 4-gram (n_gram=4) count features, one row per sequence.
ngram_ft, _ = construct_ngram_frequency_features(filtered_dataset["sequence"].values, 4)

# NOTE: the classifier is trained on the raw counts. No Normalizer is applied:
# a normalised copy that was never fed to the split or the model would only
# suggest otherwise, so none is computed here.
X_train, X_test, y_train, y_test = train_test_split(ngram_ft, Y, test_size=0.2, random_state=42)

# Select C by mean 5-fold cross-validated micro-F1, using the training split only.
scores = [
    cross_val_score(
        LogisticRegression(C=alpha, max_iter=1000), X_train, y_train, cv=5, scoring='f1_micro'
    ).mean()
    for alpha in alphas
]
alpha = alphas[np.argmax(scores)]

# Refit on the whole training split with the selected C; report on the held-out 20%.
y_test_pred = LogisticRegression(C=alpha, max_iter=1000).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.66      0.68      0.67      2413
           1       0.62      0.62      0.62      2406
           2       0.62      0.62      0.62      2361
           3       0.63      0.61      0.62      2395

    accuracy                           0.63      9575
   macro avg       0.63      0.63      0.63      9575
weighted avg       0.63      0.63      0.63      9575



In [59]:
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score


from sklearn.metrics import classification_report

# Candidate values for LogisticRegression's `C` (inverse regularisation
# strength: larger values = weaker penalty).
alphas = [0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000]

# Per-row normalisation of the embeddings with Normalizer's default settings;
# unlike the n-gram cells, the normalised matrix is what gets split and fitted.
feats = Normalizer().fit_transform(filtered_embeddings)

X_train, X_test, y_train, y_test = train_test_split(feats, Y, test_size=0.2, random_state=42)

# Select C by mean 5-fold cross-validated micro-F1, using the training split only.
scores = [
    cross_val_score(
        LogisticRegression(C=alpha, max_iter=1000), X_train, y_train, cv=5, scoring='f1_micro'
    ).mean()
    for alpha in alphas
]
alpha = alphas[np.argmax(scores)]

# Refit on the whole training split with the selected C; report on the held-out 20%.
y_test_pred = LogisticRegression(C=alpha, max_iter=1000).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.52      0.50      0.51      2413
           1       0.54      0.55      0.55      2406
           2       0.53      0.53      0.53      2361
           3       0.52      0.53      0.52      2395

    accuracy                           0.53      9575
   macro avg       0.53      0.53      0.53      9575
weighted avg       0.53      0.53      0.53      9575



In [3]:
import pickle as pkl
# Load the mapping that is later indexed by the label-column names and compared
# against "Cancer". The context manager closes the file handle, which a bare
# `open(...)` passed straight to `pkl.load` never does.
# SECURITY: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
with open("../data/biosample_type_map.pkl", "rb") as map_file:
    biosample_type_map = pkl.load(map_file)

In [4]:
# Label block: the columns at positions 11-14 of the dataset (kept as a DataFrame here).
Y = filtered_dataset.iloc[:, 11:15]

# Names of those same four columns; each is looked up in biosample_type_map.
cells = filtered_dataset.columns[11:15]

# Boolean mask over the four columns: True where the mapped biosample type is "Cancer".
is_cancer = np.array([biosample_type_map[name] == "Cancer" for name in cells])

# Per-row dot product of the label values with the mask, i.e. the sum of each
# row's values over the Cancer columns.
is_cancer_cells = Y.dot(is_cancer)

In [5]:
# Total of the per-row Cancer dot products; if the label columns are 0/1
# indicators (presumably -- confirm) this is the number of rows whose cell type
# is mapped to "Cancer".
is_cancer_cells.sum()

23936

In [6]:
from sklearn.linear_model import LogisticRegression

In [9]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# Integer class label per row: argmax over the label columns at positions 11-14.
# Derived from `filtered_dataset` rather than from the previous value of `Y`
# so that the cell is idempotent: `Y = np.array(Y).argmax(axis=1)` raises an
# AxisError when the cell is run a second time, because Y is then already 1-D.
Y = np.array(filtered_dataset.iloc[:, 11:15]).argmax(axis=1)

# Hold out 15% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(filtered_embeddings, Y, test_size=0.15, random_state=42)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:

# L1-penalised logistic regression on the training embeddings, fitted with the
# liblinear solver, C=0.1 and an iteration cap of 1000.
model = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    C=0.1,
    max_iter=1000,
)
model.fit(X_train, y_train)

In [13]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision / recall / F1.
y_test_pred = model.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.53      0.51      0.52      1837
           1       0.54      0.58      0.56      1765
           2       0.52      0.49      0.51      1780
           3       0.52      0.51      0.51      1799

    accuracy                           0.52      7181
   macro avg       0.52      0.52      0.52      7181
weighted avg       0.52      0.52      0.52      7181



In [10]:
# Row indices of the embedding matrix in random order.
# A locally seeded generator makes the permutation reproducible across kernel
# restarts (the other splits in this notebook use random_state=42), and it does
# not touch NumPy's global random state.
idx = np.arange(len(filtered_embeddings))
np.random.default_rng(42).shuffle(idx)