# Initialisation et import des librairies

In [1]:
import numpy    
from sklearn.preprocessing import LabelEncoder
import GEOparse
import pandas
from sklearn.model_selection import train_test_split

import os
import shutil
import urllib.request

# Local cache of the GEO SOFT family file for series GSE13204, and where to fetch it.
SOFT_PATH = "GSE13204_family.soft.gz"
SOFT_URL = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE13nnn/GSE13204/soft/GSE13204_family.soft.gz"

if not os.path.isfile(SOFT_PATH):
    # Download under a temporary name and rename only once the transfer has
    # completed. A failed or interrupted download therefore never leaves a
    # truncated file at SOFT_PATH that the isfile() guard would mistake for a
    # valid cache on the next run.
    partial_path = SOFT_PATH + ".part"
    try:
        with urllib.request.urlopen(SOFT_URL) as response, open(partial_path, "wb") as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(partial_path, SOFT_PATH)
    finally:
        # Only true when the download failed before the rename.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# Traitement des données

In [2]:
# Parse the locally stored SOFT family file into a GEOparse series object.
# silent=True is passed through to GEOparse.get_GEO unchanged.
soft_path = "GSE13204_family.soft.gz"
gse = GEOparse.get_GEO(filepath=soft_path, silent=True)

  return parse_GSE(filepath)


In [3]:
# Samples of the series, keyed by their accession (e.g. 'GSM329407').
gsms = gse.gsms

# Preview the first ten sample accessions.
print(list(gsms)[:10])

['GSM329407', 'GSM329408', 'GSM329409', 'GSM329410', 'GSM329411', 'GSM329412', 'GSM329413', 'GSM329414', 'GSM329415', 'GSM329416']


In [4]:
# Inspect what a sample looks like: its data table, the column descriptions
# and its metadata. Note that the table comes from GSM329411 while the
# columns and metadata come from GSM329407.
print(
    gsms['GSM329411'].table,
    gsms['GSM329407'].columns,
    gsms['GSM329407'].metadata,
    sep="\n",
)

               ID_REF    VALUE_DS ABS_CALL  DETECTION P-VALUE     VALUE
0      AFFX-BioB-5_at   5614.9300        P           0.000127       NaN
1      AFFX-BioB-M_at   9707.9000        P           0.000044       NaN
2      AFFX-BioB-3_at   5331.4200        P           0.000044       NaN
3      AFFX-BioC-5_at  17928.6000        P           0.000044       NaN
4      AFFX-BioC-3_at  18722.0000        P           0.000044       NaN
...               ...         ...      ...                ...       ...
54670      1570644_at     57.2892        A           0.870361  0.058415
54671      1570645_at    127.5280        A           0.398926  0.158527
54672      1570650_at    214.8240        A           0.601074  0.247337
54673      1570651_at    488.8230        P           0.046143  0.416758
54674      1570653_at    122.2930        A           0.466064  0.152232

[54675 rows x 5 columns]
                                                         description
ID_REF                                   

In [5]:
# Label
# The second entry of 'characteristics_ch1' is read as the sample's label
# (a "leukemia class: ..." string).
# alignments: sample accession -> label string.
alignments = {
    sample_id: sample.metadata['characteristics_ch1'][1]
    for sample_id, sample in gsms.items()
}
# lise: the label of every sample, in the same order as gsms.
lise = list(alignments.values())

# Show each distinct label once (set order, so the order is arbitrary).
for label in set(lise):
    print(label)

leukemia class: Pro-B-ALL with t(11q23)/MLL
leukemia class: AML with inv(16)/t(16;16)
leukemia class: AML with t(11q23)/MLL
leukemia class: AML complex aberrant karyotype
leukemia class: c-ALL/Pre-B-ALL with t(9;22)
leukemia class: ALL with t(12;21)
leukemia class: T-ALL
leukemia class: ALL with t(1;19)
leukemia class: CLL
leukemia class: CML
leukemia class: mature B-ALL with t(8;14)
leukemia class: MDS
leukemia class: AML with t(8;21)
leukemia class: Non-leukemia and healthy bone marrow
leukemia class: c-ALL/Pre-B-ALL without t(9;22)
leukemia class: AML with normal karyotype + other abnormalities
leukemia class: AML with t(15;17)
leukemia class: ALL with hyperdiploid karyotype


In [6]:
# Assigning CEL files with their labels
#
# First print an overview of the series (sample count, platforms, number of
# complete rows in one sample), then build the feature list X and the label
# lists Y / all_labels from the samples that have the expected row count.
import numpy

pandas.set_option('display.max_rows', 10)
print("Number of patients : " + str(len(gsms)))

# Platform records of the series, keyed by platform accession.
gpls = gse.gpls
print(gpls.keys())
print(gpls['GPL570'].columns)
print(gpls['GPL570'].table)

# Not used in this cell; kept in case a later cell relies on it.
count_list = []

print("Count of genes :")
# Rows of one sample that have no missing value in any column.
df = gsms['GSM329411'].table.dropna()
print(len(df))

# Maps every fine-grained "leukemia class" label to a coarse integer class:
# 0 = ALL, 1 = AML, 2 = CLL, 3 = CML, 4 = MDS, 5 = Non-Leukemia.
equivalents = {
    "leukemia class: ALL with t(12;21)": 0,                                # ALL
    "leukemia class: c-ALL/Pre-B-ALL with t(9;22)": 0,                     # ALL
    "leukemia class: AML with normal karyotype + other abnormalities": 1,  # AML
    "leukemia class: ALL with t(1;19)": 0,                                 # ALL
    "leukemia class: CLL": 2,                                              # CLL
    "leukemia class: AML with t(15;17)": 1,                                # AML
    "leukemia class: Non-leukemia and healthy bone marrow": 5,             # Non-Leukemia
    "leukemia class: Pro-B-ALL with t(11q23)/MLL": 0,                      # ALL
    "leukemia class: ALL with hyperdiploid karyotype": 0,                  # ALL
    "leukemia class: T-ALL": 0,                                            # ALL
    "leukemia class: AML with t(8;21)": 1,                                 # AML
    "leukemia class: AML with inv(16)/t(16;16)": 1,                        # AML
    "leukemia class: MDS": 4,                                              # MDS
    "leukemia class: mature B-ALL with t(8;14)": 0,                        # ALL
    "leukemia class: c-ALL/Pre-B-ALL without t(9;22)": 0,                  # ALL
    "leukemia class: AML with t(11q23)/MLL": 1,                            # AML
    "leukemia class: CML": 3,                                              # CML
    "leukemia class: AML complex aberrant karyotype": 1,                   # AML
}

X = []           # one 1-D array of VALUE_DS values per kept sample
Y = []           # class id per kept sample, each wrapped as a 0-d numpy array
all_labels = []  # the same class ids as plain Python ints
count54 = 0      # samples kept (reported below as "STAGE 1")
count14 = 0      # samples skipped (reported below as "STAGE 2")

for sample in gsms.values():
    complete_rows = sample.table.dropna()
    # Only samples with exactly 54630 complete rows are kept, so every feature
    # vector in X has the same length; all other samples are just counted.
    if len(complete_rows["VALUE_DS"]) != 54630:
        count14 += 1
        continue

    class_id = equivalents[sample.metadata['characteristics_ch1'][1]]
    Y.append(numpy.array(class_id))
    all_labels.append(int(class_id))
    X.append(complete_rows["VALUE_DS"].to_numpy())
    count54 += 1


print("STAGE 1 DATA :" + str(count54))
print("STAGE 2 DATA :" + str(count14))
 

Number of patients : 3248
dict_keys(['GPL570', 'GPL7473'])
                                                                        description
ID                                Affymetrix Probe Set ID LINK_PRE:"https://www....
GB_ACC                            GenBank Accession Number LINK_PRE:"http://www....
SPOT_ID                                                         identifies controls
Species Scientific Name           The genus and species of the organism represen...
Annotation Date                   The date that the annotations for this probe a...
...                                                                             ...
ENTREZ_GENE_ID                    Entrez Gene Database UID LINK_PRE:"http://www....
RefSeq Transcript ID              References to multiple sequences in RefSeq. Th...
Gene Ontology Biological Process  Gene Ontology Consortium Biological Process de...
Gene Ontology Cellular Component  Gene Ontology Consortium Cellular Component de...
Gene Ontology Mol

# Création des modèles et apprentissage



In [7]:
# Turn the collected samples into arrays, compute balanced class weights,
# split into train/test sets and save the four arrays to disk.
import pickle

from sklearn.utils.class_weight import compute_class_weight

# Fixed seed so the train/test split written to disk is the same on every run.
RANDOM_SEED = 42

# Labels as a flat array (Y_step) and as an independent column vector (Y_step2).
Y_step = numpy.copy(Y)
Y_step2 = Y_step.reshape(-1, 1).copy()

# compute_class_weight takes `classes` and `y` as keyword-only arguments in
# current scikit-learn and documents `classes` as an ndarray; the keyword form
# is also accepted by older releases.
classWeight = compute_class_weight(
    class_weight='balanced',
    classes=numpy.arange(6),
    y=numpy.asarray(all_labels),
)
# class id -> weight
classWeight = dict(enumerate(classWeight))

X_step = numpy.array(X)
print("X[0] shape : "+str(X[0].shape))

train_x, test_x, train_y, test_y = train_test_split(
    X_step, Y_step2, test_size=0.33, random_state=RANDOM_SEED
)

# Save each split; pickle protocol 2 is kept from the original cell.
for pickle_name, split_array in (
    ('train_x.pickle', train_x),
    ('train_y.pickle', train_y),
    ('test_x.pickle', test_x),
    ('test_y.pickle', test_y),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(split_array, handle, protocol=2)


X[0] shape : (54630,)
