In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneOut, GridSearchCV, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif, RFE, SelectKBest, f_classif
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

### Lectura de Datos

In [2]:
# Load the expression table. DATA_PATH is kept as a plain string with a
# trailing slash so that file names can simply be appended to it.
DATA_PATH = '/data/sespinoza/objetivo2/'
data = pd.read_csv(
    f'{DATA_PATH}normalized_expression_mapped_peru.csv',
    sep=',',
    header=0,          # first row holds the column names
    index_col=0,       # first column becomes the row index
    encoding='utf-8',
)

### Manejo de Nulos y Duplicados

In [3]:
# Total number of missing cells in the whole table:
# per-column counts first, then the grand total.
print(
    data.isna()
        .sum()
        .sum()
)

0


In [4]:
# Drop every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [5]:
# Preview the first five rows of the table.
data.head(5)

Unnamed: 0,GSM4043276_HCC004_HTA_2_0.CEL,GSM4043277_HCC011_HTA_2_0.CEL,GSM4043278_HCC013_HTA_2_0.CEL,GSM4043279_HCC015_HTA_2_0.CEL,GSM4043280_HCC016_HTA_2_0.CEL,GSM4043281_HCC025_HTA_2_0.CEL,GSM4043282_HCC026_HTA_2_0.CEL,GSM4043283_HCC034_HTA_2_0.CEL,GSM4043284_HCC047_HTA_2_0.CEL,GSM4043285_HCC054_HTA_2_0.CEL,...,GSM4043336_NTL121_HTA_2_0.CEL,GSM4043337_NTL173_HTA_2_0.CEL,GSM4043338_NTL188_HTA_2_0.CEL,GSM4043339_NTL190_HTA_2_0.CEL,GSM4043340_NTL191_HTA_2_0.CEL,GSM4043341_NTL192_HTA_2_0.CEL,GSM4043342_NTL193_HTA_2_0.CEL,GSM4043343_NTL200_HTA_2_0.CEL,GSM4043344_NTL201_HTA_2_0.CEL,GeneSymbol
TC01000001.hg.1,5.556382,5.385476,5.582431,5.321239,5.58015,5.090377,5.543542,5.815775,5.781128,5.462648,...,5.747558,5.685633,5.678107,5.479305,5.159973,5.572072,5.586533,5.509569,5.263971,DDX11L1
TC01000003.hg.1,3.522511,3.011026,3.754522,3.216142,3.343459,3.151042,3.232807,3.496659,3.271508,3.475425,...,3.641678,3.554388,3.807339,3.292701,3.069621,3.794532,3.673249,3.42312,3.46347,OR4F5
TC01000005.hg.1,8.388213,8.461633,8.107028,8.165957,8.615982,8.331395,8.491633,8.543905,8.646759,8.20083,...,8.313295,8.007746,8.12747,8.079236,8.439039,8.140706,8.413719,8.269379,8.615382,LINC01001
TC01000007.hg.1,11.313075,11.423049,11.083901,11.134547,11.38856,11.209667,11.24979,11.319392,11.483748,11.555652,...,11.50085,10.496035,11.170733,10.906176,11.196305,11.003916,11.591072,11.292875,11.805233,LINC01061
TC01000009.hg.1,3.510059,3.054648,3.632833,3.064161,3.315992,2.796682,3.583861,2.924338,3.181977,2.668511,...,3.06149,3.128445,3.478542,3.024672,2.822522,3.299731,2.986613,3.001979,3.001183,OR4F29


In [6]:
# Keep only the first row seen for each GeneSymbol; later rows that repeat
# an already-seen symbol are discarded.
data = data[~data.duplicated(subset='GeneSymbol', keep='first')]

### Generar etiqueta a predecir

In [7]:
# Use the gene symbol as the row index (the GeneSymbol column is consumed,
# so only the per-sample columns remain as columns).
data = data.set_index('GeneSymbol')

In [8]:
def _label_from_column(col):
    """Map a column name to a class label based on the tag it contains.

    'HCC' is checked first, then 'NTL'; names containing neither tag are
    labelled 'unknown'.
    """
    if 'HCC' in col:
        return 'cancer'
    if 'NTL' in col:
        return 'healthy'
    return 'unknown'


# One label per column of `data`, in column order.
labels_ma = [_label_from_column(col) for col in data.columns]
print(labels_ma)

['cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'cancer', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy']


In [9]:
# Transpose so that the former columns become rows.
data = data.transpose()
# Attach the labels as a new column; labels_ma is matched to the rows by
# position, so it must have one entry per row in the same order.
data['labels'] = labels_ma
# Show the assigned labels.
print(data.loc[:, 'labels'])

GSM4043276_HCC004_HTA_2_0.CEL     cancer
GSM4043277_HCC011_HTA_2_0.CEL     cancer
GSM4043278_HCC013_HTA_2_0.CEL     cancer
GSM4043279_HCC015_HTA_2_0.CEL     cancer
GSM4043280_HCC016_HTA_2_0.CEL     cancer
                                  ...   
GSM4043340_NTL191_HTA_2_0.CEL    healthy
GSM4043341_NTL192_HTA_2_0.CEL    healthy
GSM4043342_NTL193_HTA_2_0.CEL    healthy
GSM4043343_NTL200_HTA_2_0.CEL    healthy
GSM4043344_NTL201_HTA_2_0.CEL    healthy
Name: labels, Length: 69, dtype: object


In [10]:
# Report how many rows carry each label, then discard every row whose
# label is 'unknown'. The counts are printed before the filter is applied.
print(data['labels'].value_counts())
data = data[data['labels'].ne('unknown')]

cancer     39
healthy    30
Name: labels, dtype: int64


In [11]:
# Write the prepared table to a CSV file under DATA_PATH.
data.to_csv(f'{DATA_PATH}data_peru_prep.csv')