In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure drawn in this notebook
plt.rcParams.update({'figure.figsize': (15, 8)})

In [3]:
# Load the four data files; fields are separated by runs of whitespace and
# the files carry no header row.
# The separator is written as a raw string so the regex `\s+` is not parsed as
# an (invalid) Python escape sequence, which newer interpreters warn about.
df_xtrain = pd.read_table('14cancer.xtrain', sep=r'\s+', header=None, encoding='utf-8')
df_ytrain = pd.read_table('14cancer.ytrain', sep=r'\s+', header=None, encoding='utf-8')
df_xtest = pd.read_table('14cancer.xtest', sep=r'\s+', header=None, encoding='utf-8')
df_ytest = pd.read_table('14cancer.ytest', sep=r'\s+', header=None, encoding='utf-8')

In [4]:
# Inspect the shapes of the loaded frames.
# Feature frames: (n_genes, n_samples); label frames: (1, n_samples)
print(*(frame.shape for frame in (df_xtrain, df_ytrain, df_xtest, df_ytest)))

(16063, 144) (1, 144) (16063, 54) (1, 54)


In [5]:
# Preview the first five rows of the training frame
df_xtrain.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,134,135,136,137,138,139,140,141,142,143
0,-73,-16,4,-31,-33,-37,-18,-26,-40,22,...,274,-96,-96,-124,-201,-196,34,-56,-245,-26
1,-69,-63,-45,-110,-39,-90,28,-23,-264,-14,...,-915,-221,-458,-664,-259,-369,-81,-818,-235,-1595
2,-48,-97,-112,-20,-45,-75,10,2,-335,-21,...,-303,-119,-134,-361,22,-263,-146,-1338,-127,-2085
3,13,-42,-25,-50,14,-46,30,34,18,26,...,29,243,109,21,140,162,-151,-57,197,-334
4,-86,-91,-85,-115,-56,-45,-56,-54,-163,-42,...,-171,-224,-630,-519,-277,-277,-174,-989,-562,-455


In [6]:
# Initial cleaning: remove the samples that contain NaN values
def drop_df_nan(df_x, df_y):
    """Drop every sample (column) that holds a missing value in either frame.

    Columns of ``df_x`` and ``df_y`` are paired by position, so the same
    positions are removed from both frames and they stay aligned regardless
    of what the column labels are.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Feature values, shape (m, n) with one column per sample.
    df_y : pandas.DataFrame
        Labels, shape (1, n) with one column per sample.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The filtered ``(df_x, df_y)`` pair. The inputs are never modified;
        when nothing is missing the very same objects are handed back.

    Raises
    ------
    ValueError
        If the two frames do not have the same number of columns.
    """
    if df_x.shape[1] != df_y.shape[1]:
        raise ValueError(
            "df_x and df_y must have the same number of columns (samples): "
            "%d != %d" % (df_x.shape[1], df_y.shape[1])
        )

    # One boolean per column position. isna() also copes with object columns,
    # where np.isnan would raise a TypeError.
    missing_in_x = df_x.isna().any(axis=0).to_numpy()
    missing_in_y = df_y.isna().any(axis=0).to_numpy()
    nan_cols = missing_in_x | missing_in_y

    if nan_cols.any():
        # Select by boolean mask instead of drop(columns=...): the mask is
        # positional, so it stays correct when the labels are not 0..n-1.
        keep = ~nan_cols
        df_x = df_x.loc[:, keep]
        df_y = df_y.loc[:, keep]
    return df_x, df_y
# Clean the training split, then the test split; each pair of names is rebound
# to whatever drop_df_nan returns for it.
df_xtrain,df_ytrain = drop_df_nan(df_xtrain, df_ytrain)
df_xtest, df_ytest = drop_df_nan(df_xtest, df_ytest)

In [7]:
# Follow the usual machine-learning layout: transpose so that shape = (n_samples, d).
# NOTE: the names are rebound to their own transpose, so executing this cell a
# second time flips the frames back.
df_xtrain, df_xtest = df_xtrain.transpose(), df_xtest.transpose()

In [8]:
# Look at the training frame after the adjustment
df_xtrain.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16053,16054,16055,16056,16057,16058,16059,16060,16061,16062
0,-73.0,-69.0,-48.0,13.0,-86.0,-147.0,-65.0,-71.0,-32.0,100.0,...,-134.0,352.0,-67.0,121.0,-5.0,-11.0,-21.0,-41.0,-967.0,-120.0
1,-16.0,-63.0,-97.0,-42.0,-91.0,-164.0,-53.0,-77.0,-17.0,122.0,...,-51.0,244.0,-15.0,119.0,-32.0,4.0,-14.0,-28.0,-205.0,-31.0
2,4.0,-45.0,-112.0,-25.0,-85.0,-127.0,56.0,-110.0,81.0,41.0,...,14.0,163.0,-14.0,7.0,15.0,-8.0,-104.0,-36.0,-245.0,34.0
3,-31.0,-110.0,-20.0,-50.0,-115.0,-113.0,-17.0,-40.0,-17.0,80.0,...,26.0,625.0,18.0,59.0,-10.0,32.0,-2.0,10.0,-495.0,-37.0
4,-33.0,-39.0,-45.0,14.0,-56.0,-106.0,73.0,-34.0,18.0,64.0,...,-69.0,398.0,38.0,215.0,-2.0,44.0,3.0,68.0,-293.0,-34.0


In [9]:
# Preview the first five rows of the test frame
df_xtest.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16053,16054,16055,16056,16057,16058,16059,16060,16061,16062
0,-44.0,-254.0,-166.0,-55.0,-130.0,-133.0,80.0,-62.0,1.0,22.0,...,-51.0,715.0,25.0,47.0,-270.0,-265.0,-290.0,-235.0,-826.0,-262.0
1,-13.0,-124.0,-206.0,-29.0,-117.0,-114.0,111.0,-20.0,-25.0,18.0,...,-57.0,695.0,10.0,57.0,-50.0,-45.0,-61.0,-22.0,-263.0,-52.0
2,-64.3,-184.9,-334.4,-102.2,-289.0,-145.2,-251.9,-150.9,-184.0,179.1,...,-173.4,1324.6,9.8,127.7,244.0,76.5,100.1,-69.6,-1444.3,-121.6
3,-22.0,-169.0,-164.0,-32.0,-49.0,-141.0,0.0,-22.0,16.0,67.0,...,21.0,170.0,20.0,35.0,-275.0,-230.0,-136.0,-228.0,-404.0,-259.0
4,-28.0,-167.0,-158.0,100.0,-129.0,-232.0,148.0,-48.0,14.0,164.0,...,-66.0,1627.0,76.0,119.0,-6.0,68.0,-22.0,27.0,-1088.0,-58.0


In [290]:
# Put each sample's class label into the first column of its feature frame.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so guard on the column name to keep this cell safe
# to execute more than once.
if 'Type' not in df_xtrain.columns:
    df_xtrain.insert(0, 'Type', df_ytrain.T)
if 'Type' not in df_xtest.columns:
    df_xtest.insert(0, 'Type', df_ytest.T)

In [291]:
# Preview the first five rows of the training frame
df_xtrain.head(n=5)

Unnamed: 0,Type,0,1,2,3,4,5,6,7,8,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,2,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,...,-0.238511,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172
1,2,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,...,-0.657394,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395
2,2,-0.084469,-1.649739,-0.241307,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,...,-0.696352,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434
3,2,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,...,0.259746,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483
4,2,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,...,-0.200404,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966


In [292]:
# NOTE(review): same preview call as the preceding cell; looks redundant
df_xtrain.head(n=5)

Unnamed: 0,Type,0,1,2,3,4,5,6,7,8,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,2,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,...,-0.238511,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172
1,2,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,...,-0.657394,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395
2,2,-0.084469,-1.649739,-0.241307,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,...,-0.696352,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434
3,2,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,...,0.259746,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483
4,2,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,...,-0.200404,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966


In [297]:
# NOTE(review): looks like a leftover scratch/debug cell; consider deleting it
print('a')

a
