In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Default size of every matplotlib figure in this notebook: (width, height) in inches.
plt.rcParams['figure.figsize']=(15,8)

In [283]:
# Load the train/test feature matrices and label rows from the
# whitespace-delimited text files (the files have no header row).
# The separator regex is a raw string: '\s' inside a normal string literal is an
# invalid escape sequence and makes newer Python versions emit a warning.
df_xtrain = pd.read_table('14cancer.xtrain', sep=r'\s+', header=None, encoding='utf-8')
df_ytrain = pd.read_table('14cancer.ytrain', sep=r'\s+', header=None, encoding='utf-8')
df_xtest = pd.read_table('14cancer.xtest', sep=r'\s+', header=None, encoding='utf-8')
df_ytest = pd.read_table('14cancer.ytest', sep=r'\s+', header=None, encoding='utf-8')

In [284]:
# Inspect the shapes of the loaded frames
# features: shape=(n_genes, n_samples); labels: shape=(1, n_samples)
print(df_xtrain.shape,df_ytrain.shape,df_xtest.shape,df_ytest.shape)

(2308, 63) (1, 63) (2308, 25) (1, 25)


In [285]:
# Preview the first rows of the training features
df_xtrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,53,54,55,56,57,58,59,60,61,62
0,0.773344,-0.078178,-0.084469,0.965614,0.075664,0.458816,0.067098,0.094128,0.108316,-0.970747,...,0.242476,0.5338,-1.940027,-1.051824,-0.968637,-2.683846,-1.207646,-1.684161,-2.258568,-1.146333
1,-2.438405,-2.415754,-1.649739,-2.380547,-1.728785,-2.875286,-1.624044,-1.795165,-1.944911,-2.347582,...,-2.859455,-1.529241,-2.877061,-2.887775,-2.748872,-2.117767,-2.47813,-2.053384,-2.308603,-3.007805
2,-0.482562,0.412772,-0.241307,0.625297,0.852626,0.135841,0.519627,0.702751,0.600099,-0.392006,...,0.260362,0.436059,0.071297,0.640853,0.333683,-0.523236,0.09431,0.563835,-1.443076,-0.029326
3,-2.721135,-2.825146,-2.875286,-1.741256,0.272695,0.405398,0.238229,0.206038,-0.051083,-0.141218,...,0.361234,0.583779,-2.121932,-2.085057,-1.144133,-2.174192,0.273456,0.314446,0.233094,0.237835
4,-1.217058,-0.626236,-0.889405,-0.845366,-1.84137,-2.082647,-1.397558,-1.868209,-1.981952,-1.82325,...,-2.69711,-1.648179,-0.932674,-1.087079,-0.77219,-0.448947,-1.113218,-2.052605,-1.779633,-2.859455


In [286]:
# Preliminary cleaning: remove the samples that contain NaN values
def drop_df_nan(df_x, df_y):
    """Drop every sample (column) with a missing value in the features or labels.

    Parameters
    ----------
    df_x : pandas.DataFrame, shape (m, n)
        Feature matrix; each of the n columns is one sample.
    df_y : pandas.DataFrame, shape (1, n)
        Labels; the column at position i belongs to the sample at position i
        of ``df_x``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_x`` and ``df_y`` without the affected columns. The inputs are not
        modified; when nothing is missing the very same objects are returned.
    """
    # Positions (not labels) of the columns holding at least one missing value.
    # isna() is used instead of np.isnan so non-float dtypes do not raise.
    x_nan_pos = np.flatnonzero(df_x.isna().any(axis=0).to_numpy())
    y_nan_pos = np.flatnonzero(df_y.isna().any(axis=0).to_numpy())
    nan_pos = np.union1d(x_nan_pos, y_nan_pos)

    if nan_pos.size == 0:
        return df_x, df_y

    # drop() works on labels, so translate the positions into each frame's own
    # column labels; the two only coincide for an untouched 0..n-1 column index.
    df_x = df_x.drop(columns=df_x.columns[nan_pos])
    df_y = df_y.drop(columns=df_y.columns[nan_pos])
    return df_x, df_y
# Clean the training and the test set independently; each call rebinds the
# feature/label names to the frames returned by drop_df_nan.
df_xtrain,df_ytrain = drop_df_nan(df_xtrain, df_ytrain)
df_xtest, df_ytest = drop_df_nan(df_xtest, df_ytest)

In [287]:
# Follow the usual machine-learning layout and put the samples on the rows,
# so that shape=(n_samples, d).
# NOTE: running this cell a second time transposes the frames back again.
df_xtrain, df_xtest = df_xtrain.transpose(), df_xtest.transpose()

In [288]:
# Inspect the training features after the transpose
df_xtrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,0.565427,...,-0.238511,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172
1,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,0.159053,...,-0.657394,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395
2,-0.084469,-1.649739,-0.241307,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,0.496585,...,-0.696352,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434
3,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,0.994732,...,0.259746,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483
4,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,0.278313,...,-0.200404,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966


In [289]:
# Preview the transposed test features.
# NOTE(review): the row labels in the recorded output skip 2 and 4 — presumably
# those samples were removed by drop_df_nan; confirm.
df_xtest.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,0.139501,-1.168927,0.564973,-3.366796,-1.323132,-0.692547,2.327395,0.923703,0.112167,0.509765,...,-0.180803,-0.942635,-1.210662,-0.588787,-0.070422,-2.783852,-2.840439,-1.160913,-0.343054,-0.055513
1,1.164275,-2.018158,1.103533,-2.165435,-1.440117,-0.43742,2.661587,1.224011,0.210504,1.045563,...,-0.70948,-1.53294,-2.385967,-0.389641,0.422781,-2.81675,-2.422495,-1.722607,-1.703749,-1.69991
3,0.841093,0.25472,-0.208748,-2.148149,-1.512765,-1.263723,2.946642,0.087828,0.48292,1.06302,...,-0.067958,-1.85406,-1.541312,-1.773723,-1.879935,-2.265289,-2.405726,-0.176379,-0.128743,-0.996417
5,0.685065,-1.927579,-0.233068,-1.640413,-1.008954,0.774451,1.617168,-0.567925,0.036621,-0.101701,...,1.077559,-0.263966,-1.966113,-1.08619,0.885914,-0.24859,0.385874,-0.508163,-0.626985,-0.699366
6,-1.956163,-2.234926,0.281563,-2.695628,-1.214697,-1.059872,2.49807,0.780196,1.041583,0.7275,...,-1.20932,-0.693147,-1.846427,-0.993442,-3.294138,-3.332605,-2.282782,-0.656622,-2.012157,-1.668657


In [290]:
# 将每个样本的类别添加到dataframe的第1列
df_xtrain.insert(0,'Type',df_ytrain.T)
df_xtest.insert(0,'Type',df_ytest.T)

In [291]:
# Check that the 'Type' label column is now the first column
df_xtrain.head()

Unnamed: 0,Type,0,1,2,3,4,5,6,7,8,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,2,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,...,-0.238511,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172
1,2,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,...,-0.657394,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395
2,2,-0.084469,-1.649739,-0.241307,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,...,-0.696352,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434
3,2,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,...,0.259746,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483
4,2,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,...,-0.200404,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966


In [292]:
# NOTE(review): identical to the previous cell — possibly df_xtest.head() was
# intended here; confirm or remove.
df_xtrain.head()

Unnamed: 0,Type,0,1,2,3,4,5,6,7,8,...,2298,2299,2300,2301,2302,2303,2304,2305,2306,2307
0,2,0.773344,-2.438405,-0.482562,-2.721135,-1.217058,0.827809,1.342604,0.057042,0.133569,...,-0.238511,-0.027474,-1.660205,0.588231,-0.463624,-3.952845,-5.496768,-1.414282,-0.6476,-1.763172
1,2,-0.078178,-2.415754,0.412772,-2.825146,-0.626236,0.054488,1.429498,-0.120249,0.456792,...,-0.657394,-0.246284,-0.836325,-0.571284,0.034788,-2.47813,-3.661264,-1.093923,-1.20932,-0.824395
2,2,-0.084469,-1.649739,-0.241307,-2.875286,-0.889405,-0.027474,1.1593,0.015676,0.191942,...,-0.696352,0.024985,-1.059872,-0.403767,-0.678653,-2.939352,-2.73645,-1.965399,-0.805868,-1.139434
3,2,0.965614,-2.380547,0.625297,-1.741256,-0.845366,0.949687,1.093801,0.819736,-0.28462,...,0.259746,0.357115,-1.893128,0.255107,0.163309,-1.021929,-2.077843,-1.127629,0.331531,-2.179483
4,2,0.075664,-1.728785,0.852626,0.272695,-1.84137,0.327936,1.251219,0.77145,0.030917,...,-0.200404,0.061753,-2.273998,-0.039365,0.368801,-2.566551,-1.675044,-1.08205,-0.965218,-1.836966


In [297]:
# NOTE(review): scratch/debug cell that does not use any notebook data;
# candidate for removal.
print('a')

a
