In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve,roc_curve,auc
from sklearn.feature_selection import RFE

In [2]:
def get_characterstic_cols(cols):
    """Split a tab-separated characteristics row into its name and values.

    Every cell after the row label is expected to look like
    ``"<name>: <value>"`` (optionally wrapped in double quotes).

    Parameters
    ----------
    cols : list of str
        The tab-split row. ``cols[0]`` (the row label) is ignored;
        ``cols[1:]`` are the per-sample cells.

    Returns
    -------
    list of str
        ``[name, value_1, value_2, ...]``. ``name`` is the text before the
        first colon of the first sample cell; each value is the text after
        the first colon of its cell. Surrounding whitespace and double
        quotes are removed from every entry.

    Raises
    ------
    IndexError
        If ``cols`` contains no sample cells (fewer than two entries).
    """
    def _clean(text):
        # Whitespace first, then quotes: a trailing newline sits outside
        # the closing quote on the last cell of a line.
        return text.strip().strip('"')

    # The characteristic name is taken from the first sample cell only.
    char_cols = [_clean(cols[1].split(':', 1)[0])]
    for cell in cols[1:]:
        # Split on the FIRST colon only so that values which themselves
        # contain a colon (e.g. "time: 12:30") are kept whole.
        _, sep, value = cell.partition(':')
        # A cell with no colon at all (e.g. an empty `""` cell) has no name
        # prefix; keep its text as the value instead of raising IndexError.
        char_cols.append(_clean(value if sep else cell))
    return char_cols

In [3]:
def read_header_data(headers, min_cols=157):
    """Build a frame of integer-coded sample characteristics.

    Only rows whose first tab-separated field is exactly
    ``!Sample_characteristics_ch1`` contribute a column; every other row is
    ignored.

    Parameters
    ----------
    headers : iterable of str
        Raw, tab-separated header lines.
    min_cols : int, optional
        Rows with fewer tab-separated fields than this are skipped. The
        default, 157, is the threshold that used to be hard-coded here
        (presumably one row label plus 156 samples -- confirm against the
        input file).

    Returns
    -------
    pandas.DataFrame
        One column per characteristic row, named after the characteristic,
        holding the pandas category codes of that row's values. Empty if no
        row qualifies.
    """
    header_df = pd.DataFrame()
    for header in headers:
        header_cols = header.split('\t')
        if len(header_cols) < min_cols:
            continue
        if header_cols[0] != '!Sample_characteristics_ch1':
            continue
        cols = get_characterstic_cols(header_cols)
        name = cols[0]
        header_df[name] = cols[1:]
        # Replace the textual values with their integer category codes so
        # the column is numeric.
        header_df[name] = header_df[name].astype('category')
        header_df[name] = header_df[name].cat.codes
    return header_df

In [4]:
def read_series_matrix(series_matrix, min_cols=157):
    """Turn tab-separated table lines into a frame with one column per line.

    For every qualifying line the first field becomes the column name and
    the remaining fields become that column's values. Surrounding
    whitespace and double quotes are removed from names and values. Values
    are kept as strings; no numeric conversion is performed here.

    Parameters
    ----------
    series_matrix : iterable of str
        Raw, tab-separated lines.
    min_cols : int, optional
        Lines with fewer tab-separated fields than this are skipped. The
        default, 157, is the threshold that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One column per qualifying line (a repeated name keeps the last
        line's values). Empty if no line qualifies.

    Raises
    ------
    ValueError
        Raised by pandas if qualifying lines have differing field counts.
    """
    # Collect all columns first and build the frame once; inserting columns
    # one at a time into a DataFrame is slow for wide tables.
    columns = {}
    for line in series_matrix:
        cols = line.split('\t')
        if len(cols) < min_cols:
            continue
        col_header = cols[0].strip().strip('"')
        columns[col_header] = [value.strip().strip('"') for value in cols[1:]]
    return pd.DataFrame(columns)
    

In [5]:
def prepare_data(data, train_frac=0.9, random_state=None):
    """Shuffle ``data`` and split its rows at random into train and test sets.

    Each row is assigned to the training set independently with probability
    ``train_frac``, so the split sizes vary from run to run and are only
    approximately ``train_frac`` / ``1 - train_frac``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. The input frame is not modified.
    train_frac : float, optional
        Probability that a row lands in the training set (default 0.9, the
        value that used to be hard-coded).
    random_state : int or None, optional
        Seed for a reproducible shuffle and split. ``None`` (default) keeps
        the previous unseeded behaviour.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)``; together they contain every input row
        exactly once. Either may be empty.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError("train_frac must be between 0 and 1, got {!r}".format(train_frac))

    if random_state is None:
        # Unseeded path: same calls as before, result differs on every run.
        shuffled = data.sample(frac=1)
        draws = np.random.rand(len(shuffled))
    else:
        # One private generator drives both the shuffle and the mask so a
        # given seed always yields the same split.
        rng = np.random.RandomState(random_state)
        shuffled = data.sample(frac=1, random_state=rng)
        draws = rng.rand(len(shuffled))

    shuffled = shuffled.reset_index(drop=True)
    msk = draws <= train_frac
    train_data = shuffled[msk]
    test_data = shuffled[~msk]
    return train_data, test_data

In [6]:
def get_features(train_data, test_data, target_col='tissue type'):
    """Separate the label column from the feature columns of both splits.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames that each contain the ``target_col`` column.
    target_col : str, optional
        Name of the label column (default ``'tissue type'``, the name that
        used to be hard-coded).

    Returns
    -------
    tuple
        ``(x_train, y, x_test, y_out)``: the two feature frames (every
        column except ``target_col``) and the two label arrays
        (``.values`` of ``target_col``).

    Raises
    ------
    KeyError
        If ``target_col`` is missing from either frame.
    """
    def _split(frame):
        # The boolean column mask keeps every column except the label.
        features = frame.loc[:, frame.columns != target_col]
        labels = frame[target_col].values
        return features, labels

    x_train, y = _split(train_data)
    x_test, y_out = _split(test_data)
    return x_train, y, x_test, y_out

In [7]:
def get_precision_recall(y_out, model_pred, model_name):
    """Print confusion-matrix based metrics for a binary classifier.

    Prints the four confusion-matrix counts, per-class recall and
    precision, the G-mean of the two recalls, and the average precision
    score. Nothing is returned.

    Parameters
    ----------
    y_out : array-like
        True labels.
    model_pred : array-like
        Predicted labels.
    model_name : str
        Currently unused; it was only referenced by the plotting calls,
        which are disabled.

    Raises
    ------
    ValueError
        If the confusion matrix does not have exactly four cells (i.e. the
        labels are not binary), because the unpacking below fails.
    """
    tn, fp, fn, tp = confusion_matrix(y_out, model_pred).ravel()

    def _ratio(num, den, scale=1):
        # A zero denominator means the class (or prediction) never
        # occurred; report NaN rather than dividing by zero.
        return scale * num / den if den else float('nan')

    print("tp:{},fn:{}".format(tp, fn))
    print("fp:{},tn:{}".format(fp, tn))
    print("recall pos:{:0.2f}%, recall neg:{:0.2f}%".format(
        _ratio(tp, tp + fn, 100), _ratio(tn, tn + fp, 100)))
    print("precision pos:{:0.2f}%, precision neg:{:0.2f}%".format(
        _ratio(tp, tp + fp, 100), _ratio(tn, tn + fn, 100)))

    average_precision = average_precision_score(y_out, model_pred)
    # G-mean is the geometric mean of the two recalls, i.e. the square
    # root of their product (the root was previously missing).
    gmean = (_ratio(tp, tp + fn) * _ratio(tn, tn + fp)) ** 0.5
    print("Gmean :{:0.2f}%".format(100 * gmean))
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))

    # Plotting of the PR / ROC curves is disabled, so the curves themselves
    # are no longer computed here.

In [8]:
def train_test_model(model, x_train, y, x_test, y_out, model_name):
    """Fit ``model`` on the training split and report its test-split results.

    Prints the confusion matrix and the classification report for the
    predictions on ``x_test``, then hands the predictions to
    ``get_precision_recall`` for the remaining metrics. Returns nothing.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place.
    x_train, y : training features and labels.
    x_test, y_out : test features and true test labels.
    model_name : str
        Passed through to ``get_precision_recall``.
    """
    model.fit(x_train, y)
    predictions = model.predict(x_test)

    matrix = confusion_matrix(y_out, predictions)
    print('\nConfusion matrix\n', matrix)

    report = classification_report(y_out, predictions)
    print(report)

    get_precision_recall(y_out, predictions, model_name)

In [9]:
def feature_importance(model, x_train, y, n_features=3):
    """Rank features with recursive feature elimination and print the result.

    Prints, in order: the boolean selection mask, the column labels of
    ``x_train``, and the RFE ranking. Returns nothing.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RFE``; RFE fits it repeatedly.
    x_train : pandas.DataFrame
        Training features; its ``columns`` are printed next to the mask.
    y : array-like
        Training labels.
    n_features : int, optional
        Number of features RFE should keep (default 3, the value that used
        to be passed positionally).
    """
    # `n_features_to_select` must be passed by keyword: current
    # scikit-learn releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=n_features)
    rfe = rfe.fit(x_train, y)

    # Summarize the selection of the attributes.
    print(rfe.support_)
    print(x_train.columns)
    print(rfe.ranking_)

In [10]:
def process_from_file(file_name, header_end=68, matrix_start=69, matrix_end=100):
    """Load a series-matrix text file and print an RFE feature ranking.

    Steps: read all lines, parse the header slice into coded sample
    characteristics, parse the matrix slice into feature columns, join the
    two side by side, print the first two rows, split into train/test, and
    run ``feature_importance`` with a class-balanced logistic regression on
    the training split. The test split is produced but not evaluated here.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.
    header_end : int, optional
        Lines ``[0:header_end]`` are passed to ``read_header_data``
        (default 68, previously hard-coded).
    matrix_start, matrix_end : int, optional
        Lines ``[matrix_start:matrix_end]`` are passed to
        ``read_series_matrix`` (defaults 69 and 100, previously
        hard-coded).
    """
    # Read everything up front and close the handle right away.
    with open(file_name) as handle:
        line = handle.readlines()

    header_df = read_header_data(line[:header_end])
    series_matrix_df = read_series_matrix(line[matrix_start:matrix_end])

    # Column-wise join: both frames are indexed by row position.
    features_df = pd.concat([header_df, series_matrix_df], axis=1)
    print(features_df.head(2))

    train_data, test_data = prepare_data(features_df)
    x_train, y, x_test, y_out = get_features(train_data, test_data)

    print('Model: LogisticRegression')
    lr = LogisticRegression(C=1.2, class_weight='balanced')
    feature_importance(lr, x_train, y)
    

In [11]:
# Run the pipeline on the GSE19188 series-matrix file; this prints the first
# two feature rows, the model name, and the RFE selection output.
process_from_file('GSE19188_series_matrix.txt')

   tissue type  cell type  overall survival  status  gender     1007_s_at  \
0            1          1                12       2       1   0.278536376   
1            0          3                80       0       2  -0.005101812   

        1053_at        117_at       121_at     1255_g_at      ...       \
0   0.460966633  -0.272634196  0.293016021   1.536637393      ...        
1  -0.800087059  -1.059394824  0.053426368  -0.141373795      ...        

     1552271_at  1552272_a_at    1552274_at  1552275_s_at  1552276_a_at  \
0    0.12431423  -0.013780876  -1.138924661  -1.423838841   0.539265001   
1  -0.272252398  -0.074565112  -0.449307949  -0.518601288  -0.174576768   

   1552277_a_at  1552278_a_at  1552279_a_at    1552280_at    1552281_at  
0  -0.074060128   0.078355844  -0.036655156  -0.161718138  -0.346817575  
1   -0.59022284  -0.411005264  -0.372878809  -0.387796987   0.129773142  

[2 rows x 36 columns]
Model: LogisticRegression
[ True False  True False False False False False

