In [None]:
import math
import csv
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification

from sklearn.linear_model import (RandomizedLasso, RandomizedLogisticRegression)

import warnings

In [None]:
# Silence all library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Load the protein table and keep a raw-array view of it.
data = pd.read_csv("final_1.csv")
protein = data.values

# Column 2 is used as the class label; columns 4 onward are the features.
y = protein[:, 2]
x = protein[:, 4:]

# Recursive feature elimination wrapped around a logistic-regression
# estimator, keeping five features.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=5)
fit = rfe.fit(x, y)

# Report the selection mask, the per-feature ranks and the feature matrix.
for report in (rfe.support_, rfe.ranking_, x):
    print(report)

# TODO: write a for loop (100 iterations) to see which proteins are selected most often
# SVM: results will be stable
# split training and test 80-20
# use a different training/test split for each iteration
# test the 5 selected features on the test set to get the accuracy
# use transform() to reduce the test features to the 5 selected dimensions before testing

In [None]:
# Show the boolean selection mask and the per-feature ranks of the fitted RFE.
print(rfe.support_)
print(rfe.ranking_)

# ranking_ holds ranks (1 = selected), not column positions, so it must not be
# used as an index. Pick the kept features with the boolean mask instead; the
# feature columns start at position 4 of `data`.
feature_names = data.columns[4:]
colname = feature_names[rfe.support_]
print(colname)


In [None]:
# RFECV feature selection with a linear SVM.
data = pd.read_csv("final_1_copy.csv")
data = data.values

# In this file column 0 is used as the label and the remaining columns as features.
X = data[:, 1:]
y = data[:, 0]

# Estimator: linear SVM classifier. One feature is eliminated per step and each
# candidate feature count is scored by cross-validated accuracy. The number of
# folds is controlled by RFECV's `cv` argument, which is left at its default here.
svc = SVC(kernel="linear")
rfecv_svc = RFECV(estimator=svc, step=1, scoring='accuracy')
rfecv_svc.fit(X, y)

print("Optimal number of features : %d" % rfecv_svc.n_features_)

# grid_scores_ was removed from newer scikit-learn releases in favour of
# cv_results_; read whichever one this installation provides.
if hasattr(rfecv_svc, "cv_results_"):
    cv_scores = rfecv_svc.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv_svc.grid_scores_

# Plot number of features vs. cross-validation scores.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

print(rfecv_svc.n_features_)
print(rfecv_svc.ranking_)

In [None]:
# RFECV feature selection with a random forest.
data = pd.read_csv("final_1_copy.csv")
data = data.values

# In this file column 0 is used as the label and the remaining columns as features.
X = data[:, 1:]
y = data[:, 0]

# Seed the forest so the feature ranking is repeatable across kernel restarts;
# an unseeded forest can select a different subset on every run.
rf = RandomForestClassifier(criterion='gini', random_state=0)
rfecv_rf = RFECV(rf, scoring='accuracy')
rfecv_rf.fit(X, y)

print("Optimal number of features : %d" % rfecv_rf.n_features_)

# grid_scores_ was removed from newer scikit-learn releases in favour of
# cv_results_; read whichever one this installation provides.
if hasattr(rfecv_rf, "cv_results_"):
    cv_scores = rfecv_rf.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv_rf.grid_scores_

# Plot number of features vs. cross-validation scores.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

print(rfecv_rf.n_features_)

In [None]:
# Names of the features kept by the SVM-based RFECV.
data = pd.read_csv("final_1_copy.csv")

# ranking_ holds ranks (1 = selected), not column positions, so indexing the
# columns with it returns unrelated names. Use the boolean support_ mask over
# the feature columns (everything after the label in column 0).
feature_names = data.columns[1:]
colname = feature_names[rfecv_svc.support_]
print(colname)

In [None]:
# Names of the features kept by the random-forest-based RFECV.
data = pd.read_csv("final_1_copy.csv")

# ranking_ holds ranks (1 = selected), not column positions, so indexing the
# columns with it returns unrelated names. Use the boolean support_ mask over
# the feature columns (everything after the label in column 0).
feature_names = data.columns[1:]
colname = feature_names[rfecv_rf.support_]
print(colname)

In [None]:
# Repeated RFE with a linear SVM over different 80/20 train/test splits.
data = pd.read_csv("final_1.csv")
protein = data.values

# Column 2 is used as the class label; columns 4 onward are the features.
x = protein[:, 4:]
y = protein[:, 2]
feature_names = data.columns[4:]

protein_freq = []    # selected feature names, one entry per trial
train_accuracy = []  # kept for the currently disabled training-score bookkeeping
test_accuracy = []   # held-out accuracy, one entry per trial

# The loop header was missing, which left the body as an indentation error
# with `i` undefined. 100 trials matches the other RFE trial cell.
for i in range(100):
    # A different seed per trial gives a different split each time.
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=i)
    svc = SVC(kernel="linear")
    rfe = RFE(svc, n_features_to_select=5)  # selects 5 features

    rfe.fit(X_train, y_train)
    test_pred = rfe.predict(X_test)
    # Despite the variable name, this is the accuracy on the held-out split.
    test_error = metrics.accuracy_score(y_test, test_pred, normalize=True)

    # The selection is only available after fit(). ranking_ holds ranks, not
    # column positions, so use the boolean support_ mask to get the names.
    colname = feature_names[rfe.support_]
    protein_freq.append(colname)
    test_accuracy.append(test_error)


In [None]:
# RFE: repeat 5-feature selection with a linear SVM over 100 different
# 80/20 train/test splits and record what was selected and how it scored.
data = pd.read_csv("final_1.csv")
protein = data.values

# Column 2 is used as the class label; columns 4 onward are the features.
x = protein[:, 4:]
y = protein[:, 2]
feature_names = data.columns[4:]

protein_freq = []    # selected feature names, one entry per trial
train_accuracy = []
test_accuracy = []   # held-out accuracy, one entry per trial

for i in range(0, 100):
    # A different seed per trial gives a different split each time.
    X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=i)
    svc = SVC(kernel="linear")
    rfe = RFE(svc, n_features_to_select=5)  # selects 5 features

    # Build the model and test it on the held-out split.
    rfe.fit(X_train, y_train)
    test_pred = rfe.predict(X_test)
    test_score = metrics.accuracy_score(y_test, test_pred, normalize=True)

    # Record the 5 selected proteins and the accuracy. ranking_[0:5] is the
    # *ranks* of the first five features, not the positions of the selected
    # ones, so the boolean support_ mask is used to look up the names.
    colname = feature_names[rfe.support_]
    protein_freq.append(colname)
    test_accuracy.append(test_score)


In [None]:
# Print each trial's selected proteins and test accuracy. Iterating over the
# recorded results (rather than a fixed 100) keeps this cell in step with
# however many trials were actually run.
for trial, (proteins, accuracy) in enumerate(zip(protein_freq, test_accuracy), start=1):
    print("trial: %d" % trial)
    print(proteins)
    print(accuracy)

In [None]:
# Persist the per-trial selections and their test accuracies as CSV files.
result = protein_freq

# Selected proteins, one entry per trial.
selection_table = np.array(protein_freq)
df = pd.DataFrame(selection_table)
df.to_csv("results.csv")

# Matching test accuracies, one entry per trial.
accuracy_column = np.array(test_accuracy)
score = pd.DataFrame(accuracy_column)
score.to_csv("score.csv")

In [None]:
# Minimum test accuracy a protein combination needs to be kept.
ACCURACY_THRESHOLD = 0.75

# Read in the protein combinations and their accuracies.
results = pd.read_csv("combined.csv")

# sort_values returns a new frame, so the result has to be captured; the
# original call discarded it and then sliced the unsorted table.
# Highest accuracy first.
results_sorted = results.sort_values(by='accuracy', ascending=False)

# Keep results with accuracy >= 0.75 by filtering on the value itself rather
# than relying on a hard-coded row count.
results_sorted[results_sorted['accuracy'] >= ACCURACY_THRESHOLD]


In [None]:
# Load the combined list of proteins from the filtered CSV.
protein_list = pd.read_csv("protein_list_filter.csv")

# Count how many times each protein occurs, most frequent first.
protein_counts = protein_list['protein'].value_counts()
protein_counts



In [None]:
# Randomized logistic regression (stability selection via sub-sampling).
# NOTE: RandomizedLogisticRegression was removed in scikit-learn 0.21, so this
# cell needs an older scikit-learn release.
data = pd.read_csv("final_1.csv")
protein = data.values
protein_names = list(data)[4:]

# Columns 4 onward are the features.
x = protein[:, 4:]

# The estimator requires a numeric target, but column 2 holds string labels.
# Encode them as 0 = 'ctrl', 1 = anything else (treatment).
y = (data.iloc[:, 2] != 'ctrl').astype(int).values

stabsel = RandomizedLogisticRegression(selection_threshold=0.75)
stabsel.fit(x, y)

print ("Features sorted by their score:")
print (sorted(zip(map(lambda score: round(score, 4), stabsel.scores_),
                 protein_names), reverse=True))



In [None]:
# Randomized logistic regression (stability selection via sub-sampling) with a
# permissive selection threshold.
# NOTE: RandomizedLogisticRegression was removed in scikit-learn 0.21, so this
# cell needs an older scikit-learn release.
data = pd.read_csv("final_1.csv")
protein = data.values
protein_names = list(data)[4:]

# Columns 4 onward are the features.
x = protein[:, 4:]

# Convert y to numeric values; 0 = ctrl, 1 = treatment. This is vectorised so
# it works for any number of rows (the previous loop was hard-coded to 84) and
# no longer overwrites the label column of `protein` through a view.
y = (data.iloc[:, 2] != 'ctrl').astype(int).values

stabsel = RandomizedLogisticRegression(selection_threshold=0.1)
stabsel.fit(x, y)


In [None]:
# Indices of the features that passed the stability-selection threshold.
index = stabsel.get_support(indices=True)
print (index)

# The header promises score order, so rank the selected features by their
# stability score (highest first) instead of listing them in column order.
ranked = sorted(index, key=lambda feature_idx: stabsel.scores_[feature_idx], reverse=True)

print ("Features sorted by their score:")
for feature_idx in ranked:
    print(stabsel.scores_[feature_idx], "    ", protein_names[feature_idx])

# Names only, in the same order, for easy copy-paste.
for feature_idx in ranked:
    print(protein_names[feature_idx])