# Predicting GenoType based on protein expression using SVM

In [212]:
import pandas as pd
import numpy as  np

In [213]:
data = pd.read_csv('data.csv')

In [214]:
data.columns

Index(['MouseID', 'DYRK1A_N', 'ITSN1_N', 'BDNF_N', 'NR1_N', 'NR2A_N', 'pAKT_N',
       'pBRAF_N', 'pCAMKII_N', 'pCREB_N', 'pELK_N', 'pERK_N', 'pJNK_N',
       'PKCA_N', 'pMEK_N', 'pNR1_N', 'pNR2A_N', 'pNR2B_N', 'pPKCAB_N',
       'pRSK_N', 'AKT_N', 'BRAF_N', 'CAMKII_N', 'CREB_N', 'ELK_N', 'ERK_N',
       'GSK3B_N', 'JNK_N', 'MEK_N', 'TRKA_N', 'RSK_N', 'APP_N', 'Bcatenin_N',
       'SOD1_N', 'MTOR_N', 'P38_N', 'pMTOR_N', 'DSCR1_N', 'AMPKA_N', 'NR2B_N',
       'pNUMB_N', 'RAPTOR_N', 'TIAM1_N', 'pP70S6_N', 'NUMB_N', 'P70S6_N',
       'pGSK3B_N', 'pPKCG_N', 'CDK5_N', 'S6_N', 'ADARB1_N', 'AcetylH3K9_N',
       'RRP1_N', 'BAX_N', 'ARC_N', 'ERBB4_N', 'nNOS_N', 'Tau_N', 'GFAP_N',
       'GluR3_N', 'GluR4_N', 'IL1B_N', 'P3525_N', 'pCASP9_N', 'PSD95_N',
       'SNCA_N', 'Ubiquitin_N', 'pGSK3B_Tyr216_N', 'SHH_N', 'BAD_N', 'BCL2_N',
       'pS6_N', 'pCFOS_N', 'SYP_N', 'H3AcK18_N', 'EGR1_N', 'H3MeK4_N',
       'CaNA_N', 'Genotype', 'Treatment', 'Behavior', 'class'],
      dtype='object')

In [215]:
data = data.drop(['MouseID','Treatment', 'Behavior', 'Genotype'],axis=1)

In [216]:
data.isnull().sum()

DYRK1A_N             3
ITSN1_N              3
BDNF_N               3
NR1_N                3
NR2A_N               3
pAKT_N               3
pBRAF_N              3
pCAMKII_N            3
pCREB_N              3
pELK_N               3
pERK_N               3
pJNK_N               3
PKCA_N               3
pMEK_N               3
pNR1_N               3
pNR2A_N              3
pNR2B_N              3
pPKCAB_N             3
pRSK_N               3
AKT_N                3
BRAF_N               3
CAMKII_N             3
CREB_N               3
ELK_N               18
ERK_N                3
GSK3B_N              3
JNK_N                3
MEK_N                7
TRKA_N               3
RSK_N                3
                  ... 
S6_N                 0
ADARB1_N             0
AcetylH3K9_N         0
RRP1_N               0
BAX_N                0
ARC_N                0
ERBB4_N              0
nNOS_N               0
Tau_N                0
GFAP_N               0
GluR3_N              0
GluR4_N              0
IL1B_N     

In [217]:
columns = data.columns
X_data = data[columns[:-1]]
y_data = data[columns[-1]]
X_data.replace('',np.NaN,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [218]:
from sklearn.preprocessing import Imputer

In [219]:
imputer = Imputer()

In [220]:
imputer.fit(X_data)

Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [221]:
X_data = pd.DataFrame(columns=X_data.columns,data=imputer.transform(X_data))

In [222]:
X_data.isnull().sum()

DYRK1A_N           0
ITSN1_N            0
BDNF_N             0
NR1_N              0
NR2A_N             0
pAKT_N             0
pBRAF_N            0
pCAMKII_N          0
pCREB_N            0
pELK_N             0
pERK_N             0
pJNK_N             0
PKCA_N             0
pMEK_N             0
pNR1_N             0
pNR2A_N            0
pNR2B_N            0
pPKCAB_N           0
pRSK_N             0
AKT_N              0
BRAF_N             0
CAMKII_N           0
CREB_N             0
ELK_N              0
ERK_N              0
GSK3B_N            0
JNK_N              0
MEK_N              0
TRKA_N             0
RSK_N              0
                  ..
CDK5_N             0
S6_N               0
ADARB1_N           0
AcetylH3K9_N       0
RRP1_N             0
BAX_N              0
ARC_N              0
ERBB4_N            0
nNOS_N             0
Tau_N              0
GFAP_N             0
GluR3_N            0
GluR4_N            0
IL1B_N             0
P3525_N            0
pCASP9_N           0
PSD95_N      

In [223]:
from sklearn.model_selection  import train_test_split

In [224]:
X_train, X_test, y_train, y_test = train_test_split( X_data, y_data, test_size=0.2)

In [225]:
from sklearn.svm import SVC
from sklearn.grid_search import RandomizedSearchCV
import scipy

In [226]:
clf  =  SVC()

In [227]:
s = RandomizedSearchCV(clf,param_distributions={'C': scipy.stats.expon(scale=100), 'gamma': scipy.stats.expon(scale=.1),
  'kernel': ['rbf','linear']},)

In [228]:
s.fit(X_train,y_train)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f887f3ddc88>, 'kernel': ['rbf', 'linear'], 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f887f3ddb70>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)

In [229]:
print("Train score ",s.score(X_train,y_train))

Train score  1.0


In [230]:
print("Test score ",s.score(X_test,y_test))

Test score  1.0


In [231]:
s.best_params_

{'C': 255.46499718570018, 'gamma': 0.039187436315744249, 'kernel': 'rbf'}

In [232]:
s.predict(X_test[:10])

array(['c-CS-m', 't-SC-m', 't-CS-s', 't-CS-m', 'c-CS-s', 't-SC-m',
       't-CS-m', 't-SC-m', 'c-SC-s', 't-SC-s'], dtype=object)

In [233]:
y_test[:10]

37     c-CS-m
793    t-SC-m
935    t-CS-s
693    t-CS-m
391    c-CS-s
782    t-SC-m
603    t-CS-m
786    t-SC-m
524    c-SC-s
959    t-SC-s
Name: class, dtype: object

In [267]:
random_indices = np.random.randint(0,X_plot_data.shape[0],)

In [268]:
random_indices

2