# Support Vector Machine (SVM)

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_excel(r'Cancer_Data.xlsx', sheet_name='Table S7-Variants for Analysis')

In [3]:
dataset.head()

Unnamed: 0,chrom,pos_start,pos_end,ref,alt,labId,tumor_only,genotyper,total_reads,allele_reads,...,amino_acids,codons,existing_variation,variant_class,sift,polyphen,exac_af,short_aa_change,validation,rna_status
0,1,914477,914477,C,T,14-00141,0,mutect,33,17,...,A/T,Gcc/Acc,,SNV,tolerated(0.12),benign(0.002),,p.A551T,not_covered,not_covered
1,1,914477,914477,C,T,14-00141,0,varscan,31,17,...,A/T,Gcc/Acc,,SNV,tolerated(0.12),benign(0.002),,p.A551T,not_covered,not_covered
2,1,914941,914941,G,A,15-00492,0,mutect,74,12,...,A/V,gCa/gTa,,SNV,tolerated(0.11),benign(0.003),,p.A396V,not_done,not_done
3,1,914941,914941,G,A,15-00492,0,varscan,85,19,...,A/V,gCa/gTa,,SNV,tolerated(0.11),benign(0.003),,p.A396V,not_done,not_done
4,1,982281,982281,G,T,14-00676,0,varscan,139,13,...,G/V,gGc/gTc,,SNV,deleterious(0),probably_damaging(0.972),,p.G1111V,not_done,not_found


In [4]:
dataset2 = pd.DataFrame(dataset, columns=['labId', 'symbol'])
dataset2.head()

Unnamed: 0,labId,symbol
0,14-00141,C1orf170
1,14-00141,C1orf170
2,15-00492,C1orf170
3,15-00492,C1orf170
4,14-00676,AGRN


In [5]:
dataset_y = pd.read_excel(r'Cancer_Data.xlsx', sheet_name='Table S10-Drug Responses')

In [6]:
dataset_y.head()

Unnamed: 0,inhibitor,lab_id,ic50,auc
0,17-AAG (Tanespimycin),12-00211,10.0,225.918025
1,17-AAG (Tanespimycin),12-00219,0.276661,135.264409
2,17-AAG (Tanespimycin),12-00258,2.722845,164.561227
3,17-AAG (Tanespimycin),12-00262,0.123136,111.555971
4,17-AAG (Tanespimycin),12-00268,10.0,226.805281


In [7]:
dataset_y_2 = pd.DataFrame(dataset_y, columns=['inhibitor','lab_id', 'auc'])
dataset_y_2.head()

Unnamed: 0,inhibitor,lab_id,auc
0,17-AAG (Tanespimycin),12-00211,225.918025
1,17-AAG (Tanespimycin),12-00219,135.264409
2,17-AAG (Tanespimycin),12-00258,164.561227
3,17-AAG (Tanespimycin),12-00262,111.555971
4,17-AAG (Tanespimycin),12-00268,226.805281


In [8]:
dataset_y_3 = dataset_y_2.rename(columns = {'lab_id': 'labId'})
dataset_y_3.head()

Unnamed: 0,inhibitor,labId,auc
0,17-AAG (Tanespimycin),12-00211,225.918025
1,17-AAG (Tanespimycin),12-00219,135.264409
2,17-AAG (Tanespimycin),12-00258,164.561227
3,17-AAG (Tanespimycin),12-00262,111.555971
4,17-AAG (Tanespimycin),12-00268,226.805281


In [9]:
new_dataset = pd.merge(dataset2,dataset_y_3, on='labId')
new_dataset

Unnamed: 0,labId,symbol,inhibitor,auc
0,14-00141,C1orf170,A-674563,211.587548
1,14-00141,C1orf170,ABT-737,232.461812
2,14-00141,C1orf170,AT7519,202.207433
3,14-00141,C1orf170,AZD1480,230.673175
4,14-00141,C1orf170,Afatinib (BIBW-2992),178.365200
5,14-00141,C1orf170,Alisertib (MLN8237),173.915232
6,14-00141,C1orf170,Axitinib (AG-013736),201.516737
7,14-00141,C1orf170,BEZ235,189.090631
8,14-00141,C1orf170,BI-2536,174.906594
9,14-00141,C1orf170,BMS-345541,237.367844


# Most frequent inhibitors

In [10]:
q = new_dataset.groupby(['inhibitor']).count()
q.sort_values(by='auc', ascending = False).head()

Unnamed: 0_level_0,labId,symbol,auc
inhibitor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Imatinib,5498,5498,5498
Crizotinib (PF-2341066),5413,5413,5413
Sorafenib,5404,5404,5404
Sunitinib,5393,5393,5393
Dasatinib,5381,5381,5381


# 1st Inhibitor

In [11]:
df = new_dataset[new_dataset['inhibitor']=='Imatinib']
df.head()

Unnamed: 0,labId,symbol,inhibitor,auc
38,14-00141,C1orf170,Imatinib,195.146126
133,14-00141,C1orf170,Imatinib,195.146126
228,14-00141,FLT3,Imatinib,195.146126
323,14-00141,CA10,Imatinib,195.146126
418,14-00141,CA10,Imatinib,195.146126


In [12]:
df_y = df.groupby(['labId'])['auc'].min().reset_index()
df_y.head()

Unnamed: 0,labId,auc
0,11-00261,138.643576
1,12-00069,224.588172
2,12-00127,161.467962
3,12-00362,200.812053
4,12-00371,218.763793


In [13]:
df_y.shape

(353, 2)

In [14]:
df['new'] = 1
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,labId,symbol,inhibitor,auc,new
38,14-00141,C1orf170,Imatinib,195.146126,1
133,14-00141,C1orf170,Imatinib,195.146126,1
228,14-00141,FLT3,Imatinib,195.146126,1
323,14-00141,CA10,Imatinib,195.146126,1
418,14-00141,CA10,Imatinib,195.146126,1


In [15]:
df_x = df.pivot_table(index = ["labId"], columns ="symbol", values = ["new"], fill_value=0)
df_x

Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
symbol,A1CF,A2M,A4GALT,AADAC,AADACL3,AADACL4,AAK1,AARS2,AASS,ABCA12,...,ZNF98,ZNHIT1,ZNRF4,ZRANB1,ZRSR2,ZSCAN1,ZSCAN2,ZSWIM6,ZXDB,ZZEF1
labId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11-00261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00098,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
13-00106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
df_x.shape

(353, 2092)

In [17]:
X = df_x.iloc[:,0:-1].values

In [18]:
X.shape

(353, 2091)

In [19]:
data_y = df_y.iloc[:,-1].values
data_y

array([138.64357605, 224.58817201, 161.46796181, 200.81205251,
       218.76379334, 234.33648784, 275.03382723, 259.4706018 ,
       236.37979754, 219.71982818, 211.82356126, 215.41465221,
       246.1016214 , 254.72824919, 216.55996704, 229.65845403,
       248.69785591, 233.24305385, 241.83551399, 251.97161089,
       236.83043794, 260.23177452, 282.3603614 , 184.9155109 ,
       238.20006629, 207.15548905, 178.15122116, 206.70499083,
       246.5616602 , 232.06885066, 147.31767328, 222.06749443,
       236.54445363, 251.6817137 , 268.42981315, 224.81938631,
       236.04440486, 213.20558729, 209.43848827, 229.09358433,
       200.73904026, 208.5788773 , 260.07479049, 246.44492132,
       237.50709246, 225.48427173, 239.65784947, 235.6260637 ,
       256.49495365, 256.53543435, 244.95129898, 215.46687336,
       259.91035549, 188.66923689, 214.05457127, 215.21142264,
       257.17597489, 205.89173127, 271.21949919, 253.61070252,
       252.24662279, 272.65910635, 232.38608505, 223.41

In [20]:
y = []
for i in data_y:
    if i >= 100:
        y.append(1)
    else:
        y.append(0)

In [21]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [22]:
(unique, counts) = np.unique(y, return_counts = True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[  0   4]
 [  1 349]]


For first inhibitor we have 349 values >= 100 and 4 values < 100

## Splitting the dataset into the Training set and Test set

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [24]:
print(X_train)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [25]:
print(X_test)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Training the SVM model on the Training set

In [26]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

## Making the Confusion Matrix

In [27]:
y_pred = classifier.predict(X_test)

In [28]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 0  1]
 [ 0 70]]


0.9859154929577465

# For 2nd inhibitor

In [29]:
df = new_dataset[new_dataset['inhibitor']=='Crizotinib (PF-2341066)']
df.head()

Unnamed: 0,labId,symbol,inhibitor,auc
19,14-00141,C1orf170,Crizotinib (PF-2341066),201.69459
114,14-00141,C1orf170,Crizotinib (PF-2341066),201.69459
209,14-00141,FLT3,Crizotinib (PF-2341066),201.69459
304,14-00141,CA10,Crizotinib (PF-2341066),201.69459
399,14-00141,CA10,Crizotinib (PF-2341066),201.69459


In [31]:
df_y = df.groupby(['labId'])['auc'].min().reset_index()
df_y

Unnamed: 0,labId,auc
0,12-00069,231.442046
1,12-00127,200.220892
2,12-00362,177.629223
3,12-00371,196.302735
4,13-00098,213.021926
5,13-00106,275.033827
6,13-00118,251.617545
7,13-00149,214.238500
8,13-00160,177.378224
9,13-00163,161.247265


In [32]:
df_y.shape

(344, 2)

In [34]:
df['new'] = 1
df_x = df.pivot_table(index = ["labId"], columns ="symbol", values = ["new"], fill_value=0)
df_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
symbol,A1CF,A2M,A4GALT,AADAC,AADACL3,AADACL4,AARS2,ABCA12,ABCA2,ABCA9,...,ZNF98,ZNHIT1,ZNRF4,ZRANB1,ZRSR2,ZSCAN1,ZSCAN2,ZSWIM6,ZXDB,ZZEF1
labId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12-00069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00098,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
13-00106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13-00163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
df_x.shape

(344, 2076)

In [36]:
X = df_x.iloc[:,0:-1].values

In [37]:
X.shape

(344, 2075)

In [38]:
data_y = df_y.iloc[:,-1].values
data_y

array([231.44204633, 200.22089212, 177.62922251, 196.30273525,
       213.02192625, 275.03382723, 251.61754458, 214.23849956,
       177.37822352, 161.24726462, 201.28077173, 253.55995525,
       194.04711867, 237.32361503, 209.94431948, 246.56290385,
       196.50822463, 237.82299125, 199.53778603, 193.00914656,
       247.04842777, 246.42534299, 193.59919086, 214.48513669,
       181.38401521, 192.07921288, 182.36247163, 214.54030056,
       174.7012837 , 133.0148258 , 176.21976435, 242.60663106,
       222.97406317, 239.56272709, 285.07093892, 208.97346466,
       199.79222579, 179.82336436, 199.74170613, 227.07150499,
       205.46759594, 239.92189259, 149.7248574 , 185.63972785,
       219.69239842, 221.04256597, 242.17269638, 232.78842342,
       233.00152733, 206.55108388, 214.50094816, 218.12810222,
       236.85127073, 160.04755748, 172.37200077, 225.78134048,
       204.25715094, 225.65410737, 227.34690411, 276.46153272,
       216.56942737, 209.12128895, 285.77938831, 230.81

In [39]:
y = []
for i in data_y:
    if i >= 100:
        y.append(1)
    else:
        y.append(0)

In [40]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [41]:
(unique, counts) = np.unique(y, return_counts = True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[  0   3]
 [  1 341]]


In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [43]:
from sklearn.svm import SVC
classifier2 = SVC(kernel = 'linear', random_state = 0)
classifier2.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [44]:
y_pred = classifier2.predict(X_test)

In [45]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 0  1]
 [ 0 68]]


0.9855072463768116

# 3rd inhibitor

In [46]:
df = new_dataset[new_dataset['inhibitor']=='Sorafenib']
df.head()

Unnamed: 0,labId,symbol,inhibitor,auc
82,14-00141,C1orf170,Sorafenib,144.207481
177,14-00141,C1orf170,Sorafenib,144.207481
272,14-00141,FLT3,Sorafenib,144.207481
367,14-00141,CA10,Sorafenib,144.207481
462,14-00141,CA10,Sorafenib,144.207481


In [48]:
df_y = df.groupby(['labId'])['auc'].min().reset_index()
df_y.head()

Unnamed: 0,labId,auc
0,11-00261,232.933886
1,12-00069,206.03032
2,12-00127,211.000967
3,12-00362,157.411854
4,12-00371,153.933795


In [49]:
df_y.shape

(349, 2)

In [51]:
df['new'] = 1
df_x = df.pivot_table(index = ["labId"], columns ="symbol", values = ["new"], fill_value=0)
df_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
symbol,A1CF,A2M,A4GALT,AADAC,AADACL3,AADACL4,AAK1,AARS2,AASS,ABCA12,...,ZNF85,ZNF98,ZNHIT1,ZNRF4,ZRANB1,ZRSR2,ZSCAN2,ZSWIM6,ZXDB,ZZEF1
labId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11-00261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00098,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
13-00106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
df_x.shape

(349, 2073)

In [53]:
X = df_x.iloc[:,0:-1].values

In [54]:
X.shape

(349, 2072)

In [55]:
data_y = df_y.iloc[:,-1].values
data_y

array([232.93388628, 206.03032034, 211.0009669 , 157.41185366,
       153.93379512, 198.19361226, 245.96638241, 234.69046523,
       193.40984412, 197.98805159, 184.85868883, 186.96027519,
        91.49419791, 236.58118621, 223.9289436 , 218.00239343,
       146.15236115, 212.38080071, 222.08331658, 215.01014781,
       206.73004644, 253.1469077 , 160.70873446, 269.5215963 ,
       269.92044477, 250.58799879, 197.53619657, 215.639024  ,
       182.86999108, 217.76160106, 197.13290663, 176.20760053,
       131.4425434 , 240.09595868, 178.25102529, 161.94746715,
       258.78422717, 205.49742703, 151.00389432, 151.43657874,
       243.49044901, 159.79172798, 212.22353638, 194.71663188,
       226.90372288, 140.39239335, 212.20896966, 211.90113319,
       211.41572609, 215.94753809, 186.07676945, 139.57171389,
       190.75984252, 234.68600622, 262.87867067, 252.55886029,
       135.92686544, 170.04050973, 168.39668184, 258.04850952,
       193.25911693,  87.43178167, 223.73041806, 251.78

In [56]:
y = []
for i in data_y:
    if i >= 100:
        y.append(1)
    else:
        y.append(0)

In [57]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,


In [58]:
(unique, counts) = np.unique(y, return_counts = True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[  0  22]
 [  1 327]]


In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [60]:
from sklearn.svm import SVC
classifier3 = SVC(kernel = 'linear', random_state = 0)
classifier3.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [62]:
y_pred = classifier3.predict(X_test)

In [63]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 1  4]
 [ 0 65]]


0.9428571428571428

# 4th inhibitor

In [64]:
df = new_dataset[new_dataset['inhibitor']=='Sunitinib']
df.head()

Unnamed: 0,labId,symbol,inhibitor,auc
84,14-00141,C1orf170,Sunitinib,150.335698
179,14-00141,C1orf170,Sunitinib,150.335698
274,14-00141,FLT3,Sunitinib,150.335698
369,14-00141,CA10,Sunitinib,150.335698
464,14-00141,CA10,Sunitinib,150.335698


In [65]:
df_y = df.groupby(['labId'])['auc'].min().reset_index()
df_y

Unnamed: 0,labId,auc
0,11-00261,198.052197
1,12-00069,204.348047
2,12-00127,198.225978
3,12-00362,147.460019
4,12-00371,168.607758
5,13-00098,229.297307
6,13-00106,241.869538
7,13-00118,247.298088
8,13-00149,206.588620
9,13-00157,197.524090


In [66]:
df['new'] = 1
df_x = df.pivot_table(index = ["labId"], columns ="symbol", values = ["new"], fill_value=0)
df_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
symbol,A1CF,A2M,A4GALT,AADAC,AADACL3,AADACL4,AARS2,AASS,ABCA12,ABCA2,...,ZNF98,ZNHIT1,ZNRF4,ZRANB1,ZRSR2,ZSCAN1,ZSCAN2,ZSWIM6,ZXDB,ZZEF1
labId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11-00261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00098,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
13-00106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
df_x.shape

(345, 2072)

In [68]:
X = df_x.iloc[:,0:-1].values

In [69]:
X.shape

(345, 2071)

In [70]:
data_y = df_y.iloc[:,-1].values
data_y

array([198.0521973 , 204.34804672, 198.22597827, 147.46001873,
       168.60775758, 229.2973074 , 241.8695379 , 247.29808836,
       206.58861952, 197.52408966, 201.63222833, 172.61435908,
       155.71409762, 248.64125835, 188.10021226, 224.54992283,
       183.79389561, 226.39417737, 191.84618212, 205.78860699,
       213.65033005, 240.72033775, 151.17061056, 255.99651324,
       246.67568274, 235.94030647, 169.43137399, 220.49187732,
       185.2443103 , 196.95224362, 198.52170616, 184.84744389,
       200.86492821, 241.05374697, 186.54085229, 187.55339679,
       286.19906706, 230.14822642, 188.58035526, 152.23923187,
       200.66357813, 177.69386243, 227.23781162, 218.83366011,
       239.54778099, 130.14223598, 200.02215175, 198.20805762,
       220.29478542, 217.33481219, 186.98056268, 140.19350261,
       210.49883868, 238.96775999, 216.66755763, 227.68891875,
       121.0597174 , 189.55056259, 143.27197476, 267.53891262,
       222.25732336, 182.80941578, 116.78650969, 230.34

In [71]:
y = []
for i in data_y:
    if i >= 100:
        y.append(1)
    else:
        y.append(0)

In [72]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,


In [73]:
(unique, counts) = np.unique(y, return_counts = True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[  0   8]
 [  1 337]]


In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [75]:
from sklearn.svm import SVC
classifier4 = SVC(kernel = 'linear', random_state = 0)
classifier4.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [76]:
y_pred = classifier4.predict(X_test)

In [77]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 0  4]
 [ 0 65]]


0.9420289855072463

# 5th inhibitor

In [78]:
df = new_dataset[new_dataset['inhibitor']=='Dasatinib']
df.head()

Unnamed: 0,labId,symbol,inhibitor,auc
20,14-00141,C1orf170,Dasatinib,181.911892
115,14-00141,C1orf170,Dasatinib,181.911892
210,14-00141,FLT3,Dasatinib,181.911892
305,14-00141,CA10,Dasatinib,181.911892
400,14-00141,CA10,Dasatinib,181.911892


In [79]:
df_y = df.groupby(['labId'])['auc'].min().reset_index()
df_y

Unnamed: 0,labId,auc
0,11-00261,213.583116
1,12-00069,148.474971
2,12-00127,268.998415
3,12-00362,120.172111
4,12-00371,206.772195
5,13-00098,245.842224
6,13-00106,249.608649
7,13-00118,242.830158
8,13-00149,216.692748
9,13-00157,159.381136


In [80]:
df['new'] = 1
df_x = df.pivot_table(index = ["labId"], columns ="symbol", values = ["new"], fill_value=0)
df_x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
symbol,A1CF,A2M,AADAC,AADACL3,AADACL4,AARS2,AASS,ABCA12,ABCA2,ABCA9,...,ZNF98,ZNHIT1,ZNRF4,ZRANB1,ZRSR2,ZSCAN1,ZSCAN2,ZSWIM6,ZXDB,ZZEF1
labId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
11-00261,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-00371,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00098,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
13-00106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13-00157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
df_x.shape

(344, 2047)

In [82]:
X = df_x.iloc[:,0:-1].values

In [83]:
X.shape

(344, 2046)

In [84]:
data_y = df_y.iloc[:,-1].values
data_y

array([213.58311607, 148.47497123, 268.9984151 , 120.17211108,
       206.77219508, 245.84222402, 249.60864932, 242.83015754,
       216.69274816, 159.38113599, 177.81382366, 158.27486807,
       183.96434635, 228.31605974, 155.01020075, 119.30651124,
       240.1312444 , 161.71637233, 150.42177453, 217.93525741,
       118.45470638, 216.1406037 , 251.13217162, 279.40501794,
       257.14317667, 204.515559  , 213.27514713, 147.41798999,
       117.35062885, 197.4236743 , 170.17836779, 172.55979203,
       148.38659466, 229.49317003, 206.24635962, 257.24176677,
       279.63861825, 178.61730296, 112.28165006,  96.85661134,
       232.08210804, 210.7191234 , 133.36119755, 187.46564331,
       234.22948982, 124.17466586, 106.98530908, 101.63121209,
       181.42055027, 209.67458147, 233.40732864, 105.27454005,
       180.81560073, 249.86669862, 215.16927835, 233.34181356,
       137.863735  , 185.58318051, 131.85576205, 221.90499298,
       242.35667316, 267.05177475, 200.90089749, 187.84

In [85]:
y = []
for i in data_y:
    if i >= 100:
        y.append(1)
    else:
        y.append(0)

In [86]:
y

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [87]:
(unique, counts) = np.unique(y, return_counts = True)
frequencies = np.asarray((unique, counts)).T
print(frequencies)

[[  0  41]
 [  1 303]]


In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [89]:
from sklearn.svm import SVC
classifier5 = SVC(kernel = 'linear', random_state = 0)
classifier5.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [90]:
y_pred = classifier5.predict(X_test)

In [91]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[ 0  5]
 [ 3 61]]


0.8840579710144928