In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB


%matplotlib inline

In [2]:
## Convert to NumPy array for SVM features
## (Disabled helper: not called anywhere in the visible notebook; kept for reference.)
#def toNumpyArray(sequence_bagged):
#    length = max(map(len, sequence_bagged))
#    return np.array([words+['space']*(length-len(words)) for words in sequence_bagged])

In [3]:
# Load the pre-tokenised ("bagged") sequence files. Both CSVs store the
# sequence text under a column literally named '0'; give it a readable name.
# `rename(columns=...)` returns a new frame, so nothing is mutated in place and
# the cell yields the same result every time it is re-run.
trainmain_df = (
    pd.read_csv('./data2/train_data_bagged.csv')
    .rename(columns={'0': 'sequence'})
)
testmain_df = (
    pd.read_csv('./data2/test_data_bagged.csv')
    .rename(columns={'0': 'sequence'})
)

# `rename` silently ignores keys that are absent, so fail here -- with a clear
# message -- rather than with a KeyError several cells later.
assert {'sequence', 'label'} <= set(trainmain_df.columns), \
    "train CSV must provide a '0' (sequence) column and a 'label' column"
assert 'sequence' in testmain_df.columns, \
    "test CSV must provide a '0' (sequence) column"

testmain_df.columns

Index(['sequence'], dtype='object')

In [4]:
# Display the unlabeled test frame; pandas renders a truncated preview
# (first/last rows) rather than every row.
testmain_df

Unnamed: 0,sequence
0,AATC ATCT TCTC CTCT TCTG CTGT TGTT GTTG TTGT T...
1,TTCT TCTC CTCT TCTA CTAA TAAG AAGA AGAA GAAA A...
2,ACAT CATT ATTT TTTT TTTA TTAC TACG ACGG CGGT G...
3,CACT ACTG CTGA TGAA GAAA AAAA AAAA AAAA AAAA A...
4,GGGA GGAG GAGT AGTG GTGG TGGC GGCA GCAA CAAC A...
...,...
173,TAAA AAAT AATT ATTG TTGC TGCA GCAT CATG ATGT T...
174,GGAT GATG ATGT TGTG GTGC TGCT GCTT CTTA TTAG T...
175,ATTC TTCA TCAG CAGA AGAA GAAA AAAT AATT ATTA T...
176,ACGG CGGT GGTT GTTT TTTT TTTA TTAA TAAG AAGT A...


In [5]:
# Inspect the raw training sequences as a NumPy array of strings; each entry is
# one document made of space-separated tokens.
trainmain_df['sequence'].values

array(['TTAA TAAT AATT ATTT TTTG TTGT TGTC GTCC TCCT CCTT CTTA TTAT TATT ATTT TTTG TTGA TGAT GATT ATTA TTAA TAAG AAGA AGAA GAAG AAGA AGAA GAAT AATA ATAA TAAA AAAT AATC ATCT TCTT CTTA TTAT TATA ATAT TATA ATAT TATA ATAG TAGA AGAT GATT ATTT TTTA TTAC TACA ACAA CAAT AATC ATCT TCTA CTAT TATC ATCG TCGC CGCC GCCT CCTA CTAA TAAA AAAC AACT ACTT CTTC TTCA TCAG CAGC AGCC GCCA CCAC CACT ACTT CTTA TTAA TAAT AATC ATCA TCAA CAAT AATA ATAA TAAT AATC ATCG TCGC CGCG GCGA CGAC GACA ACAA CAAT AATG ATGA TGAT GATT ATTA TTAT TATT ATTT TTTT TTTC TTCT TCTA CTAC TACA ACAA CAAA AAAT AATC ATCA TCAT CATA ATAA TAAA AAAG AAGA AGAT GATA ATAT TATT ATTG TTGG TGGA GGAA GAAC AACT ACTT CTTT TTTA TTAT TATA ATAT TATT ATTT TTTT TTTA TTAT TATT ATTT TTTT TTTT TTTG TTGG TGGA GGAG GAGC AGCT GCTT CTTG TTGA TGAG GAGC AGCT GCTG CTGG TGGA GGAA GAAT AATA ATAG TAGT AGTT GTTG TTGG TGGA GGAA GAAC AACA ACAT CATC ATCT TCTT CTTT TTTA TTAA TAAG AAGA AGAA GAAT AATT ATTT TTTT TTTA TTAA TAAT AATT ATTC TTCG TCGA CGAG GAGC AGCT GCTG CTGA TGAA GA

In [6]:
# Pull the model inputs (sequence strings) and targets (labels) out of the
# training frame as plain NumPy arrays, in that order.
X, y = (trainmain_df[column].values for column in ('sequence', 'label'))

In [7]:
# Hold out 25% of the labeled data for validation. The fixed random_state makes
# the shuffle -- and therefore every score below -- reproducible.
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [8]:
# Sanity check: look at the first training document after the split.
X_train[0]

'ATGC TGCA GCAA CAAC AACC ACCA CCAG CAGC AGCC GCCA CCAA CAAT AATA ATAC TACA ACAC CACT ACTT CTTT TTTT TTTA TTAG TAGA AGAG GAGG AGGC GGCT GCTG CTGG TGGT GGTA GTAC TACT ACTG CTGA TGAT GATC ATCG TCGA CGAA GAAA AAAA AAAC AACT ACTC CTCG TCGA CGAA GAAA AAAC AACC ACCA CCAC CACT ACTT CTTG TTGA TGAC GACA ACAC CACT ACTC CTCG TCGA CGAA GAAG AAGT AGTA GTAA TAAG AAGT AGTT GTTA TTAA TAAT AATA ATAT TATT ATTC TTCC TCCT CCTC CTCT TCTA CTAA TAAA AAAT AATA ATAG TAGG AGGT GGTG GTGA TGAA GAAA AAAT AATA ATAC TACA ACAA CAAA AAAT AATT ATTC TTCG TCGT CGTA GTAG TAGC AGCA GCAA CAAA AAAC AACT ACTG CTGC TGCG GCGG CGGC GGCC GCCT CCTT CTTA TTAA TAAA AAAT AATC ATCG TCGA CGAA GAAT AATA ATAG TAGC AGCC GCCG CCGG CGGA GGAA GAAG AAGT AGTG GTGG TGGA GGAA GAAC AACA ACAT CATG ATGA TGAA GAAC AACT ACTT CTTC TTCA TCAA CAAT AATG ATGT TGTT GTTA TTAT TATG ATGA TGAA GAAC AACG ACGT CGTC GTCG TCGC CGCT GCTT CTTA TTAC TACC ACCA CCAC CACC ACCT CCTG CTGC TGCA GCAA CAAG AAGC AGCC GCCC CCCT CCTA CTAA TAAA AAAA AAAT AATA ATAT TATC ATCG TCGA

In [9]:
# TF-IDF over the whitespace-separated sequence tokens:
#   ngram_range=(1, 3) -> features are runs of 1 to 3 consecutive tokens
#   min_df=0.15        -> drop terms present in fewer than 15% of the documents
#   max_features=128   -> keep only the 128 most frequent remaining terms
# All other settings are left at their defaults.
tfid = TfidfVectorizer(ngram_range=(1,3),min_df=0.15,max_features=128)

In [10]:
# Learn the vocabulary and IDF weights from the training split only, so the
# held-out split does not influence the features.
tfid.fit(X_train)

TfidfVectorizer(max_features=128, min_df=0.15, ngram_range=(1, 3))

In [11]:
# Vectorise the training split with the fitted TF-IDF model (sparse matrix).
X_train_tok = tfid.transform(X_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and only fall back on older installations.
feature_names = (tfid.get_feature_names_out()
                 if hasattr(tfid, 'get_feature_names_out')
                 else tfid.get_feature_names())

# Dense, labeled copy purely for inspection; the model is trained on the sparse
# matrix above.
X_train_df = pd.DataFrame(X_train_tok.toarray(), columns=feature_names)
X_train_df

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaat,aaac,aaag,aaat,aaat aata,aaat aatt,aaca,aacg,...,ttgc,ttgg,ttgt,ttta,ttta ttaa,tttc,tttg,tttt,tttt ttta,tttt tttt
0,0.138188,0.046838,0.085271,0.146360,0.037448,0.202016,0.182372,0.045773,0.037662,0.046087,...,0.040665,0.041820,0.000000,0.072875,0.000000,0.036167,0.071326,0.070572,0.048781,0.000000
1,0.137891,0.037390,0.102106,0.058418,0.059789,0.107511,0.072792,0.036540,0.030065,0.036790,...,0.032462,0.066768,0.000000,0.116350,0.000000,0.028872,0.085408,0.450691,0.077883,0.464909
2,0.069059,0.000000,0.085228,0.073142,0.000000,0.179477,0.091139,0.091500,0.050190,0.000000,...,0.000000,0.055731,0.051399,0.024279,0.000000,0.096397,0.071290,0.141072,0.000000,0.129353
3,0.000000,0.000000,0.000000,0.111185,0.075862,0.068207,0.000000,0.046363,0.038147,0.140043,...,0.164755,0.042359,0.312527,0.147629,0.098859,0.183167,0.108368,0.214445,0.000000,0.000000
4,0.367966,0.307002,0.139729,0.089937,0.061364,0.193101,0.074711,0.150012,0.030857,0.000000,...,0.066635,0.034264,0.063200,0.179124,0.119950,0.059265,0.087658,0.115642,0.039968,0.079527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16945,0.034975,0.000000,0.000000,0.037043,0.037912,0.068172,0.092315,0.000000,0.076256,0.000000,...,0.000000,0.000000,0.078092,0.184443,0.049405,0.036615,0.144418,0.321505,0.049385,0.245665
16946,0.026908,0.000000,0.000000,0.085499,0.058336,0.052450,0.035512,0.000000,0.000000,0.071794,...,0.000000,0.097719,0.090123,0.227048,0.190052,0.084511,0.166667,0.357292,0.113986,0.189007
16947,0.130272,0.044155,0.080387,0.034494,0.035303,0.158703,0.000000,0.129454,0.106514,0.000000,...,0.000000,0.039424,0.072719,0.068701,0.000000,0.102286,0.033620,0.166323,0.091974,0.091504
16948,0.128650,0.058140,0.052924,0.000000,0.046485,0.083588,0.000000,0.000000,0.000000,0.000000,...,0.151431,0.103822,0.000000,0.000000,0.000000,0.000000,0.088538,0.000000,0.000000,0.000000


In [12]:
# Vectorise the held-out validation split with the TF-IDF model that was fitted
# on the training split (transform only -- no refit).
X_test_tok = tfid.transform(X_test)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and only fall back on older installations.
feature_names = (tfid.get_feature_names_out()
                 if hasattr(tfid, 'get_feature_names_out')
                 else tfid.get_feature_names())

# Dense, labeled copy purely for inspection.
X_test_df = pd.DataFrame(X_test_tok.toarray(), columns=feature_names)
X_test_df

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaat,aaac,aaag,aaat,aaat aata,aaat aatt,aaca,aacg,...,ttgc,ttgg,ttgt,ttta,ttta ttaa,tttc,tttg,tttt,tttt ttta,tttt tttt
0,0.218874,0.127176,0.077177,0.099350,0.000000,0.213314,0.165061,0.082857,0.136348,0.041712,...,0.036805,0.037850,0.104723,0.131916,0.000000,0.065468,0.096834,0.159683,0.044151,0.043925
1,0.000000,0.000000,0.000000,0.000000,0.122214,0.082411,0.000000,0.000000,0.030728,0.000000,...,0.232244,0.068240,0.031468,0.059458,0.000000,0.000000,0.087291,0.028789,0.000000,0.000000
2,0.309852,0.190950,0.104291,0.089502,0.000000,0.247074,0.223049,0.074644,0.061416,0.075155,...,0.066313,0.068196,0.062895,0.089130,0.039790,0.058979,0.087235,0.143855,0.079549,0.039571
3,0.145915,0.049457,0.000000,0.038636,0.237253,0.142208,0.144427,0.000000,0.000000,0.145992,...,0.042938,0.044158,0.081451,0.153900,0.103059,0.038190,0.000000,0.111777,0.103018,0.051246
4,0.116122,0.104957,0.000000,0.081993,0.125874,0.113172,0.000000,0.051286,0.168789,0.154911,...,0.091123,0.000000,0.129640,0.163302,0.164032,0.040523,0.079916,0.158141,0.054656,0.108753
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5645,0.193258,0.043669,0.039751,0.170572,0.069829,0.188348,0.042508,0.085353,0.210683,0.171875,...,0.037913,0.116971,0.107878,0.101917,0.045499,0.000000,0.133001,0.131595,0.000000,0.090497
5646,0.059349,0.040232,0.000000,0.094287,0.032166,0.057841,0.039162,0.039317,0.032350,0.118760,...,0.034929,0.107763,0.099386,0.093895,0.000000,0.093198,0.153165,0.242473,0.083802,0.083374
5647,0.213485,0.124045,0.112916,0.032301,0.198354,0.208061,0.120748,0.040408,0.066495,0.000000,...,0.035898,0.000000,0.034048,0.096501,0.043081,0.031928,0.062966,0.124601,0.043064,0.042844
5648,0.178973,0.000000,0.088351,0.113734,0.116402,0.069771,0.094479,0.000000,0.156089,0.047751,...,0.126399,0.043330,0.079923,0.151014,0.101126,0.074947,0.073902,0.219362,0.101086,0.150855


In [13]:
# Vectorise the unlabeled test sequences with the same fitted TF-IDF model so
# they share the feature space the classifier is trained on.
testmain_df_tok = tfid.transform(testmain_df['sequence'].values)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and only fall back on older installations.
feature_names = (tfid.get_feature_names_out()
                 if hasattr(tfid, 'get_feature_names_out')
                 else tfid.get_feature_names())

# Dense, labeled copy purely for inspection.
testmain_df2 = pd.DataFrame(testmain_df_tok.toarray(), columns=feature_names)
testmain_df2

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaat,aaac,aaag,aaat,aaat aata,aaat aatt,aaca,aacg,...,ttgc,ttgg,ttgt,ttta,ttta ttaa,tttc,tttg,tttt,tttt ttta,tttt tttt
0,0.157073,0.106478,0.032308,0.027727,0.085132,0.178596,0.103648,0.104058,0.057079,0.034923,...,0.030815,0.000000,0.116906,0.165669,0.110940,0.082220,0.054049,0.106955,0.110896,0.000000
1,0.042752,0.000000,0.052761,0.045280,0.139026,0.124997,0.000000,0.113288,0.046606,0.057032,...,0.050322,0.000000,0.000000,0.045091,0.000000,0.044757,0.044133,0.174665,0.060367,0.120117
2,0.102506,0.046325,0.042169,0.180946,0.111114,0.166502,0.090187,0.135816,0.111748,0.000000,...,0.160876,0.000000,0.152585,0.180192,0.048266,0.178855,0.070545,0.104698,0.096494,0.000000
3,0.326406,0.316095,0.028773,0.074080,0.101091,0.159057,0.061539,0.154455,0.025417,0.000000,...,0.000000,0.028223,0.052058,0.049181,0.032934,0.024408,0.072204,0.142881,0.000000,0.131012
4,0.163905,0.111109,0.050570,0.000000,0.044417,0.039935,0.054078,0.000000,0.044671,0.218655,...,0.000000,0.049602,0.000000,0.043219,0.057882,0.214490,0.000000,0.083705,0.057860,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.164302,0.000000,0.101385,0.000000,0.133575,0.120096,0.000000,0.054423,0.000000,0.273980,...,0.096698,0.049722,0.000000,0.000000,0.000000,0.000000,0.042402,0.000000,0.000000,0.000000
174,0.158234,0.085812,0.078113,0.067037,0.068609,0.123371,0.083531,0.041931,0.034500,0.042218,...,0.074502,0.038309,0.070662,0.033379,0.000000,0.132524,0.130677,0.096971,0.000000,0.000000
175,0.304167,0.149957,0.170629,0.117147,0.000000,0.215592,0.072986,0.146549,0.060290,0.073776,...,0.097644,0.033473,0.000000,0.174989,0.039060,0.086845,0.114180,0.141215,0.039045,0.116536
176,0.000000,0.000000,0.000000,0.030058,0.000000,0.082977,0.037454,0.075205,0.000000,0.037860,...,0.033406,0.034354,0.126736,0.179599,0.080179,0.059422,0.087891,0.318859,0.120221,0.199344


In [14]:
# Polynomial-kernel SVM. `C` is the (numeric) regularisation strength; it was
# previously passed as the string '10', which scikit-learn's parameter
# validation rejects in recent releases -- pass the number instead.
svc = SVC(kernel='poly', C=10)

# Fit on the full training split; this fitted instance is what later cells use
# for prediction.
svc.fit(X_train_tok, y_train)

# 5-fold cross-validated accuracy on the training split. `cross_val_score`
# works on clones of the estimator, so it does not disturb the fit above.
# NOTE(review): the TF-IDF model was fitted on all of X_train before the folds
# were made, so IDF weights are shared across folds -- wrap the vectoriser and
# the SVC in a Pipeline if a strictly leak-free estimate is needed.
cross_val_score(svc, X_train_tok, y_train, cv=5).mean()

0.7397050147492625

In [15]:
# Predict labels for the held-out validation split.
y_pred = svc.predict(X_test_tok)

In [16]:
# Fraction of validation samples whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.7417699115044247

In [44]:
# Predict labels for the unlabeled test set (already TF-IDF transformed).
y_pred_test = svc.predict(testmain_df_tok)

In [45]:
# Wrap the predictions in a single-column DataFrame.
# NOTE(review): `results_SVM` is not referenced again in the visible cells --
# confirm whether it is still needed.
results_SVM = pd.DataFrame(y_pred_test)

In [48]:
# Re-read the test CSV with its original column names so the exported file
# keeps the source layout (the working copy had its column renamed).
original_SVM = pd.read_csv('./data2/test_data_bagged.csv')

In [49]:
# Attach predictions positionally: this relies on the rows being in the same
# order as the frame the predictions were computed from (same source file).
original_SVM['predictions'] = y_pred_test

In [50]:
# Write sequences plus predictions to disk without the pandas row index.
# NOTE(review): inputs are read from ./data2 but this writes to ./data --
# confirm the output directory is intended.
original_SVM.to_csv('./data/test2.csv', index=False)

In [52]:
# Class balance of the predicted labels on the unlabeled test set.
original_SVM['predictions'].value_counts()

1    93
0    85
Name: predictions, dtype: int64