In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


%matplotlib inline

In [198]:
## Convert to NumpyArray for SVM features
#def toNumpyArray(sequence_bagged):
#    length = max(map(len, sequence_bagged))
#    return np.array([words+['space']*(length-len(words)) for words in sequence_bagged])

In [199]:
# Load the labelled training frame and the test frame of tokenised sequences.
# The CSVs store the sequence text under a column literally named '0', so it is
# renamed to 'sequence'. `rename(columns=...)` returns a new frame, which keeps
# the cell idempotent on re-run (no in-place mutation), and the former no-op
# 'label' -> 'label' mapping is dropped.
trainmain_df = pd.read_csv('./data3/train_data_bagged.csv').rename(columns={'0': 'sequence'})
testmain_df = pd.read_csv('./data3/test_data_bagged.csv').rename(columns={'0': 'sequence'})
testmain_df.columns

Index(['sequence'], dtype='object')

In [200]:
# Inspect the test frame after the column rename (single 'sequence' column).
testmain_df

Unnamed: 0,sequence
0,AATC ATCT TCTC CTCT TCTG CTGT TGTT GTTG TTGT T...
1,TTCT TCTC CTCT TCTA CTAA TAAG AAGA AGAA GAAA A...
2,ACAT CATT ATTT TTTT TTTA TTAC TACG ACGG CGGT G...
3,CACT ACTG CTGA TGAA GAAA AAAA AAAA AAAA AAAA A...
4,GGGA GGAG GAGT AGTG GTGG TGGC GGCA GCAA CAAC A...
...,...
173,TAAA AAAT AATT ATTG TTGC TGCA GCAT CATG ATGT T...
174,GGAT GATG ATGT TGTG GTGC TGCT GCTT CTTA TTAG T...
175,ATTC TTCA TCAG CAGA AGAA GAAA AAAT AATT ATTA T...
176,ACGG CGGT GGTT GTTT TTTT TTTA TTAA TAAG AAGT A...


In [201]:
# Peek at the raw training sequences as a NumPy array; each entry is one string
# of space-separated 4-letter tokens.
trainmain_df['sequence'].values

array(['TTAA TAAT AATT ATTT TTTG TTGT TGTC GTCC TCCT CCTT CTTA TTAT TATT ATTT TTTG TTGA TGAT GATT ATTA TTAA TAAG AAGA AGAA GAAG AAGA AGAA GAAT AATA ATAA TAAA AAAT AATC ATCT TCTT CTTA TTAT TATA ATAT TATA ATAT TATA ATAG TAGA AGAT GATT ATTT TTTA TTAC TACA ACAA CAAT AATC ATCT TCTA CTAT TATC ATCG TCGC CGCC GCCT CCTA CTAA TAAA AAAC AACT ACTT CTTC TTCA TCAG CAGC AGCC GCCA CCAC CACT ACTT CTTA TTAA TAAT AATC ATCA TCAA CAAT AATA ATAA TAAT AATC ATCG TCGC CGCG GCGA CGAC GACA ACAA CAAT AATG ATGA TGAT GATT ATTA TTAT TATT ATTT TTTT TTTC TTCT TCTA CTAC TACA ACAA CAAA AAAT AATC ATCA TCAT CATA ATAA TAAA AAAG AAGA AGAT GATA ATAT TATT ATTG TTGG TGGA GGAA GAAC AACT ACTT CTTT TTTA TTAT TATA ATAT TATT ATTT TTTT TTTA TTAT TATT ATTT TTTT TTTT TTTG TTGG TGGA GGAG GAGC AGCT GCTT CTTG TTGA TGAG GAGC AGCT GCTG CTGG TGGA GGAA GAAT AATA ATAG TAGT AGTT GTTG TTGG TGGA GGAA GAAC AACA ACAT CATC ATCT TCTT CTTT TTTA TTAA TAAG AAGA AGAA GAAT AATT ATTT TTTT TTTA TTAA TAAT AATT ATTC TTCG TCGA CGAG GAGC AGCT GCTG CTGA TGAA GA

In [202]:
# Pull the model inputs (raw sequence strings) and the targets out of the
# labelled frame as NumPy arrays.
X, y = (trainmain_df[column].values for column in ('sequence', 'label'))

In [203]:
# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [204]:
# First training example after the split: one space-delimited string of tokens.
X_train[0]

'GGTG GTGG TGGA GGAG GAGG AGGA GGAA GAAA AAAT AATC ATCC TCCG CCGC CGCA GCAA CAAG AAGA AGAG GAGA AGAG GAGA AGAG GAGC AGCG GCGC CGCT GCTA CTAG TAGA AGAG GAGA AGAG GAGC AGCG GCGG CGGA GGAT GATT ATTG TTGG TGGA GGAA GAAC AACT ACTC CTCG TCGG CGGT GGTT GTTT TTTC TTCG TCGG CGGC GGCC GCCA CCAA CAAA AAAG AAGC AGCC GCCA CCAA CAAA AAAG AAGC AGCA GCAG CAGA AGAG GAGC AGCC GCCA CCAG CAGC AGCA GCAG CAGC AGCC GCCA CCAG CAGT AGTT GTTT TTTT TTTT TTTG TTGC TGCT GCTT CTTT TTTT TTTT TTTA TTAG TAGT AGTC GTCG TCGA CGAT GATT ATTT TTTG TTGT TGTA GTAT TATC ATCT TCTA CTAC TACC ACCT CCTT CTTT TTTG TTGG TGGT GGTG GTGC TGCG GCGG CGGA GGAC GACG ACGG CGGT GGTT GTTG TTGG TGGT GGTC GTCG TCGG CGGA GGAA GAAT AATA ATAA TAAA AAAC AACG ACGC CGCG GCGT CGTT GTTT TTTC TTCG TCGC CGCA GCAG CAGC AGCG GCGG CGGA GGAA GAAC AACT ACTC CTCC TCCA CCAA CAAG AAGA AGAA GAAG AAGA AGAG GAGC AGCA GCAG CAGA AGAA GAAA AAAA AAAC AACC ACCA CCAG CAGT AGTC GTCT TCTT CTTT TTTA TTAA TAAT AATT ATTG TTGT TGTT GTTC TTCC TCCA CCAT CATT ATTT TTTC TTCA TCAA

In [205]:
# TF-IDF over 1- to 3-token n-grams. A term is kept only if it occurs in at
# least 20% and at most 70% of the documents, and the vocabulary is capped at
# the 200 most frequent surviving terms.
tfid = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=0.20,
    max_df=0.70,
    max_features=200,
)

In [206]:
# Fit the vectorizer on the training split only, so the hold-out and test data
# are transformed with a vocabulary that never saw them.
tfid.fit(X_train)

TfidfVectorizer(max_df=0.7, max_features=200, min_df=0.2, ngram_range=(1, 3))

In [207]:
# Vectorise the training split with the already-fitted TF-IDF model.
X_train_tok = tfid.transform(X_train)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back on older installs.
if hasattr(tfid, 'get_feature_names_out'):
    train_feature_names = tfid.get_feature_names_out()
else:
    train_feature_names = tfid.get_feature_names()

# Dense, labelled copy of the sparse matrix for visual inspection.
X_train_df = pd.DataFrame(X_train_tok.toarray(),
                          columns=train_feature_names)
X_train_df

Unnamed: 0,aaaa aaaa,aaaa aaaa aaaa,aaaa aaac,aaaa aaag,aaac aaca,aaat aata,aaat aatg,aaat aatt,aaca acaa,aacc,...,ttcc,ttgt tgtt,ttta ttaa,ttta ttat,tttg ttgt,tttt ttta,tttt tttc,tttt tttg,tttt tttt,tttt tttt tttt
0,0.000000,0.0,0.048682,0.100940,0.000000,0.043860,0.000000,0.043863,0.000000,0.045967,...,0.042890,0.053150,0.046891,0.000000,0.051794,0.046808,0.000000,0.047928,0.093020,0.000000
1,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.051882,0.090009,0.157359,0.094326,...,0.000000,0.109066,0.144331,0.203661,0.053142,0.096051,0.000000,0.049175,0.143159,0.071531
2,0.144780,0.0,0.052307,0.054228,0.053199,0.141378,0.108664,0.141388,0.109859,0.000000,...,0.000000,0.057108,0.151145,0.106638,0.055651,0.050293,0.052265,0.000000,0.049973,0.000000
3,0.091230,0.0,0.098880,0.051256,0.100567,0.089087,0.051354,0.222733,0.051919,0.000000,...,0.000000,0.000000,0.047621,0.000000,0.052601,0.000000,0.000000,0.146023,0.141703,0.141606
4,0.080765,0.0,0.131306,0.045376,0.089031,0.000000,0.000000,0.039436,0.000000,0.000000,...,0.038561,0.095572,0.252947,0.133847,0.186267,0.210419,0.087468,0.215454,0.292710,0.125361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15815,0.000000,0.0,0.095707,0.000000,0.048670,0.086228,0.000000,0.000000,0.100507,0.000000,...,0.000000,0.000000,0.046093,0.000000,0.050913,0.046012,0.047816,0.094225,0.228593,0.137062
15816,0.000000,0.0,0.000000,0.054758,0.000000,0.047587,0.000000,0.000000,0.000000,0.049872,...,0.046533,0.057666,0.254370,0.053840,0.112389,0.152354,0.105552,0.155999,0.252306,0.000000
15817,0.049825,0.0,0.000000,0.055986,0.000000,0.000000,0.056094,0.145973,0.056711,0.050991,...,0.000000,0.058959,0.000000,0.055048,0.000000,0.103848,0.053960,0.000000,0.103187,0.000000
15818,0.048017,0.0,0.000000,0.053955,0.000000,0.000000,0.054058,0.000000,0.000000,0.049141,...,0.137554,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [208]:
# Vectorise the hold-out split with the vocabulary learned from the training data.
X_test_tok = tfid.transform(X_test)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back on older installs.
if hasattr(tfid, 'get_feature_names_out'):
    holdout_feature_names = tfid.get_feature_names_out()
else:
    holdout_feature_names = tfid.get_feature_names()

# Dense, labelled copy of the sparse matrix for visual inspection.
X_test_df = pd.DataFrame(X_test_tok.toarray(),
                         columns=holdout_feature_names)
X_test_df

Unnamed: 0,aaaa aaaa,aaaa aaaa aaaa,aaaa aaac,aaaa aaag,aaac aaca,aaat aata,aaat aatg,aaat aatt,aaca acaa,aacc,...,ttcc,ttgt tgtt,ttta ttaa,ttta ttat,tttg ttgt,tttt ttta,tttt tttc,tttt tttg,tttt tttt,tttt tttt tttt
0,0.141670,0.000000,0.102367,0.000000,0.104113,0.184456,0.053165,0.092234,0.053750,0.000000,...,0.045093,0.000000,0.000000,0.104348,0.054455,0.049213,0.051143,0.100781,0.048900,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.077181,0.000000,0.000000,0.035080,...,0.000000,0.040562,0.000000,0.000000,0.039527,0.000000,0.000000,0.036577,0.000000,0.000000
2,0.220339,0.066145,0.143289,0.000000,0.048578,0.258195,0.000000,0.086071,0.050158,0.000000,...,0.042080,0.104294,0.046005,0.048688,0.050817,0.091849,0.047725,0.047023,0.045632,0.000000
3,0.049758,0.000000,0.000000,0.167733,0.000000,0.145767,0.000000,0.000000,0.000000,0.000000,...,0.095027,0.058880,0.103891,0.000000,0.000000,0.103709,0.000000,0.000000,0.051524,0.000000
4,0.094430,0.070869,0.051174,0.000000,0.000000,0.000000,0.000000,0.046109,0.161220,0.000000,...,0.045085,0.111742,0.147872,0.052164,0.054445,0.049204,0.000000,0.050381,0.097781,0.073286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6775,0.000000,0.000000,0.000000,0.056916,0.000000,0.098925,0.000000,0.049466,0.000000,0.103676,...,0.241839,0.059939,0.000000,0.111925,0.058410,0.000000,0.164569,0.000000,0.000000,0.000000
6776,0.050613,0.000000,0.054857,0.056872,0.055793,0.098848,0.113962,0.000000,0.057608,0.051798,...,0.000000,0.000000,0.000000,0.111837,0.000000,0.000000,0.000000,0.054007,0.000000,0.000000
6777,0.257997,0.258166,0.000000,0.048317,0.000000,0.083978,0.000000,0.041992,0.000000,0.044006,...,0.000000,0.101765,0.044890,0.000000,0.049585,0.044811,0.093136,0.091767,0.044526,0.000000
6778,0.051248,0.000000,0.111091,0.057586,0.112986,0.000000,0.000000,0.050048,0.058331,0.000000,...,0.097873,0.000000,0.053501,0.000000,0.000000,0.000000,0.000000,0.109370,0.000000,0.000000


In [209]:
# Vectorise the test-file sequences with the same fitted TF-IDF model.
testmain_df_tok = tfid.transform(testmain_df['sequence'].values)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# use `get_feature_names_out()` when it exists and fall back on older installs.
if hasattr(tfid, 'get_feature_names_out'):
    testmain_feature_names = tfid.get_feature_names_out()
else:
    testmain_feature_names = tfid.get_feature_names()

# Dense, labelled copy of the sparse matrix for visual inspection.
testmain_df2 = pd.DataFrame(testmain_df_tok.toarray(),
                            columns=testmain_feature_names)
testmain_df2

Unnamed: 0,aaaa aaaa,aaaa aaaa aaaa,aaaa aaac,aaaa aaag,aaac aaca,aaat aata,aaat aatg,aaat aatt,aaca acaa,aacc,...,ttcc,ttgt tgtt,ttta ttaa,ttta ttat,tttg ttgt,tttt ttta,tttt tttc,tttt tttg,tttt tttt,tttt tttt tttt
0,0.136393,0.068241,0.000000,0.102173,0.000000,0.133189,0.051184,0.133198,0.051748,0.000000,...,0.043414,0.161398,0.142390,0.050230,0.000000,0.142139,0.000000,0.048513,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055263,0.095874,0.055871,0.150708,...,0.046873,0.000000,0.000000,0.054233,0.000000,0.051155,0.053161,0.000000,0.101659,0.000000
2,0.052312,0.000000,0.000000,0.058781,0.057666,0.102167,0.000000,0.153260,0.119084,0.000000,...,0.049953,0.185709,0.054612,0.115593,0.120647,0.109033,0.056654,0.000000,0.000000,0.000000
3,0.363331,0.381749,0.078760,0.040826,0.000000,0.070959,0.000000,0.177410,0.041354,0.074367,...,0.034694,0.042994,0.037931,0.000000,0.083795,0.000000,0.039349,0.038770,0.150491,0.169187
4,0.089602,0.000000,0.000000,0.050341,0.000000,0.043749,0.000000,0.000000,0.000000,0.000000,...,0.128341,0.000000,0.046771,0.000000,0.000000,0.046689,0.048520,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.000000,0.000000,0.000000,0.100450,0.000000,0.000000,0.050321,0.043650,0.000000,0.045744,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
174,0.102139,0.000000,0.055352,0.000000,0.000000,0.099739,0.000000,0.049873,0.000000,0.052265,...,0.146298,0.060432,0.000000,0.056423,0.117781,0.000000,0.110616,0.054494,0.000000,0.000000
175,0.171376,0.128616,0.092873,0.000000,0.047229,0.083675,0.096469,0.167361,0.048765,0.000000,...,0.040911,0.000000,0.044728,0.189341,0.000000,0.044649,0.000000,0.045717,0.133094,0.066501
176,0.000000,0.000000,0.000000,0.000000,0.000000,0.041630,0.000000,0.083266,0.000000,0.000000,...,0.000000,0.050448,0.089012,0.188404,0.098321,0.133284,0.046170,0.090982,0.220725,0.198517


In [210]:
# Baseline: AdaBoost with library defaults, scored (mean accuracy) on the
# hold-out split. The seed matches the one used for train_test_split so the
# reported number is repeatable; without it the tree-based weak learners may
# break split ties differently from run to run.
ada = AdaBoostClassifier(random_state=42)
ada.fit(X_train_tok, y_train)
ada.score(X_test_tok, y_test)

0.7502949852507375

In [None]:
# AdaBoost's weak-learner argument was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2, and the old name was removed in 1.4.
# Pick whichever name the installed version exposes so this cell runs on both.
if 'estimator' in AdaBoostClassifier().get_params():
    weak_learner_arg = 'estimator'
else:
    weak_learner_arg = 'base_estimator'

# Grid over the boosted ensemble; nested keys address the weak learner's
# own hyper-parameters through the `<arg>__<param>` convention.
ada_param = {
    'n_estimators': [300],
    weak_learner_arg + '__max_depth': [1],
}

# random_state on the ensemble also seeds the weak learners it builds, which
# makes the cross-validated scores repeatable.
gs = GridSearchCV(
    AdaBoostClassifier(random_state=42,
                       **{weak_learner_arg: DecisionTreeClassifier()}),
    param_grid=ada_param,
    cv=7,
    verbose=1,
)

gs.fit(X_train_tok, y_train)
best_model = gs.best_estimator_
# Training-set accuracy of the refit winner (compare with the hold-out score
# to gauge over-fitting).
print(best_model.score(X_train_tok, y_train))

In [None]:
# Mean accuracy of the grid-search winner on the hold-out split.
print(best_model.score(X_test_tok,y_test))

In [None]:
# Show the refit estimator chosen by the grid search, with its hyper-parameters.
gs.best_estimator_

In [None]:
#results_KNN = pd.DataFrame(y_pred_test)

In [None]:
#original_KNN = pd.read_csv('./data2/test_data_bagged.csv')

In [None]:
#original_KNN['predictions'] = y_pred_test

In [None]:
#original_KNN.to_csv('./data/test2_knn.csv', index=False)

In [None]:
#original_KNN['predictions'].value_counts()