In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB


%matplotlib inline

In [2]:
## Convert to NumpyArray for SVM features
#def toNumpyArray(sequence_bagged):
#    length = max(map(len, sequence_bagged))
#    return np.array([words+['space']*(length-len(words)) for words in sequence_bagged])

In [3]:
# Load the two training files. Neither is read with a header row, so pandas
# assigns integer column labels. df_1 <- promo.csv, df_2 <- non_promo.csv.
df_1, df_2 = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('./data/promo.csv', './data/non_promo.csv')
)

In [4]:
# Sequences that the fitted model scores further down. Read without a header
# row, so the text column is labelled 0.
testmain_df = pd.read_csv('./data/test_bagged.csv', header=None)

In [5]:
# Tag every row with its class: 1 for the promo.csv frame, 0 for non_promo.csv.
df_1 = df_1.assign(label=1)
df_2 = df_2.assign(label=0)

In [6]:
# Stack the two labelled frames row-wise. concat keeps each frame's own row
# index by default, so index values repeat until the index is reset.
df = pd.concat([df_1,df_2])

In [7]:
# Replace the row index with a fresh 0..n-1 range; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [8]:
# Give the sequence column a readable name (it is labelled 0 because the CSVs
# were read with header=None). Reassigning a new frame instead of using
# inplace=True keeps the cell chainable and safe to re-run; 'label' already
# has its final name, so it needs no mapping.
df = df.rename(columns={0: 'seq'})

In [9]:
# Preview the combined frame: one 'seq' text column plus the 0/1 'label'.
df

Unnamed: 0,seq,label
0,TTAA TTTG TCCT TATT TGAT TAAG AAGA ATAA ATCT T...,1
1,ATAG CTCA AATT GCTT TATT AGTA TTAG AATC AGCT G...,1
2,AAGC TTCC CTTT AATG TGCT CCTT GTGA ATAC AGCA T...,1
3,TATG TAGA ATCT GTAC AAGT ATCT GTGT TTGG ACAA T...,1
4,ACAT ATTA CTGC ATAC AGGT CTCA AATT ATAA AATG A...,1
...,...,...
22595,TGGT AAAA AATT GTAC ACCT AACT AGTG CCTT CATG T...,0
22596,AGTG CAAC TGGA GCCG TGCC GTGA CCCA CAGA GATC G...,0
22597,GCAT GGAT TTCA TATT ATCT TAAT CGAC TTGC TTTT A...,0
22598,GTGA CCAG GTTT TGCT CTAA TGCG AAGT ACGG ATTG G...,0


In [10]:
# TF-IDF features over word n-grams of length 1 to 4 (each whitespace-separated
# token in 'seq' counts as one word). NOTE: despite the `cvec` name this is a
# TfidfVectorizer, not a CountVectorizer.
cvec = TfidfVectorizer(ngram_range=(1, 4))

In [12]:
# X and y are not defined in any surviving cell (execution jumps from In [10]
# to In [12]), so on a fresh kernel this cell failed with NameError.
# Recreate them here: the sequence text is the feature, 'label' is the target.
X = df['seq']
y = df['label']

# Class balance, as a fraction of all rows.
y.value_counts(normalize=True)

1    0.5
0    0.5
Name: label, dtype: float64

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Sanity check: still a 1-D collection of raw text at this point (not yet vectorised).
X_train.shape

(18080,)

In [15]:
# Learn the n-gram vocabulary and IDF weights from the training split only,
# so nothing from the hold-out rows influences the feature space.
cvec.fit(X_train)

TfidfVectorizer(ngram_range=(1, 4))

In [16]:
# Vectorise the training text. NOTE: this rebinds X_train from raw text to a
# sparse TF-IDF matrix (later cells rely on that name), so this cell cannot be
# re-run without first re-running the train/test split.
X_train = cvec.transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)

# Build the inspection frame straight from the sparse matrix. Calling
# .toarray() would allocate a dense rows x vocabulary float array that is
# almost entirely zeros. Values are unchanged; columns carry a sparse dtype.
X_train_df = pd.DataFrame.sparse.from_spmatrix(X_train, columns=feature_names)
X_train_df

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaaa aaaa,aaaa aaaa aaaa aaaa,aaaa aaaa aaaa aaac,aaaa aaaa aaaa aaag,aaaa aaaa aaaa aaat,aaaa aaaa aaaa aaca,aaaa aaaa aaaa aacc,aaaa aaaa aaaa aacg,...,tttt tttt tttt ttag,tttt tttt tttt ttat,tttt tttt tttt ttga,tttt tttt tttt ttgc,tttt tttt tttt ttgg,tttt tttt tttt ttgt,tttt tttt tttt ttta,tttt tttt tttt tttc,tttt tttt tttt tttg,tttt tttt tttt tttt
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.010206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.009997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.010092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,0.010163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18076,0.010196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18077,0.010177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18078,0.010019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Vectorise the hold-out text with the vocabulary/IDF learned on the training
# split. NOTE: this rebinds X_test from raw text to a sparse TF-IDF matrix
# (later cells rely on that name), so this cell cannot be re-run without first
# re-running the train/test split.
X_test = cvec.transform(X_test)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)

# Build the inspection frame straight from the sparse matrix. Calling
# .toarray() would allocate a dense rows x vocabulary float array that is
# almost entirely zeros. Values are unchanged; columns carry a sparse dtype.
X_test_df = pd.DataFrame.sparse.from_spmatrix(X_test, columns=feature_names)
X_test_df

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaaa aaaa,aaaa aaaa aaaa aaaa,aaaa aaaa aaaa aaac,aaaa aaaa aaaa aaag,aaaa aaaa aaaa aaat,aaaa aaaa aaaa aaca,aaaa aaaa aaaa aacc,aaaa aaaa aaaa aacg,...,tttt tttt tttt ttag,tttt tttt tttt ttat,tttt tttt tttt ttga,tttt tttt tttt ttgc,tttt tttt tttt ttgg,tttt tttt tttt ttgt,tttt tttt tttt ttta,tttt tttt tttt tttc,tttt tttt tttt tttg,tttt tttt tttt tttt
0,0.053166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.071775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.018158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4515,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4516,0.017637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4517,0.036302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4518,0.017582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Vectorise the external test sequences (column 0 of the header-less CSV) with
# the vocabulary/IDF learned on the training split.
testmain = cvec.transform(testmain_df[0])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
feature_names = (
    cvec.get_feature_names_out()
    if hasattr(cvec, 'get_feature_names_out')
    else cvec.get_feature_names()
)

# Build the inspection frame straight from the sparse matrix. Calling
# .toarray() would allocate a dense rows x vocabulary float array that is
# almost entirely zeros. Values are unchanged; columns carry a sparse dtype.
testmain_df2 = pd.DataFrame.sparse.from_spmatrix(testmain, columns=feature_names)
testmain_df2

Unnamed: 0,aaaa,aaaa aaaa,aaaa aaaa aaaa,aaaa aaaa aaaa aaaa,aaaa aaaa aaaa aaac,aaaa aaaa aaaa aaag,aaaa aaaa aaaa aaat,aaaa aaaa aaaa aaca,aaaa aaaa aaaa aacc,aaaa aaaa aaaa aacg,...,tttt tttt tttt ttag,tttt tttt tttt ttat,tttt tttt tttt ttga,tttt tttt tttt ttgc,tttt tttt tttt ttgg,tttt tttt tttt ttgt,tttt tttt tttt ttta,tttt tttt tttt tttc,tttt tttt tttt tttg,tttt tttt tttt tttt
0,0.034546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.018162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.052075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.037838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174,0.036348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
175,0.017807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Preview the raw (un-vectorised) test sequences as loaded from the CSV.
testmain_df

Unnamed: 0,0
0,AATC TCTG TTGT TGTT ATGC AAAA AAGG AATA ATGT A...
1,TTCT CTAA GAAA GTCG ATGC TAAG CGGA TGCT AAAC G...
2,ACAT TTTA CGGT CTTG CGTG TGTA TTTA TTGT TCGT A...
3,CACT GAAA AAAA AAGA AAGG CTTA TTTA CTAT TAAC A...
4,GGGA GTGG CAAC ATGG GCTC ACAA GTCT AGAT CGAC T...
...,...
173,TAAA TTGC ATGT AAAA TCAT TAAT AACG ATTA TATT C...
174,GGAT GTGC TTAG TGTA ATTT GCTT ATAA AAAC TACT T...
175,ATTC AGAA ATTA CATG TTTC TGAA AACT CTTT CCGA T...
176,ACGG TTTT AAGT GCCC AAAC TTAG GGTG TAGC GCCC T...


In [20]:
# Fit an SVC with scikit-learn's default settings on the TF-IDF training
# matrix; this fitted `svc` is what the later cells use for prediction.
svc = SVC()
svc.fit(X_train, y_train)

# 5-fold cross-validated accuracy on the training split. cross_val_score
# clones the estimator for each fold, so the fitted `svc` above is untouched.
# The folds are independent, so run them in parallel (n_jobs=-1 uses all
# cores). The scores match a serial run; the recorded serial run of this cell
# ended in a KeyboardInterrupt.
cv_scores = cross_val_score(svc, X_train, y_train, cv=5, n_jobs=-1)
cv_scores.mean()

KeyboardInterrupt: 

In [None]:
# Predicted labels for the 20% hold-out split (X_test is the TF-IDF matrix by
# this point, not raw text).
y_pred = svc.predict(X_test)

In [None]:
# Fraction of hold-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Predicted labels for the external test file, one per row of testmain
# (the TF-IDF matrix built from testmain_df[0]).
y_pred_test = svc.predict(testmain)

In [None]:
# Wrap the hold-out predictions in a single-column frame.
# NOTE(review): this uses y_pred (hold-out split), not y_pred_test (external
# file), and `results` is not referenced by any cell visible below.
# Confirm which was intended, or remove.
results = pd.DataFrame(y_pred)

In [None]:
#lr = LogisticRegression(penalty='l1', solver='liblinear')
#lr.fit(X_train, y_train)

#cross_val_score(lr,X_train,y_train,cv=5).mean()

In [None]:
#mnb4 = BernoulliNB(alpha=0.75)

# Reload the raw test sequences (the same file testmain_df was read from) as
# the frame that predictions get attached to.
original = pd.read_csv('./data/test_bagged.csv', header=None)

In [None]:
# Attach one prediction per row. Alignment is positional: y_pred_test was
# produced from the same CSV in the same row order.
original['predictions'] = y_pred_test

In [None]:
# Persist sequences plus predictions. index=False omits the row index; the
# header row is written, so the sequence column appears under the label 0.
original.to_csv('./data/test2.csv', index=False)

In [None]:
# How many test sequences were assigned to each predicted class.
original['predictions'].value_counts()