In [1]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection

In [5]:
df = pd.read_csv('enhancer_set.csv')

In [7]:
def reverse_kmer(mer):
    reverse = []
    for letter in mer:
        if letter == 'A':
            reverse.append('T')
        elif letter == 'T':
            reverse.append('A')
        elif letter == 'G':
            reverse.append('C')
        else:
            reverse.append('G')
    x = ''.join(reverse)
    return x[::-1] 

In [8]:
def get_kmers(k):
    base = ['A', 'T', 'C', 'G']
    w = [''.join(p) for p in itertools.product(base, repeat=k)]
    kmers = []
    for mer in w:
        if reverse_kmer(mer) not in kmers:
            kmers.append(mer)
    return kmers

In [18]:
def count_kmers_v2(k, sequence):
    number_of_kmer = len(sequence) - k + 1
    kes = get_kmers(k)
    count = {k:0 for k in kes}
    for i in range(number_of_kmer):
        kmer = sequence[i:i+k]
        if kmer in kes:
            count[kmer] += 1/number_of_kmer
        elif reverse_kmer(kmer) in kes:
            count[reverse_kmer(kmer)] += 1/number_of_kmer
    return count.values

In [9]:
def count_kmers(k, sequence):
    kmers = get_kmers(k)
    count = []
    for kmer in kmers:
        if kmer == reverse_kmer(kmer):
            count.append(sequence.count(kmer))
        else:
            count.append(sequence.count(kmer) + sequence.count(reverse_kmer(kmer)))
    return count

In [10]:
get_kmers(2)

['AA', 'AT', 'AC', 'AG', 'TA', 'TC', 'TG', 'CC', 'CG', 'GC']

# 2-mery

In [10]:
count_df = pd.DataFrame(columns=get_kmers(2))
for i in range(len(df)):
    if i % 10000 == 0:
        print(i)
    count_df.loc[i] = count_kmers(2, df.Sequence[i])

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000


In [43]:
data = pd.concat([df, count_df], axis=1)

In [44]:
data['length'] = data.cor2 - data.cor1

In [45]:
data['chr'] = [napis[3:] for napis in data.chr]

In [46]:
data.drop(['cor1', 'cor2', 'Sequence'], axis=1, inplace=True)

In [47]:
data['chr'] = data['chr'].astype(int)

In [48]:
y = data['Enhancer']

In [49]:
data.drop('Enhancer', axis=1, inplace=True)

In [50]:
X_train, X_test, y_train, y_test = train_test_split(
     data, y, test_size=0.30, random_state=42)

***

In [18]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
pred = neigh.predict(X_train)
pred_t = neigh.predict(X_test)

In [20]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.7997494800529401
0.7751363490535772
0.708575283007801
0.7849273121685086


In [124]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6249862137421418
0.5301847124192822
0.49017702186740714
0.6019928698963346


***

In [135]:
mlp = MLPClassifier(random_state=1, max_iter=300, learning_rate='adaptive', batch_size=100).fit(X_train, y_train)

In [136]:
pred = mlp.predict(X_train)
pred_t = mlp.predict(X_test)

In [137]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6983716203441104
0.6879302029900478
0.4601736172209514
0.6596478267001721


In [138]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6970883423403551
0.6768387296989758
0.45421728566469977
0.6556637359119821


***

In [176]:
dt = DecisionTreeClassifier(random_state=0, max_depth=8, criterion="entropy", splitter='best')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [177]:
pred = dt.predict(X_train)
pred_t = dt.predict(X_test)

In [178]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6795353564000757
0.6902153175252113
0.3713414276497155
0.6294323415016974


In [179]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6744237344215286
0.6660273622298939
0.36161055189170427
0.6210696495450837


***

In [190]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=20)
rf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [191]:
pred = rf.predict(X_train)
pred_t = rf.predict(X_test)

In [192]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6843094157685763
0.6908245024554148
0.39192914540442253
0.6367772236146128


In [193]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7324638800044116
0.7824363286881307
0.45213467545990976
0.6846503318295752


***

In [51]:
xgb = XGBClassifier()

In [None]:
xgb.fit(X_train, y_train)

In [None]:
pred = xgb.predict(X_train)
pred_t = xgb.predict(X_test)

In [54]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.7605171109850634
0.7746188055908514
0.5721156666080122
0.7298887322471245


In [55]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7083379287526194
0.6813916010996303
0.49899340506768486
0.672631679435457


# 3-mery

In [56]:
count_df = pd.DataFrame(columns=get_kmers(3))
for i in range(len(df)):
    if i % 10000 == 0:
        print(i)
    count_df.loc[i] = count_kmers(3, df.Sequence[i])

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000


In [58]:
data = pd.concat([df, count_df], axis=1)
data['length'] = data.cor2 - data.cor1
data['chr'] = [napis[3:] for napis in data.chr]
data.drop(['cor1', 'cor2', 'Sequence'], axis=1, inplace=True)
data['chr'] = data['chr'].astype(int)

In [59]:
y = data['Enhancer']
data.drop('Enhancer', axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(
     data, y, test_size=0.30, random_state=42)

***

In [60]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
pred = neigh.predict(X_train)
pred_t = neigh.predict(X_test)

In [62]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.8013802231045566
0.7782247682801235
0.7091618276731773
0.786388300132753


In [63]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6233594353148781
0.527950776618894
0.48844151336341546
0.600347546234834


***

In [64]:
mlp = MLPClassifier(random_state=1, max_iter=300, learning_rate='adaptive', batch_size=100).fit(X_train, y_train)

In [65]:
pred = mlp.predict(X_train)
pred_t = mlp.predict(X_test)

In [66]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.7036656267725468
0.6836896762370189
0.4923455921168397
0.6693113781745581


In [67]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7027131355464873
0.6735984666986105
0.4879555709822978
0.6660836264095956


***

In [68]:
dt = DecisionTreeClassifier(random_state=0, max_depth=8, criterion="entropy", splitter='best')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [69]:
pred = dt.predict(X_train)
pred_t = dt.predict(X_test)

In [70]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6826668557383249
0.7226730603713267
0.34474162707490175
0.6277304303683894


In [71]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.676105657880225
0.69390137146192
0.33016313779937523
0.6171009623955481


***

In [72]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=20)
rf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [73]:
pred = rf.predict(X_train)
pred_t = rf.predict(X_test)

In [74]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6894734354320288
0.7099892588614393
0.38770602381371344
0.6404151779203943


In [75]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7554869306275505
0.842933234237582
0.4724054147865325
0.707203942356446


***

In [None]:
xgb.fit(X_train, y_train)

In [None]:
pred = xgb.predict(X_train)
pred_t = xgb.predict(X_test)

In [78]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.7912767063717149
0.8065659391904495
0.6340547832717461
0.7657171751136864


In [79]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7220414690636374
0.6894497020679986
0.5461992363762582
0.6920494420915275


# 4-mery

In [None]:
count_df = pd.DataFrame(columns=get_kmers(4))
for i in range(len(df)):
    if i % 10000 == 0:
        print(i)
    count_df.loc[i] = count_kmers(4, df.Sequence[i])

In [29]:
count_df

Unnamed: 0,AAAA,AAAT,AAAC,AAAG,AATA,AATT,AATC,AATG,AACA,AACT,...,GAAC,GATC,GACC,GAGC,GTAC,GTCC,GTGC,GCCC,GCGC,GGCC
0,35,42,20,22,30,23,11,15,13,14,...,7,6,8,17,2,10,15,19,3,14
1,11,8,3,6,11,3,11,5,3,4,...,3,2,3,0,1,2,3,1,0,1
2,4,4,6,9,3,3,1,7,16,7,...,17,3,9,13,2,13,10,25,0,10
3,81,85,43,50,57,28,24,49,49,26,...,25,7,13,7,3,10,21,5,1,5
4,4,8,5,6,13,3,6,1,5,5,...,0,0,3,1,1,3,3,3,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120887,19,24,14,21,12,5,11,13,12,9,...,2,4,4,4,3,7,13,11,0,8
120888,12,2,14,12,1,0,5,7,14,8,...,11,3,11,10,0,7,9,11,1,5
120889,11,7,6,9,1,1,3,6,5,4,...,7,2,6,17,0,3,5,11,2,6
120890,8,12,5,10,9,2,3,9,12,7,...,14,2,12,16,0,14,15,24,1,10


In [30]:
# data = pd.concat([df, count_df], axis=1)
# data['length'] = data.cor2 - data.cor1
# data['chr'] = [napis[3:] for napis in data.chr]
# data.drop(['cor1', 'cor2', 'Sequence'], axis=1, inplace=True)
# data['chr'] = data['chr'].astype(int)

***
***
***

In [2]:
df = pd.read_csv('trimery.csv')

In [3]:
df = df[df.chr != 1]
df = df[df.chr != 14]
df = df[df.chr != 21]

In [4]:
df.drop('chr', axis=1, inplace=True)
df.drop('length', axis=1, inplace=True)

In [5]:
y = df['Enhancer']
df.drop('Enhancer', axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(
     df, y, test_size=0.3, random_state=42)

***

In [38]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [None]:
pred = neigh.predict(X_train)
pred_t = neigh.predict(X_test)

In [40]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.810876572474947
0.7751541235314645
0.6829734900245968
0.7840242507099937


In [41]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6365631675982558
0.4918258633357825
0.4374642594559268
0.5925807389390897


***

In [None]:
mlp = MLPClassifier(random_state=1, max_iter=300, learning_rate='adaptive', batch_size=1000).fit(X_train, y_train)

In [89]:
pred = mlp.predict(X_train)
pred_t = mlp.predict(X_test)

In [90]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.7145490640952921
0.7185247054686126
0.47935362777875534
0.676313397034689


In [91]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7131906915186942
0.706916158378993
0.47469628601180147
0.6725125760663225


***

In [43]:
dt = DecisionTreeClassifier(random_state=0, max_depth=8, criterion="entropy", splitter='best')
dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [44]:
pred = dt.predict(X_train)
pred_t = dt.predict(X_test)

In [45]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6913371210695965
0.6330175758959141
0.37896283137469255
0.6257564112282805


In [46]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.6761288812150654
0.5785599571734475
0.3531574217792664
0.6047820852626383


***

In [47]:
rf = RandomForestClassifier(max_depth=10, random_state=0, n_estimators=20)
rf.fit(X_test, y_test)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [48]:
pred = rf.predict(X_train)
pred_t = rf.predict(X_test)

In [49]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.6825199734105931
0.6658011558757014
0.27155643618474995
0.5962411682102925


In [50]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.7767405109595856
0.8929788684389911
0.42806960215668655
0.6997165156246269


***

In [11]:
eta=[0.1, 0.3, 0.5, 0.7]
gamma=[0, 1]
subsample=[1, 0.8, 0.6, 0.4]
max_depth=[5,6,7,8]
colsample_bytree=[1, 0.8, 0.6]

In [None]:
results = {}
param = {}
param['objective'] = "binary:logistic"
param['booster'] = 'gbtree'
param["eval_metric"] = "auc"
param['seed'] = 0
for e in eta:
    for g in gamma:
        for s in subsample:
            for m in max_depth:
                for c in colsample_bytree:
                    param['eta'] = e
                    param['gamma'] = g
                    param['subsample'] = s
                    param['max_depth'] = m
                    param['colsample_bytree']= c
                    xgb = XGBClassifier(**param)
                    xgb.fit(X_train, y_train)
                    pred = xgb.predict(X_train)
                    pred_t = xgb.predict(X_test)
                    results[(e, g, s, m, c)] = ([accuracy_score(y_train, pred), precision_score(y_train, pred), 
                                                recall_score(y_train, pred), roc_auc_score(y_train, pred)], 
                                              [accuracy_score(y_test, pred_t), precision_score(y_test, pred_t), 
                                              recall_score(y_test, pred_t), roc_auc_score(y_test, pred_t)])
                    print(f'{e}, {g}, {s}, {m}, {c}')

In [20]:
for key in results:
    print(f"{results[key]}, {key}")

([0.7400885477417818, 0.7209843354184976, 0.4764279857884668, 0.6847349306853113], [0.7117438763864096, 0.6479762346825102, 0.42766113879585, 0.6489878881393751]), (0.1, 0, 1, 5, 1)
([0.7390977160702863, 0.7176706414473685, 0.4770087455588959, 0.6840740432049418], [0.7105440285622311, 0.6434676434676435, 0.43052038232170575, 0.648684723764592]), (0.1, 0, 1, 5, 0.8)
([0.7395743186464487, 0.7184002464318718, 0.47803361574200604, 0.6846657505769622], [0.7113049076702467, 0.6448780487804878, 0.4319908504207173, 0.6496023563549095]), (0.1, 0, 1, 5, 0.6)
([0.7664396533343367, 0.7598575053679485, 0.5319417873735993, 0.7172085321655647], [0.7123584325890375, 0.6408878504672897, 0.448165999509844, 0.6539963604480364]), (0.1, 0, 1, 6, 1)
([0.7650349299519634, 0.7582589942162533, 0.5284913910904618, 0.7153743346383561], [0.7127096075619678, 0.6422535211267606, 0.44702230209950167, 0.6540173070005032]), (0.1, 0, 1, 6, 0.8)
([0.7636302065695902, 0.7560161084372852, 0.5258950532932495, 0.71371944047

In [19]:
for key in results:
    print(f"{results[key][1][2]}, {key}")

0.42766113879585, (0.1, 0, 1, 5, 1)
0.43052038232170575, (0.1, 0, 1, 5, 0.8)
0.4319908504207173, (0.1, 0, 1, 5, 0.6)
0.448165999509844, (0.1, 0, 1, 6, 1)
0.44702230209950167, (0.1, 0, 1, 6, 0.8)
0.44792092149334206, (0.1, 0, 1, 6, 0.6)
0.4608283636957765, (0.1, 0, 1, 7, 1)
0.45927620292459764, (0.1, 0, 1, 7, 0.8)
0.4570705007760804, (0.1, 0, 1, 7, 0.6)
0.4730005718487052, (0.1, 0, 1, 8, 1)
0.46630177273098605, (0.1, 0, 1, 8, 0.8)
0.46646515807532063, (0.1, 0, 1, 8, 0.6)
0.4341148598970672, (0.1, 0, 0.8, 5, 1)
0.4312556163712115, (0.1, 0, 0.8, 5, 0.8)
0.4300302262887019, (0.1, 0, 0.8, 5, 0.6)
0.452168940446042, (0.1, 0, 0.8, 6, 1)
0.4511886283800343, (0.1, 0, 0.8, 6, 0.8)
0.4510252430356997, (0.1, 0, 0.8, 6, 0.6)
0.46393268523813413, (0.1, 0, 0.8, 7, 1)
0.46197206110611877, (0.1, 0, 0.8, 7, 0.8)
0.4624622171391226, (0.1, 0, 0.8, 7, 0.6)
0.474552732619884, (0.1, 0, 0.8, 8, 1)
0.47038640633935136, (0.1, 0, 0.8, 8, 0.8)
0.4681807041908341, (0.1, 0, 0.8, 8, 0.6)
0.44056858099828444, (0.1, 0

In [10]:
param = {}
param['objective'] = "binary:logistic"
param['booster'] = 'gbtree'
param["eval_metric"] = "auc"
param['eta'] = 0.2
param['gamma'] = 0
param['subsample'] = 1
param['max_depth'] = 7
param['colsample_bytree']=1
param['seed'] = 0
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', predictor=None, ...)

In [12]:
pred = xgb.predict(X_train)
pred_t = xgb.predict(X_test)

In [13]:
print(accuracy_score(y_train, pred))
print(precision_score(y_train, pred))
print(recall_score(y_train, pred))
print(roc_auc_score(y_train, pred))

0.814664308738132
0.8105051197463691
0.6462831374692539
0.7793139066723586


In [14]:
print(accuracy_score(y_test, pred_t))
print(precision_score(y_test, pred_t))
print(recall_score(y_test, pred_t))
print(roc_auc_score(y_test, pred_t))

0.711158584764859
0.6216271673335385
0.49497590066171065
0.6634022230166737


***

***

In [109]:
kfold = model_selection.KFold(n_splits=10, random_state=1, shuffle=True)

In [107]:
estimators = []
estimators.append(('random forest', rf))
estimators.append(('xgb', xgb))

In [None]:
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X_train, y_train, cv=kfold)

In [112]:
results

array([0.6996337 , 0.70695971, 0.69738863, 0.70412383, 0.70408887,
       0.70137083, 0.70030726, 0.69995273, 0.69853463, 0.70137083])