In [None]:
# One of my solutions for 'Genetic Engineering Attribution Challenge';
# 0.6529 Top-10 Accuracy at the public leaderboard;


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import datetime
from sklearn.model_selection import train_test_split
import gensim
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from tqdm import tqdm
from sklearn import utils


In [None]:
# Random seed shared by NumPy, the Doc2Vec model and the LightGBM classifier below.
rs = 42
# Base directory for every CSV read or written by this notebook.
# File names are appended with '+', so the value must end with a path separator.
pth = '.../GE/'
np.random.seed(rs)

In [None]:
# Add 3 symbols from the beginning to the end to loop the cycle

def n_grams_with_space(input_list, n):
  """Split each string into consecutive, non-overlapping n-character tokens.

  The first three characters of every string are appended to its end before
  splitting. A trailing piece shorter than ``n`` is dropped.

  Args:
    input_list: iterable of strings.
    n: token length, also used as the step between tokens.

  Returns:
    A list with one string per input, tokens joined by single spaces.
  """
  def _tokenize(seq):
    looped = seq + seq[0:3]
    last_start = len(looped) - n
    return ' '.join(looped[pos:pos + n] for pos in range(0, last_start + 1, n))

  return [_tokenize(seq) for seq in input_list]


In [None]:
# Load the train and test feature tables from the data directory.
train_values = pd.read_csv(pth+'train_values.csv')
test_values = pd.read_csv(pth+'test_values.csv')

In [None]:
# Mark the origin of each row (1 = train, 2 = test) so the two sets can be
# separated again after they are concatenated.
train_values['sample']=1
test_values['sample']=2

In [None]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
values = pd.concat([train_values, test_values], ignore_index=True)

In [None]:
def percent_of_element_in_sequence(input_seq, elem):
  """Return the fraction of ``input_seq`` made up of occurrences of ``elem``.

  Args:
    input_seq: string (or any sequence with ``count`` and ``len``).
    elem: element whose occurrences are counted.

  Returns:
    ``input_seq.count(elem) / len(input_seq)`` as a float, or 0.0 for an
    empty sequence (previously a ZeroDivisionError).
  """
  y = len(input_seq)
  if y == 0:
    # An empty sequence contains no elements at all.
    return 0.0
  return input_seq.count(elem) / y

# Element-wise version for whole columns. otypes is fixed so the result dtype
# does not depend on the first element and empty inputs are accepted.
percent_of_element_in_sequence_v = np.vectorize(percent_of_element_in_sequence, otypes=[float])

In [None]:
# Symbols whose share of each sequence is computed.
elem_list = ['A', 'C', 'G', 'T', 'N']

# Despite the 'count_' prefix, each column stores a fraction
# (occurrences of the symbol divided by the sequence length).
for item in elem_list:
  values['count_{}'.format(item)] = percent_of_element_in_sequence_v(values['sequence'], item)

In [None]:
# Sequence length in characters.
values['seq_len'] = values['sequence'].str.len()

In [None]:
# Turn every sequence into a space-separated string of 3-character tokens,
# the text format later split into words for Doc2Vec.
e = list(values['sequence'])

r = n_grams_with_space(e, 3)

values['sequence_by_3'] = r

In [None]:
# Split the combined frame back into its train (sample == 1) and
# test (sample == 2) parts, each with its own 0..n-1 index.
s = values['sample'] == 1
train = values.loc[s, :].reset_index(drop=True)

s = values['sample'] == 2
test = values.loc[s, :].reset_index(drop=True)


In [None]:
def vec_for_learning(model, tagged_docs):
  """Infer a vector for every tagged document.

  Args:
    model: trained model exposing ``infer_vector(words, epochs=...)``.
    tagged_docs: object whose ``.values`` is a sequence of documents, each
      with ``.words`` and ``.tags`` attributes.

  Returns:
    ``(targets, regressors)``: two tuples of equal length holding the first
    tag of each document and its inferred vector. Both are empty tuples when
    there are no documents.
  """
  sents = tagged_docs.values
  if len(sents) == 0:
    # zip(*[]) yields nothing, so the two-name unpack below would fail.
    return (), ()
  # `epochs` replaces the deprecated `steps` keyword, which newer gensim
  # releases no longer accept.
  targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, epochs=20)) for doc in sents])
  return targets, regressors



def get_doc_2_vec_data(train_df, test_df, t_col, processed_field):
  """Train a DBOW Doc2Vec model on ``train_df`` and infer vectors for both frames.

  Args:
    train_df: frame used to build the vocabulary and train the model.
    test_df: frame for which vectors are only inferred.
    t_col: column whose value becomes the single tag of each document.
    processed_field: column holding whitespace-separated tokens.

  Returns:
    ``(y_train, X_train, y_test, X_test)`` where the ``y_*`` tuples hold the
    ``t_col`` values and the ``X_*`` tuples hold the inferred vectors.

  Reads the module-level seed ``rs``.
  """
  def _to_tagged(row):
    return TaggedDocument(words=row[processed_field].split(), tags=[row[t_col]])

  train_tagged = train_df.apply(_to_tagged, axis=1)
  test_tagged = test_df.apply(_to_tagged, axis=1)

  print(train_tagged.values[0])

  model_dbow = Doc2Vec(dm=0, vector_size=250, negative=5, hs=0, min_count=2, window=15)
  model_dbow.random.seed(rs)

  train_corpus = list(train_tagged.values)
  model_dbow.build_vocab(train_corpus)

  # One train() call over 30 epochs lets gensim decay the learning rate from
  # alpha to min_alpha itself. The previous loop called train() 30 times and
  # subtracted 0.002 from alpha after each pass; starting from the default
  # 0.025 that made the learning rate negative from the 14th pass onwards.
  model_dbow.train(utils.shuffle(train_corpus), total_examples=len(train_corpus), epochs=30)

  y_train, X_train = vec_for_learning(model_dbow, train_tagged)
  y_test, X_test = vec_for_learning(model_dbow, test_tagged)

  return y_train, X_train, y_test, X_test

In [None]:
# Train Doc2Vec on the train rows and infer vectors for train and test.
# Here y_* hold sequence_id values (the document tags), not class labels.
# NOTE: X_train / X_test / y_train / y_test are rebound later by train_test_split.
y_train, X_train, y_test, X_test = get_doc_2_vec_data(train, test, 'sequence_id', 'sequence_by_3')

100%|██████████| 63017/63017 [00:00<00:00, 2585266.29it/s]

TaggedDocument(['CAT', 'GCA', 'TTA', 'GTT', 'ATT', 'AAT', 'AGT', 'AAT', 'CAA', 'TTA', 'CGG', 'GGT', 'CAT', 'TAG', 'TTC', 'ATA', 'GCC', 'CAT', 'ATA', 'TGG', 'AGT', 'TCC', 'GCG', 'TTA', 'CAT', 'AAC', 'TTA', 'CGG', 'TAA', 'ATG', 'GCC', 'CGC', 'CTG', 'GCT', 'GAC', 'CGC', 'CCA', 'ACG', 'ACC', 'CCC', 'GCC', 'CAT', 'TGA', 'CGT', 'CAA', 'TAA', 'TGA', 'CGT', 'ATG', 'TTC', 'CCA', 'TAG', 'TAA', 'CGC', 'CAA', 'TAG', 'GGA', 'CTT', 'TCC', 'ATT', 'GAC', 'GTC', 'AAT', 'GGG', 'TGG', 'AGT', 'ATT', 'TAC', 'GGT', 'AAA', 'CTG', 'CCC', 'ACT', 'TGG', 'CAG', 'TAC', 'ATC', 'AAG', 'TGT', 'ATC', 'ATA', 'TGC', 'CAA', 'GTA', 'CGC', 'CCC', 'CTA', 'TTG', 'ACG', 'TCA', 'ATG', 'ACG', 'GTA', 'AAT', 'GGC', 'CCG', 'CCT', 'GGC', 'ATT', 'ATG', 'CCC', 'AGT', 'ACA', 'TGA', 'CCT', 'TAT', 'GGG', 'ACT', 'TTC', 'CTA', 'CTT', 'GGC', 'AGT', 'ACA', 'TCT', 'ACG', 'TAT', 'TAG', 'TCA', 'TCG', 'CTA', 'TTA', 'CCA', 'TGG', 'TGA', 'TGC', 'GGT', 'TTT', 'GGC', 'AGT', 'ACA', 'TCA', 'ATG', 'GGC', 'GTG', 'GAT', 'AGC', 'GGT', 'TTG', 'ACT', 'CAC


100%|██████████| 63017/63017 [00:00<00:00, 2860431.54it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3076154.87it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3030967.10it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2587468.11it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2667155.62it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3173704.46it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3088520.02it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2656139.64it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2863530.49it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3025450.76it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3206546.91it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2506067.71it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3088772.67it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3158722.89it/s]
100%|██████████| 63017/63017 [00:00<00:00, 2839962.34it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3057473.34it/s]
100%|██████████| 63017/63017 [00:00<00:00, 3074401.61it

In [None]:
# Saving both train & test

# Train: one row per document, vector components followed by its sequence_id.
z = pd.DataFrame(X_train)
z2 = pd.DataFrame(y_train, columns=['sequence_id'])
z3 = pd.concat([z, z2], axis=1)
z3.to_csv(pth + 'train_doc2vec2.csv', sep=',', index=False)

# Test: same layout.
z = pd.DataFrame(X_test)
z2 = pd.DataFrame(y_test, columns=['sequence_id'])
z3 = pd.concat([z, z2], axis=1)
z3.to_csv(pth + 'test_doc2vec2.csv', sep=',', index=False)

In [None]:
# Reload the Doc2Vec feature tables from disk.
train_doc2vec = pd.read_csv(pth+'train_doc2vec2.csv')

test_doc2vec = pd.read_csv(pth+'test_doc2vec2.csv')


In [None]:
# Sanity check: (rows, columns) of the test Doc2Vec table.
test_doc2vec.shape

(18816, 251)

In [None]:
# Every column of `values` except the train/test marker, the identifier
# and the raw sequence text.
x = list(values.columns)

x.remove('sample')
x.remove('sequence_id')
x.remove('sequence')

In [None]:
# NOTE: despite the name, this list holds every remaining column of `values`,
# including the engineered count_*, seq_len and sequence_by_3 columns.
flags_features_binary = x.copy()

In [None]:
# Join the Doc2Vec vectors with the remaining features column-wise.
# Rows are matched on the index, so both frames must be in the same row order.
train_full = pd.concat([train_doc2vec, train.loc[:, flags_features_binary]], axis=1)

In [None]:
# Same column-wise join for the test set; rows are matched on the index.
test_full = pd.concat([test_doc2vec, test.loc[:, flags_features_binary]], axis=1)

In [None]:
# Write the combined feature tables to disk; the next cell reads them back.
train_full.to_csv(pth+'train_full2.csv', index=False, sep=',')

test_full.to_csv(pth+'test_full2.csv', index=False, sep=',')



In [None]:
# Restart from here: reload the combined feature tables written to disk above.
if True:
  train_full = pd.read_csv(pth+'train_full2.csv', sep=',')
  test_full = pd.read_csv(pth+'test_full2.csv', sep=',')

# Label table for the training set (has a 'sequence_id' column plus label columns).
train_labels = pd.read_csv(pth+'train_labels.csv')

In [None]:
# The tokenised text was only needed for Doc2Vec; remove it from the model inputs.
train_full = train_full.drop(columns=['sequence_by_3'])
test_full = test_full.drop(columns=['sequence_by_3'])


In [None]:

def col_for_single_1_in_row(df, list_cols, new_col_name):
  """Add a column naming, for each row, the first of ``list_cols`` equal to 1.

  Works for any index (rows are handled by position, not by label) and is
  vectorized instead of looping over rows.

  Args:
    df: frame containing the columns in ``list_cols``; modified in place.
    list_cols: candidate column names, in priority order.
    new_col_name: name of the column to create.

  Returns:
    The same ``df`` object with ``new_col_name`` added.

  Raises:
    IndexError: if some row has no column in ``list_cols`` equal to 1.
  """
  flags = df.loc[:, list_cols].eq(1).values
  has_hit = flags.any(axis=1)
  if not has_hit.all():
    n_missing = int((~has_hit).sum())
    raise IndexError('{} row(s) have no column equal to 1 among list_cols'.format(n_missing))

  # argmax over booleans gives the position of the first True in each row.
  # It is skipped when there is nothing to scan (no rows).
  if flags.size:
    first_hit = flags.argmax(axis=1)
  else:
    first_hit = np.zeros(len(df), dtype=int)

  df[new_col_name] = np.asarray(list_cols, dtype=object)[first_hit]
  return df

In [None]:
# Label columns of the label table: everything except the identifier.
y = list(train_labels.columns)
y.remove('sequence_id')



In [None]:
# Collapse the label columns into one 'target' column holding, per row,
# the name of the column that equals 1.
train_labels = col_for_single_1_in_row(train_labels, y, 'target')

In [None]:
# Class balance: share of training rows per target value.
train_labels['target'].value_counts(normalize=True)

I7FXTVDP    0.131488
RKJHZGDQ    0.043353
GTVTUGVY    0.042401
A18S09P2    0.016884
Q2K8NHZY    0.015440
              ...   
G2P73NZ0    0.000048
58BSUZQB    0.000048
WB78G3XF    0.000032
ON9AXMKF    0.000016
0L3Y6ZB2    0.000016
Name: target, Length: 1314, dtype: float64

In [None]:
# Encode the target names as integer class ids.
le = LabelEncoder().fit(train_labels['target'])

t_classes = le.transform(train_labels['target'])
train_labels['target_enc'] = t_classes

# Target name -> integer id, in the encoder's class order.
mapping = {name: code for code, name in enumerate(le.classes_)}

In [None]:
# Hold out 20% of the labelled rows for validation.
# Features and labels are paired by row position, so train_full and
# train_labels must be in the same row order.
X_train, X_test, y_train, y_test = train_test_split(
    train_full.drop(columns=['sequence_id']),
    train_labels['target_enc'],
    test_size=0.2,
    random_state=11,
)

In [None]:
# Columns passed to LightGBM as categorical features in the training cell.
features_cat = ['bacterial_resistance_ampicillin',
 'bacterial_resistance_chloramphenicol',
 'bacterial_resistance_kanamycin',
 'bacterial_resistance_other',
 'bacterial_resistance_spectinomycin',
 'copy_number_high_copy',
 'copy_number_low_copy',
 'copy_number_unknown',
 'growth_strain_ccdb_survival',
 'growth_strain_dh10b',
 'growth_strain_dh5alpha',
 'growth_strain_neb_stable',
 'growth_strain_other',
 'growth_strain_stbl3',
 'growth_strain_top10',
 'growth_strain_xl1_blue',
 'growth_temp_30',
 'growth_temp_37',
 'growth_temp_other',
 'selectable_markers_blasticidin',
 'selectable_markers_his3',
 'selectable_markers_hygromycin',
 'selectable_markers_leu2',
 'selectable_markers_neomycin',
 'selectable_markers_other',
 'selectable_markers_puromycin',
 'selectable_markers_trp1',
 'selectable_markers_ura3',
 'selectable_markers_zeocin',
 'species_budding_yeast',
 'species_fly',
 'species_human',
 'species_mouse',
 'species_mustard_weed',
 'species_nematode',
 'species_other',
 'species_rat',
 'species_synthetic',
 'species_zebrafish']

In [None]:
# Run mode - exactly one of the two flags is expected to be True:
#   full_run  - fit on all labelled rows and write a submission file
#   valid_run - fit on the training split and report hold-out accuracy
full_run = True
valid_run = False

# If both or neither flag is set, fall back to a validation run.
if (full_run and valid_run) or not(full_run or valid_run):
  full_run = False
  valid_run = True

print('start training')
print(datetime.datetime.now())

lgbm_classifier = lgb.LGBMClassifier(
    objective='multiclass',
    boosting='gbdt',
    learning_rate = 0.01,
    metric='multi_logloss',
    max_depth = 50,
    #num_leaves = 80,
    n_estimators = 150,
    # Derived from the encoder instead of a hard-coded 1314.
    num_class = len(le.classes_),
    random_state = rs)


if valid_run:
  lgbm_classifier.fit(X_train, y_train, categorical_feature=features_cat)


if full_run:
  lgbm_classifier.fit(train_full.drop(['sequence_id'], axis=1), train_labels['target_enc'], categorical_feature=features_cat)


print('end training')
print(datetime.datetime.now())

if valid_run:
  # Top-1 accuracy on the hold-out split.
  y_pred = lgbm_classifier.predict(X_test)
  print(str(np.mean(y_pred==y_test)))


if full_run:
  y_pred = lgbm_classifier.predict_proba(test_full.drop(['sequence_id'], axis=1))

  # Creating a list of laboratories
  list_labs = list(train_labels.columns)
  list_labs.remove('target')
  list_labs.remove('sequence_id')
  list_labs.remove('target_enc')

  # predict_proba columns follow lgbm_classifier.classes_ (encoded ids), not
  # the column order of train_labels. Name each probability column through
  # the encoder, then arrange the columns in the train_labels order. Labs the
  # model has never seen get probability 0.
  proba_labs = list(le.inverse_transform(lgbm_classifier.classes_))
  df_proba = pd.DataFrame(y_pred, columns=proba_labs, index=test_full.index)
  df_proba = df_proba.reindex(columns=list_labs, fill_value=0.0)

  df_res = pd.concat([test_full[['sequence_id']], df_proba], axis=1)

  df_res.to_csv(pth+'submission1.csv', index=False, sep=',')

print('finish')



start training
2020-08-22 12:29:42.698648


New categorical_feature is ['bacterial_resistance_ampicillin', 'bacterial_resistance_chloramphenicol', 'bacterial_resistance_kanamycin', 'bacterial_resistance_other', 'bacterial_resistance_spectinomycin', 'copy_number_high_copy', 'copy_number_low_copy', 'copy_number_unknown', 'growth_strain_ccdb_survival', 'growth_strain_dh10b', 'growth_strain_dh5alpha', 'growth_strain_neb_stable', 'growth_strain_other', 'growth_strain_stbl3', 'growth_strain_top10', 'growth_strain_xl1_blue', 'growth_temp_30', 'growth_temp_37', 'growth_temp_other', 'selectable_markers_blasticidin', 'selectable_markers_his3', 'selectable_markers_hygromycin', 'selectable_markers_leu2', 'selectable_markers_neomycin', 'selectable_markers_other', 'selectable_markers_puromycin', 'selectable_markers_trp1', 'selectable_markers_ura3', 'selectable_markers_zeocin', 'species_budding_yeast', 'species_fly', 'species_human', 'species_mouse', 'species_mustard_weed', 'species_nematode', 'species_other', 'species_rat', 'species_synthetic

end training
2020-08-22 18:06:17.765742
finish
