In [None]:
from datetime import datetime, timedelta

# Notebook parameters

In [None]:
# Cloud Storage locations: input datasets, trained models, and scored output.
source_folder = 'gs://dsart_nearline1/pipelines/dataset1/'
target_folder = 'gs://dsart_nearline1/pipelines/bird1/'
model_folder = 'gs://dsart_nearline1/models/'
model_file = 'bird_0_0_1.xgb.json'
# Day to process: yesterday according to the local clock, formatted as YYYY-MM-DD.
sample_day = (datetime.now() - timedelta(days=1)).date().isoformat()

In [None]:
# Display the day that this run will process.
sample_day

# Dependencies

In [None]:
! pip install xgboost

In [None]:
import os
import pandas
import time
import numpy
import json
import xgboost
from dotenv import load_dotenv

# Load data

In [None]:
def clean_column_names(df):
    r"""Remove carriage returns ('\r') from the column labels of ``df`` in place.

    Only labels that contain '\r' are renamed. When at least one label changes,
    the old -> new mapping is printed before the rename is applied.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        None. ``df`` itself is modified.
    """
    renames = {name: name.replace('\r', '') for name in df.columns if '\r' in name}
    if renames:
        print('renaming', renames)
        df.rename(columns=renames, inplace=True)

In [None]:
# Remote object for the sample day, and the name used for its local copy.
file1 = f'{sample_day}.csv'
file1_local = f'{sample_day}_dataset1.csv'
file1_source = f'{source_folder}{file1}'
(file1_source, file1_local)

In [None]:
! gsutil cp {file1_source} {file1_local}

In [None]:
# Parse the downloaded CSV; rows are split on '\n' only.
# NOTE(review): presumably this leaves a '\r' in the data for CRLF input, which is
# what the column-name cleanup in the next cell removes from the header - confirm.
df1 = pandas.read_csv(file1_local, lineterminator='\n')

In [None]:
# Strip any '\r' from the column names (renames df1's columns in place).
clean_column_names(df1)

In [None]:
# Preview the loaded frame.
df1

# Load Bird Model

In [None]:
# Full Cloud Storage path of the model file.
file2_source = f'{model_folder}{model_file}'
file2_source

In [None]:
! gsutil cp {file2_source} .

In [None]:
# Create an empty Booster; the saved model is loaded into it in the next cell.
bird = xgboost.Booster()

In [None]:
# Read the model from the file copied into the working directory.
bird.load_model(model_file)

In [None]:
# Feature names recorded in the loaded model; used below to select the input columns.
features = bird.feature_names
features

# Apply model

In [None]:
# Build the model input from the model's feature columns and score every row.
dmatrix = xgboost.DMatrix(df1[features])
predict = bird.predict(dmatrix)
len(predict)

In [None]:
# Store the score scaled by 100 as an integer; astype(int) truncates, it does not round.
df1['predict_like'] = (predict * 100).astype(int)

## Convert column types

In [None]:
features = [x for x in df1.columns if x.startswith('q_') or x.startswith('c_')  or x.startswith('t_')]
features

In [None]:
for f in features:
    df1[f] = (df1[f] * 100).astype(int)

In [None]:
# Columns that are cast to int before the frame is saved.
integers = [
    'num_like', 'num_recast', 'num_reply',
    'link_from_add', 'link_from_del', 'link_to_add', 'link_to_del',
    'num_follower', 'num_following',
    'num_follower_bin', 'text_len_bin',
]

In [None]:
# Missing values become 0 before the cast, because NaN cannot be converted to int.
for f in integers:
    df1[f] = df1[f].fillna(0).astype(int)

In [None]:
# Columns that are cast to bool before the frame is saved.
booleans = ['target_05', 'target_10', 'target_15', 'target_20', 'target_25']

In [None]:
# A plain astype(bool) turns NaN into True, because NaN is truthy. Mask with notna()
# so missing values become False, matching the fillna(0) used for the integer columns.
for f in booleans:
    df1[f] = df1[f].notna() & df1[f].astype(bool)

In [None]:
# Interpret `timestamp` as seconds since the Unix epoch. The conversion stays
# best-effort: on failure the column is blanked instead of aborting the run, but the
# reason is printed so the fallback is visible in the notebook output.
try:
    df1['timestamp'] = pandas.to_datetime(df1['timestamp'], unit='s')
except Exception as exc:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
    print('timestamp conversion failed, column set to None:', repr(exc))
    df1['timestamp'] = None

# Save output

In [None]:
# Preview the frame that is about to be written.
df1

In [None]:
# Local output file, named after the sample day.
output_file = sample_day + '.csv'
output_file

In [None]:
# Write without the index; floating-point values are formatted with two decimals.
df1.to_csv(output_file, index=False, float_format='%.2f')

In [None]:
! gsutil cp {output_file} {target_folder}