In [None]:
import pandas as pd
import numpy as np
from ThymeBoost import ThymeBoost as tb
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from datetime import datetime

# Read both CSV inputs from the working directory; `train` comes from the
# first file and `test` from the second.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train (3).csv', 'test_sfo_processed-2.csv')
)



In [None]:
# Timestamps used for the x-axis of the final plot; downstream they are
# formatted as dates on the assumption that they are epoch seconds.
# NOTE(review): the `* 100` implies `week_beg` is stored as epoch seconds / 100
# -- confirm against the data dictionary.
SECONDS_PER_WEEK = 604800  # 7 * 24 * 60 * 60
FORECAST_WEEKS = 29        # number of weekly steps generated after the last training week

timestamps = train['week_beg'].values * 100
test_timestamps = [timestamps[-1] + SECONDS_PER_WEEK * i for i in range(1, FORECAST_WEEKS + 1)]

# Object-typed columns hold numbers as text with ' ' for blanks: turn the
# blanks into 0 and cast the columns to float, for both frames.
for frame in (train, test):
    object_cols = frame.select_dtypes('object').columns
    frame[object_cols] = frame.loc[:, object_cols].replace(' ', 0).astype('float')

# Running week index (the current row index) as the first column.
train.insert(0, 'week_number', train.index.values)

# Collapse the per-competitor columns "<i>_competitor_<metric>" (i = 1..13)
# into one "competitors_<metric>" total per metric.
COMPETITOR_PREFIX = '_competitor_'
for col in ['_competitor_rating',
            '_competitor_coverage',
            '_competitor_money',
            '_competitor_digital',
            '_competitor_sponsorship_money',
            '_competitor_oon_money',
            '_competitor_radio_money',
            '_competitor_total']:

    member_cols = [str(i) + col for i in range(1, 14)]

    # Keep the whole metric name after the prefix. Taking only the text after
    # the last underscore maps '_competitor_money', '_competitor_sponsorship_money',
    # '_competitor_oon_money' and '_competitor_radio_money' all to 'money', so
    # each would overwrite the previous 'competitors_money' column.
    metric = col[len(COMPETITOR_PREFIX):]

    train['competitors_' + metric] = train[member_cols].sum(axis=1)
    train.drop(columns=member_cols, inplace=True)



In [None]:
# Sum the regional-TV columns of competitors 3, 5, 6, 7, 9 and 12 into a
# single 'competitors_tv_reg' column, then remove the individual columns.
tv_reg_cols = [str(i) + '_competitor_tv_reg' for i in (3, 5, 6, 7, 9, 12)]
train['competitors_tv_reg'] = train[tv_reg_cols].sum(axis=1)
train.drop(columns=tv_reg_cols, inplace=True)

# Remove the numbered (1..4) video columns for each listed suffix.
for col in ('_video_rating', '_video_money', '_video_coverage_5'):
    numbered_cols = [str(i) + col for i in range(1, 5)]
    train.drop(columns=numbered_cols, inplace=True)




In [None]:
# Feature columns = every column except the raw week timestamp and the target.
# list.remove raises ValueError if either column is missing, which also stops
# this cell from being silently re-run on an already-scaled frame.
num_cols = train.columns.values.tolist()
num_cols.remove('week_beg')
num_cols.remove('revenue')

target_col = ['revenue']

# Explicit copy: selecting columns yields a frame derived from the original,
# and assigning into it below would otherwise emit SettingWithCopyWarning.
train = train[num_cols + target_col].copy()

# Standardise the features and the target with separate scalers so that the
# target scaler can later be used on its own to undo the scaling.
ss = StandardScaler()
train[num_cols] = ss.fit_transform(train[num_cols])

ss_target = StandardScaler()
train[target_col] = ss_target.fit_transform(train[target_col])




In [None]:
X_train, y_train = train[num_cols], train[target_col]

X_train_list = X_train.values.tolist()
y_train_list = y_train.values.flatten().tolist()

# `tb` is the ThymeBoost.ThymeBoost *module* (see the import at the top), so
# the estimator has to be instantiated as tb.ThymeBoost(); calling the module
# itself raises TypeError.
tb_model = tb.ThymeBoost()

# ThymeBoost forecasts a single series: the target goes in as the first
# argument of fit() and the regressors are supplied through `exogenous`.
# NOTE(review): confirm keyword names against the installed ThymeBoost version.
fitted_output = tb_model.fit(np.asarray(y_train_list),
                             exogenous=np.asarray(X_train_list))

# NOTE(review): X_test is not defined in the cells visible here. It must hold
# the future regressors, with the same columns/order as `num_cols` and scaled
# with `ss`, one row per forecast week -- confirm where it is created.
future_exogenous = np.asarray(X_test.values)

# predict() needs the fitted output plus a horizon and returns a DataFrame;
# the point forecast is in its 'predictions' column.
forecast = tb_model.predict(fitted_output,
                            forecast_horizon=len(future_exogenous),
                            future_exogenous=future_exogenous)
pred = forecast['predictions'].values

# Undo the target standardisation for both the forecast and the history.
pred_unscaled = ss_target.inverse_transform(np.array(pred).reshape(-1, 1)).flatten()
train_revenue = ss_target.inverse_transform(train['revenue'].values.reshape(-1, 1)).flatten()

# Format the epoch-second timestamps as 'YYYY-MM-DD' labels (interpreted as
# UTC). pd.to_datetime(unit='s') replaces datetime.utcfromtimestamp, which is
# deprecated since Python 3.12. Plain lists are needed so that `+` below
# concatenates instead of adding element-wise.
weeks = pd.to_datetime(timestamps, unit='s').strftime('%Y-%m-%d').tolist()
test_weeks = pd.to_datetime(test_timestamps, unit='s').strftime('%Y-%m-%d').tolist()

fig, ax = plt.subplots(figsize=(13, 10))

# Revenue is shown in millions.
ax.plot(weeks, train_revenue / 10**6, label='Historical revenue')
ax.plot(test_weeks, pred_unscaled / 10**6, color='red', label='Predicted revenue')

ax.set_title('Historical and Predicted Revenue')
ax.set_xlabel('Date')
ax.set_ylabel('Revenue (Millions)')
ax.legend()

# The x-axis is categorical (date strings); label only every 10th week.
ax.set_xticks((weeks + test_weeks)[::10])
ax.tick_params(axis='x', labelsize=9, labelrotation=45)
plt.show()