Code based on: https://www.kaggle.com/competitions/see-click-predict-fix/discussion/6466

In [1]:
import sys
import pandas as pd
import numpy as np


from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
"""
usage:

To use the default date range and output result as 'submit1.csv'
python meanValue.py 1

To use all data from Jan. 1 to Apr. 30
python meanValue.py 1 '2013-01-01'
"""

#edit DATA_PATH to the directory with train and test
DATA_PATH = ''
#edit SUBMIT_PATH to where you want it to go
SUBMIT_PATH = 'submit%d.csv'
TRAIN_PATH = 'input/train.csv'
TEST_PATH = 'input/test.csv'
LABEL_COLS = ['num_views', 'num_votes', 'num_comments']
SUBMIT_COLS = ['id','num_views', 'num_votes', 'num_comments']


def meanValueByCityAndSource(submit_num, start='2013-04-03'):
    train = pd.read_csv(TRAIN_PATH)
    test = pd.read_csv(TEST_PATH)
    #Sets the index to a time series so we can slice by date.
    train.index = pd.DatetimeIndex(train.created_time)
    train = train[start:]
    #log transform targets
    for col in LABEL_COLS:
        train[col] = np.log(train[col] + 1)
    train.source.fillna('nodata', inplace=True)
    test.source.fillna('nodata', inplace=True)
    #collapse source to 3 values: remote_api=2, city_initiated=1, everything else=0
    train['src'] = 2*(train.source == 'remote_api_created') + (train.source == 'city_initiated')
    test['src'] = 2*(test.source == 'remote_api_created')+ (test.source == 'city_initiated')
    #add city: NH=0, CHI=1, RICH=2, OAK=3
    train['city'] = 2*(train.latitude < 40) + (train.longitude < -80)
    test['city'] = 2*(test.latitude < 40) + (test.longitude < -80)
    train = train[['city', 'src', 'num_views', 'num_votes', 'num_comments']]
    #predict the mean value, grouped by city and (reduced) source
    mean_vals = train.groupby(['city', 'src']).median()
    test = test.merge(mean_vals,
             how = 'left', 
             left_on = ['city', 'src'],
             right_index = True,
             sort = False,
             copy = False)
    #raw transform predictions
    for col in LABEL_COLS:
        test[col] = np.exp(test[col]) - 1
    submitpath = SUBMIT_PATH % submit_num
    test[SUBMIT_COLS].to_csv(submitpath, float_format='%.6f', index=False)
  

# if __name__ ='__main__':
meanValueByCityAndSource(1,'2013-01-01')
#     submit_num = int(sys.argv[1])
#     if len(sys.argv) == 3:
#         meanValueByCityAndSource(submit_num, sys.argv[2])
#     else:
#         meanValueByCityAndSource(submit_num)

In [3]:
#edit DATA_PATH to the directory with train and test
DATA_PATH = ''
#edit SUBMIT_PATH to where you want it to go
SUBMIT_PATH = 'submit%d.csv'
TRAIN_PATH = 'input/train.csv'
SAMPLE_PATH = 'input/sampleSubmission.csv'
TEST_PATH = 'input/test.csv'
LABEL_COLS = ['num_views', 'num_votes', 'num_comments']
TRAIN_COLS = ["latitude", "longitude", "source"]
REDUCED_TRAIN_COLS = [""]
SUBMIT_COLS = ['id','num_views', 'num_votes', 'num_comments']

start='2013-02-01'
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sub = pd.read_csv(SAMPLE_PATH)

In [4]:
#Sets the index to a time series so we can slice by date.
train.index = pd.DatetimeIndex(train.created_time)
train = train[start:]
#log transform targets
for col in LABEL_COLS:
    train[col] = np.log(train[col] + 1)
train.source.fillna('nodata', inplace=True)
test.source.fillna('nodata', inplace=True)
train["source"] = 2*(train.source == 'remote_api_created') + (train.source == 'city_initiated')
test["source"] = 2*(test.source == 'remote_api_created') + (test.source == 'city_initiated')
#collapse source to 3 values: remote_api=2, city_initiated=1, everything else=0
train['src'] = 2*(train.source == 'remote_api_created') + (train.source == 'city_initiated')
test['src'] = 2*(test.source == 'remote_api_created')+ (test.source == 'city_initiated')
#add city: NH=0, CHI=1, RICH=2, OAK=3
train['city'] = 2*(train.latitude < 40) + (train.longitude < -80)
test['city'] = 2*(test.latitude < 40) + (test.longitude < -80)

y = train[["num_views", "num_votes", "num_comments"]]
# y_views = train[['num_views']]
# y_votes = train[['num_votes']]
# y_comments = train[['num_comments']]

train = train[['city', 'src']]
test = test[['city', 'src']]

In [5]:
train

Unnamed: 0_level_0,city,src
created_time,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-02-01 00:00:48,1,0
2013-02-01 00:01:13,1,0
2013-02-01 00:02:41,1,0
2013-02-01 00:02:47,3,0
2013-02-01 00:03:19,1,0
...,...,...
2013-04-30 23:39:39,1,0
2013-04-30 23:39:39,1,0
2013-04-30 23:45:39,1,0
2013-04-30 23:45:39,1,0


In [6]:
test

Unnamed: 0,city,src
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
149570,0,0
149571,2,0
149572,2,0
149573,2,0


In [7]:
submit_num = 2
models = [XGBRegressor(), LGBMRegressor()]
for model in models:
    print(model)
    model = MultiOutputRegressor(model)
    model.fit(train, y)
    sub[LABEL_COLS] = model.predict(test)
    for col in LABEL_COLS:
        sub[col] = np.exp(sub[col]) - 1
        sub[col] = sub[col].clip(0)
    submit_path = SUBMIT_PATH % submit_num
    sub.to_csv(submit_path, float_format='%.6f', index=False)
    submit_num += 1

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


LGBMRegressor()


In [8]:
submit1 = pd.read_csv("submit1.csv")
submit2 = pd.read_csv("submit2.csv")
submit3 = pd.read_csv("submit3.csv")
sub[LABEL_COLS] = (0.8*submit1[LABEL_COLS] + 0.1*submit2[LABEL_COLS] + 0.1*submit3[LABEL_COLS])

submitpath = "ensemble.csv"
sub.to_csv(submitpath, float_format='%.6f', index=False)