In [1]:
from pyspark import RDD, SparkContext

import pandas as pd
import json

import pickle

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error




In [2]:
#train_data = "../resource/asnlib/publicdata/train_review.json"
#user_data = '../resource/asnlib/publicdata/user.json'
#business_data = '../resource/asnlib/publicdata/business.json'

train_data = "data/review_train.json"
user_data =  "data/user.json"
business_data = 'data/business.json'

In [3]:
def feature_select_user(partition):
    for row in partition:
        data = json.loads(row)
        yield data['user_id'], data['useful'], data['average_stars'], data['review_count']


def feature_select_business(partition):
    for row in partition:
        data = json.loads(row)
        yield data['business_id'], float(data['stars']), float(data['review_count']), data['state']

def feature_select_train(partition):
    for row in partition:
        data = json.loads(row)
        yield data['user_id'], data['business_id'], float(data['stars'])

In [4]:
test_partitions = 2
sc = SparkContext("local", "HW3").getOrCreate()
sc.setLogLevel("ERROR")

# Note: During predict, just map review as 0 and overwrite with prediction

In [5]:
reviews_RDD = sc.textFile(train_data).mapPartitions(feature_select_train)

In [6]:
biz_RDD = sc.textFile(business_data).mapPartitions(feature_select_business).map(lambda x:(x[0],x))

In [7]:
user_RDD = sc.textFile(user_data).mapPartitions(feature_select_user).map(lambda x:(x[0],x))

In [8]:
rev_w_biz_RDD = reviews_RDD.map(lambda x: (x[1], (x[0],x[2]))).leftOuterJoin(biz_RDD).map(lambda x: (x[1][0][0],(list([x[1][0][1]])+list(x[1][1]))))

In [9]:
rev_w_biz_plus_user_RDD = rev_w_biz_RDD.leftOuterJoin(user_RDD).map(lambda x: (list([x[0]]) +  list([x[1][0][1]])+ x[1][0][2:] + list(x[1][1][1:]) + list([x[1][0][0]])))

In [10]:
main_train_file = rev_w_biz_plus_user_RDD.collect()

In [11]:
data_df = pd.DataFrame(main_train_file)

In [12]:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data_df[4].unique())
data_df[4] = le.transform(data_df[4])

In [13]:
train_df = data_df.iloc[: , 2:-1 ]
test_df = data_df.iloc[: , -1: ]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(train_df, test_df, test_size=0.1, random_state=44)

In [24]:
xgb_model = XGBRegressor(num_parallel_tree=2, max_depth=8, n_estimators=100, learning_rate=0.10)

In [25]:
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
rms = mean_squared_error(y_test, xgb_preds, squared=True)
print(rms)

0.9749636019730629


# Wandb

In [16]:
import wandb

In [17]:
wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mrbzkafzx[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
from wandb.xgboost import WandbCallback
import numpy as np

In [None]:
def train():
    with wandb.init() as run:
        bst_params = {
            'objective': 'reg:squarederror',
            'n_estimators': 60,
            #'booster': run.config.booster,
            'learning_rate': run.config.learning_rate,
            'max_depth': run.config.max_depth,
            'eval_metric': ['rmse'],
            'tree_method': 'gpu_hist',
        }

        # Initialize the XGBoostClassifier
        xgbmodel = XGBRegressor(**bst_params)

        # Train the model, using the wandb_callback for logging
        xgbmodel.fit(X_train, y_train,
                     eval_set=[(X_test, y_test)],
                     callbacks=[
                         WandbCallback(log_model=True,
                                       log_feature_importance=False,
                                       define_metric=True)
                     ],
                     verbose=False)

        preds = xgbmodel.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, preds))
        print("RMSE: %f" % (rmse))
        wandb.log({"Valid_RMSE": rmse})

In [None]:
sweep_config = {
  "name" : "btc_hyperparam_search2",
  "method" : "random",
  "parameters" : {
    "learning_rate": {
      "min": 0.01,
      "max": 0.2
    },
    "max_depth": {
        "min": 4,
        "max": 10
    },
    "n_estimators": {
        "values": [50,100,150,200,250]
    },
  }
}



In [None]:
sweep_id = wandb.sweep(sweep_config, project='btc_hyperparam_search')

In [63]:
with open('model.pkl','wb') as f:
    pickle.dump(xgb_model,f)