In [None]:
# Enable the ml.googleapis.com and compute.googleapis.com services through
# the gcloud CLI, one command per service.
!gcloud services enable ml.googleapis.com
!gcloud services enable compute.googleapis.com

In [None]:
!mkdir friday_training

In [None]:
# Create an (empty) __init__.py inside friday_training so the directory is a
# Python package; the submit cell refers to it as friday_training.train.
!touch ./friday_training/__init__.py

In [None]:
%%writefile ./friday_training/train.py
import datetime
import xgboost as xgb
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import subprocess
from google.cloud import storage
import pandas as pd
import numpy as np

from sklearn import preprocessing
from math import sqrt
import datetime
#import matplotlib.pyplot as plt
#import seaborn as sns
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve

# Fill in your Cloud Storage bucket name
BUCKET_ID = 'friday_demo2'

public_bucket = storage.Client().bucket(BUCKET_ID)
blob = public_bucket.blob('Data/train.csv')
blob.download_to_filename('train.csv')

blob = public_bucket.blob('Data/test.csv')
blob.download_to_filename('test.csv')

#Read the data from the bucket
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


numeric_features = train.select_dtypes(include=[np.number])
numeric_features.dtypes


# categorical columns to convert
categorical_columns = ["Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years",
                       "Marital_Status", "Product_Category_1"]


# Join Train and Test Dataset so it can be cleaned all at once
train['source']='train'
test['source']='test'

data = pd.concat([train,test], ignore_index = True, sort = False)

#Get index of all columns with product_category_1 equal 19 or 20 from train and remove since not populated
condition = data.index[(data.Product_Category_1.isin([19,20])) & (data.source == "train")]
data = data.drop(condition)

# define example
#community_area = [num for num in range(78)]
# data = array(data)
#print(community_area)
# one hot encode

# convert categorical data to to numerical values.
# convert data in categorical columns to numerical values
"""encoders = {col:LabelEncoder() for col in categorical_columns}
for col in categorical_columns:
    data[col] = encoders[col].fit_transform(data[col])
    #data[col] = to_categorical(data[col])
    #print(data[col])
"""
data = pd.get_dummies(data, columns=categorical_columns, drop_first=False)
#print(data)

totalitem = data['User_ID'].value_counts().sort_index()
totalpurchase = data.groupby('User_ID').sum()['Purchase']
tot = pd.concat([totalitem, totalpurchase], axis =1, keys = ['Total_products', 'Total_purchase'])
data = pd.merge(data, tot, left_on = 'User_ID', right_index = True)
    
#Divide into test and train
train = data.loc[data['source']=="train"]
test = data.loc[data['source']=="test"]

#Drop unnecessary columns:
test.drop(['source'],axis=1,inplace=True)
train.drop(['source'],axis=1,inplace=True)

# remove column we are trying to predict ('Purchase') from features list and also removed product category 2 and 3 due to missing values
train_features = train.drop(['Purchase', 'Product_Category_2', 'Product_Category_3', 'Product_ID', 'User_ID'], axis=1)
test_features = test.drop(['Purchase', 'Product_Category_2', 'Product_Category_3', 'Product_ID', 'User_ID'], axis=1)
# create training labels list
train_labels = train[['Purchase']]
test_labels = test[['Purchase']]

# load data into DMatrix object
dtrain = xgb.DMatrix(train_features, train_labels)
dtest = xgb.DMatrix(test_features)
# train model
bst = xgb.train({}, dtrain, 20)


# Export the model to a file
model = 'model.bst'
bst.save_model('./model.bst')

# Upload the model to Cloud Storage
bucket = storage.Client().bucket(BUCKET_ID)
blob = bucket.blob(model)
blob.upload_from_filename(model)


#tree based learner for onehotencode
from sklearn.metrics import mean_squared_error
xg_reg = xgb.XGBRegressor(colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 200)

xg_reg.fit(train_features,train_labels)

preds = xg_reg.predict(train_features)
rmse = np.sqrt(mean_squared_error(train_labels, preds))
print("RMSE: %f" % (rmse))


In [None]:
# Tree-based learner on the one-hot encoded features: fit in the notebook
# kernel and report RMSE on the training data.
# NOTE(review): train_features and train_labels must already exist in the
# kernel. The %%writefile cell only writes train.py; it does not execute it.
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error

xg_reg = xgb.XGBRegressor(colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=200)
xg_reg.fit(train_features, train_labels)

preds = xg_reg.predict(train_features)
rmse = np.sqrt(mean_squared_error(train_labels, preds))
print("RMSE: %f" % (rmse))

In [None]:
# Grid-search n_estimators for xg_reg with 10-fold cross-validation and print
# the score of every candidate.
from sklearn.model_selection import GridSearchCV, KFold

n_estimators = range(50, 400, 50)
param_grid = dict(n_estimators=n_estimators)
# Plain KFold: this is a regression problem (XGBRegressor scored with
# neg_mean_squared_error), so class-stratified folds do not apply.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(xg_reg, param_grid, scoring="neg_mean_squared_error", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(train_features, train_labels)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# 3-fold cross-validation through xgb.cv: up to 250 boosting rounds, RMSE as
# the metric, early stopping after 10 rounds, results returned as a DataFrame.
params = dict(
    objective="reg:squarederror",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=250,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)


            

In [None]:
# Display the last 50 rows of the cross-validation results.
cv_results.iloc[-50:]

In [None]:
# Plot the feature importance of the fitted xg_reg model.
import matplotlib.pyplot as plt

# Size the figure up front and hand its axes to plot_importance. Changing
# rcParams['figure.figsize'] after the plot call does not resize a figure
# that has already been created.
fig, ax = plt.subplots(figsize=(50, 50))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()

In [None]:
# Fit an XGBRegressor with default settings and list the features ordered by
# descending importance.
import pandas as pd
from xgboost import XGBRegressor

# Deliberately not named `xgb`: other cells use that name as the xgboost
# module alias (xgb.cv, xgb.plot_importance, xgb.XGBRegressor), and rebinding
# it to an estimator instance breaks them on the next run.
default_reg = XGBRegressor()
default_reg.fit(train_features, train_labels)

imp = pd.DataFrame(
    default_reg.feature_importances_,
    columns=['Importance'],
    index=train_features.columns,
).sort_values(['Importance'], ascending=False)

print(imp)

In [None]:
# Grid-search max_depth and min_child_weight for an XGBRegressor with 5-fold
# cross-validation; the cell displays the best parameters and the best score.
from sklearn.model_selection import learning_curve, GridSearchCV
from xgboost import XGBRegressor

param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# reg_alpha was previously misspelled as reg_alpah, so the L1 regularisation
# term never took effect.
# The iid argument is not passed: it was removed from GridSearchCV in
# scikit-learn 0.24, and current behaviour matches the former iid=False.
gsearch1 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.07, n_estimators=1000, max_depth=5,
                           min_child_weight=1.5, gamma=0.03, subsample=0.95, colsample_bytree=0.4,
                           nthread=4, scale_pos_weight=1, seed=27, reg_alpha=0.75, reg_lambda=0.45),
    param_grid=param_test1, scoring='neg_mean_squared_error', n_jobs=4, cv=5)
gsearch1.fit(train_features, train_labels)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
import time

# Cloud Storage bucket referenced by the gcloud cells (job dir, model origin).
BUCKET_NAME = 'friday_demo2'
# Timestamped job name: the current Unix time in whole seconds is appended.
JOB_NAME = "friday_training_%d" % int(time.time())

In [None]:

# Submit the training job:
# - the code comes from the local ./friday_training package and the module
#   friday_training.train is the one that gets run;
# - gs://$BUCKET_NAME/friday_job_dir is given as the job directory;
# - the job is submitted to region us-east1 with runtime version 1.12,
#   Python 3.5 and the BASIC scale tier;
# - everything after the bare `--` (here --bucket-name) is meant for the
#   training module itself rather than for gcloud.
!gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir gs://$BUCKET_NAME/friday_job_dir \
  --package-path ./friday_training \
  --module-name friday_training.train \
  --region us-east1 \
  --runtime-version=1.12 \
  --python-version=3.5 \
  --scale-tier BASIC \
  --stream-logs \
  -- \
  --bucket-name $BUCKET_NAME

In [None]:
# Model name and version name used by the gcloud model/version/predict cells.
MODEL_NAME, VERSION_NAME = "BlackFridayPredictor", "friday_predictor"
# A timestamped version name is possible as an alternative:
#VERSION_NAME = "friday_predictor_{}".format(int(time.time()))

In [None]:
!gcloud ml-engine models create $MODEL_NAME --regions us-east1

In [None]:
!gcloud ml-engine versions create $VERSION_NAME \
  --model=$MODEL_NAME \
  --framework=xgboost \
  --origin=gs://$BUCKET_NAME/ \
  --python-version=3.5 \
  --runtime-version=1.12

In [None]:
# Keep the first ten rows of the test feature matrix and display them.
test_features_10 = test_features.iloc[:10]
test_features_10

In [None]:
# Constants for the prediction step. MODEL_NAME and VERSION_NAME repeat the
# values assigned earlier; DATA_FORMAT, REGION and JOB_NAME are not referenced
# by any cell visible in this notebook.
MODEL_NAME, VERSION_NAME = "BlackFridayPredictor", "friday_predictor"
REGION = 'us-east1'
DATA_FORMAT = "text"  # JSON data format
JOB_NAME = "friday_training_%d" % int(time.time())


In [None]:
INPUT_FILE="data.json"

!gcloud ai-platform predict --model $MODEL_NAME --version \
  $VERSION_NAME --json-instances $INPUT_FILE



In [None]:
%%writefile $INPUT_FILE
[1000004,1216,1,4,7,1,2,1,0,10,0,64902,83667,200699,20230,541656,1333]
[1000009,1063,1,2,17,2,0,0,2,4,0,311554,57076,28791,37165,541656,371]