Willump cascades example: training and evaluating a LightGBM classifier on the music dataset

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import sklearn
import time
from lightgbm import LGBMClassifier
import redis
# If the kernel was started inside a "notebooks" directory, step up one level
# so the relative path below resolves.
if os.getcwd().endswith("notebooks"):
    os.chdir(os.pardir)
# Put tests/benchmark_scripts at the front of the import path; music_utils is
# imported right after this.
sys.path.insert(0, os.path.abspath("tests/benchmark_scripts"))
print(os.getcwd())
import music_utils
from music_utils import get_features_from_redis

/mnt/c/Users/Peter/Documents/GitHub/Willump-Simple


In [2]:
# Dictionary handed to willump_execute as train_cascades_dict further down.
train_cascades_dict = dict()
# Redis handle shared by every get_features_from_redis call in this notebook.
db = redis.StrictRedis(host="localhost")
train_X, test_X, train_y, test_y = music_utils.load_music_dataset(db)

In [3]:
def music_train(y, X_list):
    """Fit a LightGBM classifier on the column-wise concatenation of X_list.

    Args:
        y: Training labels, passed unchanged to ``LGBMClassifier.fit``.
        X_list: List of pandas objects joined side by side (``axis=1``).

    Returns:
        The fitted ``LGBMClassifier``.
    """
    features = pd.concat(X_list, axis=1)
    # Keep only columns named in music_utils.FEATURES, in that list's order.
    kept_columns = [name for name in music_utils.FEATURES if name in features.columns]
    features = features[kept_columns]
    classifier = LGBMClassifier(
        n_jobs=1,
        learning_rate=0.1,
        num_leaves=(2 ** 8),
        max_depth=15,
        metric="auc",
    )
    return classifier.fit(features, y)


def music_predict(model, X_list):
    """Predict with ``model`` on the column-wise concatenation of X_list.

    Args:
        model: Object exposing ``predict``.
        X_list: List of pandas objects joined side by side (``axis=1``).

    Returns:
        ``model.predict`` output, or an empty float32 array when there are
        no rows to predict on.
    """
    features = pd.concat(X_list, axis=1)
    # Keep only columns named in music_utils.FEATURES, in that list's order.
    kept_columns = [name for name in music_utils.FEATURES if name in features.columns]
    features = features[kept_columns]
    # Nothing to predict: short-circuit instead of calling the model.
    if len(features) == 0:
        return np.zeros(0, dtype=np.float32)
    return model.predict(features)


def music_predict_proba(model, X_list):
    """Return the positive-class probability for each row of X_list.

    Args:
        model: Object exposing ``predict_proba``.
        X_list: List of pandas objects joined side by side (``axis=1``).

    Returns:
        Column 1 of ``model.predict_proba`` output, or an empty float32 array
        when there are no rows (mirrors the empty-input guard in
        ``music_predict``).
    """
    X = pd.concat(X_list, axis=1)
    # Keep only columns named in music_utils.FEATURES, in that list's order.
    X = X[[f for f in music_utils.FEATURES if f in X.columns]]
    # Empty batch: return an empty result rather than handing zero rows to the model.
    if len(X) == 0:
        return np.zeros(0, dtype=np.float32)
    return model.predict_proba(X)[:, 1]


def music_score(true_y, pred_y):
    """Compute the area under the ROC curve.

    Args:
        true_y: Ground-truth labels; label 1 is treated as the positive class.
        pred_y: Predicted scores or labels for the same rows.

    Returns:
        The AUC computed from the ROC curve of ``pred_y`` against ``true_y``.
    """
    roc_points = sklearn.metrics.roc_curve(true_y, pred_y, pos_label=1)
    # roc_curve yields (fpr, tpr, thresholds); the thresholds are not needed.
    false_positive_rate, true_positive_rate = roc_points[0], roc_points[1]
    return sklearn.metrics.auc(false_positive_rate, true_positive_rate)

In [4]:
from willump.evaluation.willump_executor import willump_execute

# Training pipeline wrapped by willump_execute. The decorator is given the
# train / predict / predict_proba / score callbacks plus train_cascades_dict.
# NOTE(review): willump_execute presumably analyses this function's body, so
# keep the flat "one assignment per feature group, then return" shape and use
# only `#` comments here -- confirm against the Willump documentation.
@willump_execute(train_function=music_train,
                 predict_function=music_predict,
                 predict_proba_function=music_predict_proba,
                 score_function=music_score,
                 train_cascades_dict=train_cascades_dict)
def music_train_pipeline(input_X, input_y):
    # Each call asks get_features_from_redis for one feature group, passing the
    # input_X column to use (`column`) and a `name` argument along with the
    # shared Redis handle `db`.
    # NOTE(review): presumably `name` identifies the feature table in Redis and
    # `column` is the lookup key -- confirm in music_utils.
    user_latent_features = get_features_from_redis(input_X, column="msno", name="features_uf", db=db)
    song_latent_features = get_features_from_redis(input_X, column="song_id", name="features_sf", db=db)
    user_cluster_features = get_features_from_redis(input_X, column="cluster_msno_25", name="uc_features", db=db)
    song_cluster_features = get_features_from_redis(input_X, column="cluster_song_id_25", name="sc_features", db=db)
    artist_cluster_features = get_features_from_redis(input_X, column="cluster_artist_name_25", name="ac_features", db=db)
    user_features = get_features_from_redis(input_X, column="msno", name="us_features", db=db)
    song_features = get_features_from_redis(input_X, column="song_id", name="ss_features", db=db)
    artist_features = get_features_from_redis(input_X, column="artist_name", name="as_features", db=db)
    genre_features = get_features_from_redis(input_X, column="genre_max", name="gs_features", db=db)
    city_features = get_features_from_redis(input_X, column="city", name="cs_features", db=db)
    ages_features = get_features_from_redis(input_X, column="bd", name="ages_features", db=db)
    language_features = get_features_from_redis(input_X, column="language", name="ls_features", db=db)
    gender_features = get_features_from_redis(input_X, column="gender", name="gender_features", db=db)
    composer_features = get_features_from_redis(input_X, column="composer", name="composer_features", db=db)
    lyrs_features = get_features_from_redis(input_X, column="lyricist", name="lyrs_features", db=db)
    sns_features = get_features_from_redis(input_X, column="source_screen_name", name="sns_features", db=db)
    stabs_features = get_features_from_redis(input_X, column="source_system_tab", name="stabs_features", db=db)
    stypes_features = get_features_from_redis(input_X, column="source_type", name="stypes_features", db=db)
    regs_features = get_features_from_redis(input_X, column="registered_via", name="regs_features", db=db)
    # Train on all 19 feature groups, listed in the order they were fetched.
    return music_train(input_y,
                       [user_latent_features, song_latent_features, user_cluster_features, song_cluster_features,
                        artist_cluster_features, user_features, song_features, artist_features, genre_features,
                        city_features, ages_features, language_features, gender_features, composer_features,
                        lyrs_features, sns_features, stabs_features, stypes_features,
                        regs_features])

In [5]:
# Compute every feature group for the training rows; compute_features also
# returns a per-group cost mapping and the group names.
(train_X_features,
 feature_costs,
 feature_names) = music_utils.compute_features(train_X, db)
print(feature_costs)

{'user_latent_features': 1.0, 'song_latent_features': 1.0, 'user_cluster_features': 1.0, 'song_cluster_features': 1.0, 'artist_cluster_features': 1.0, 'user_features': 1.0, 'song_features': 1.0, 'artist_features': 1.0, 'genre_features': 1.0, 'city_features': 1.0, 'ages_features': 1.0, 'language_features': 1.0, 'gender_features': 1.0, 'composer_features': 1.0, 'lyrs_features': 1.0, 'sns_features': 1.0, 'stabs_features': 1.0, 'stypes_features': 1.0, 'regs_features': 1.0}


In [6]:
from willump.evaluation import cascades_construct

# Hold out 25% of the training features as a validation set (fixed seed).
(cascades_train_X, cascades_valid_X,
 cascades_train_y, cascades_valid_y) = cascades_construct.train_test_split(
    train_X_features, train_y, test_size=0.25, random_state=42)

# Full-feature model fitted on the training part of the split only.
train_set_full_model = music_train(cascades_train_y, cascades_train_X)

# Importance of each feature group, measured on the held-out validation part.
feature_importances = cascades_construct.calculate_feature_importances(
    train_set_full_model,
    cascades_valid_X, cascades_valid_y,
    music_predict, music_score,
    feature_names)
print(feature_importances)

    

{'user_latent_features': 0.10213267431559958, 'song_latent_features': 0.02305952441123338, 'user_cluster_features': 0.03969030068127466, 'song_cluster_features': 0.038511637001114485, 'artist_cluster_features': 0.010671684671719195, 'user_features': 0.015471531976813724, 'song_features': -0.0008132200801782563, 'artist_features': 0.0017234040855196309, 'genre_features': 0.004596475780300091, 'city_features': 0.006085051594384772, 'ages_features': 0.003875165264303093, 'language_features': 0.002185678601165697, 'gender_features': 0.0009996328937811327, 'composer_features': 0.00040907071583040633, 'lyrs_features': -0.0014232016450616003, 'sns_features': 0.033414313950302255, 'stabs_features': 0.01599452532899892, 'stypes_features': 0.007626831208284779, 'regs_features': 0.006837818857022726}


In [7]:
# Budget: half of the summed cost of all feature groups.
total_feature_cost = sum(feature_costs.values())
cost_cutoff = 0.5 * total_feature_cost
selected_indices = cascades_construct.select_features(feature_costs, feature_importances, cost_cutoff)
selected_features = [feature_names[idx] for idx in selected_indices]

print(selected_features)

# Summed cost of just the selected feature groups.
selected_feature_cost = sum(feature_costs[name] for name in selected_features)

threshold, expected_cost = cascades_construct.calculate_feature_set_performance(
    cascades_train_X, cascades_train_y,
    cascades_valid_X, cascades_valid_y,
    selected_indices,
    music_train, music_predict,
    music_predict_proba, music_score,
    train_set_full_model,
    selected_feature_cost,
    total_feature_cost)

print("Threshold: %f Expected Cost: %f" % (threshold, expected_cost))

['user_latent_features', 'song_latent_features', 'user_cluster_features', 'song_cluster_features', 'artist_cluster_features', 'user_features', 'sns_features', 'stabs_features', 'stypes_features']
Threshold: 0.600000 Expected Cost: 10.823030


In [8]:
# Budget: a quarter of the summed cost of all feature groups.
total_feature_cost = sum(feature_costs.values())
cost_cutoff = 0.25 * total_feature_cost
selected_indices = cascades_construct.select_features(feature_costs, feature_importances, cost_cutoff)
selected_features = [feature_names[idx] for idx in selected_indices]

print(selected_features)

# Summed cost of just the selected feature groups.
selected_feature_cost = sum(feature_costs[name] for name in selected_features)

threshold, expected_cost = cascades_construct.calculate_feature_set_performance(
    cascades_train_X, cascades_train_y,
    cascades_valid_X, cascades_valid_y,
    selected_indices,
    music_train, music_predict,
    music_predict_proba, music_score,
    train_set_full_model,
    selected_feature_cost,
    total_feature_cost)

print("Threshold: %f Expected Cost: %f" % (threshold, expected_cost))

['user_latent_features', 'user_cluster_features', 'song_cluster_features', 'sns_features']
Threshold: 0.600000 Expected Cost: 6.479664


In [9]:
# Retrain on the whole training set: one model on every feature group, one on
# only the selected groups.
full_model = music_train(train_y, train_X_features)
approximate_model = music_train(train_y, [train_X_features[idx] for idx in selected_indices])

# Bundle of selection, threshold and models; later passed to willump_execute
# as predict_cascades_dict.
cascades_params = {
    "selected_feature_indices": selected_indices,
    "cascade_threshold": threshold,
    "full_model": full_model,
    "approximate_model": approximate_model,
}

In [None]:
def music_eval_pipeline(input_X, model):
    """Fetch every feature group for input_X and predict with ``model``.

    Args:
        input_X: Rows to predict on; each feature group is looked up through
            one of its columns.
        model: Model handed to ``music_predict``.

    Returns:
        Whatever ``music_predict`` returns for the 19 feature groups.
    """
    # (column, name) pairs passed to get_features_from_redis, in the order the
    # resulting feature groups are handed to music_predict.
    feature_sources = (
        ("msno", "features_uf"),
        ("song_id", "features_sf"),
        ("cluster_msno_25", "uc_features"),
        ("cluster_song_id_25", "sc_features"),
        ("cluster_artist_name_25", "ac_features"),
        ("msno", "us_features"),
        ("song_id", "ss_features"),
        ("artist_name", "as_features"),
        ("genre_max", "gs_features"),
        ("city", "cs_features"),
        ("bd", "ages_features"),
        ("language", "ls_features"),
        ("gender", "gender_features"),
        ("composer", "composer_features"),
        ("lyricist", "lyrs_features"),
        ("source_screen_name", "sns_features"),
        ("source_system_tab", "stabs_features"),
        ("source_type", "stypes_features"),
        ("registered_via", "regs_features"),
    )
    feature_groups = [
        get_features_from_redis(input_X, column=column, name=name, db=db)
        for column, name in feature_sources
    ]
    return music_predict(model, feature_groups)

# Time the pipeline over the whole test set and report throughput.
time_start = time.time()
preds = music_eval_pipeline(test_X, full_model)
time_elapsed = time.time() - time_start
print("Elapsed Time %fs Num Rows %d Throughput %f rows/sec" %
      (time_elapsed, len(test_X), len(test_X) / time_elapsed))

# music_score is declared as (true_y, pred_y): ground-truth labels go first,
# predictions second. Passing them the other way round scores the labels
# against the predictions and reports a different number.
print("AUC Score: %f" % music_score(test_y, preds))

In [None]:
# Evaluation pipeline wrapped by willump_execute. The decorator is given the
# predict / predict_proba callbacks and cascades_params as predict_cascades_dict.
# NOTE(review): willump_execute presumably analyses this function's body, so
# keep the flat "one assignment per feature group, then return" shape and use
# only `#` comments here -- confirm against the Willump documentation.
@willump_execute(predict_function=music_predict,
                 predict_proba_function=music_predict_proba,
                 predict_cascades_dict=cascades_params)
def music_eval_pipeline(input_X, model):
    # Each call asks get_features_from_redis for one feature group, passing the
    # input_X column to use (`column`) and a `name` argument along with the
    # shared Redis handle `db`.
    # NOTE(review): presumably `name` identifies the feature table in Redis and
    # `column` is the lookup key -- confirm in music_utils.
    user_latent_features = get_features_from_redis(input_X, column="msno", name="features_uf", db=db)
    song_latent_features = get_features_from_redis(input_X, column="song_id", name="features_sf", db=db)
    user_cluster_features = get_features_from_redis(input_X, column="cluster_msno_25", name="uc_features", db=db)
    song_cluster_features = get_features_from_redis(input_X, column="cluster_song_id_25", name="sc_features", db=db)
    artist_cluster_features = get_features_from_redis(input_X, column="cluster_artist_name_25", name="ac_features", db=db)
    user_features = get_features_from_redis(input_X, column="msno", name="us_features", db=db)
    song_features = get_features_from_redis(input_X, column="song_id", name="ss_features", db=db)
    artist_features = get_features_from_redis(input_X, column="artist_name", name="as_features", db=db)
    genre_features = get_features_from_redis(input_X, column="genre_max", name="gs_features", db=db)
    city_features = get_features_from_redis(input_X, column="city", name="cs_features", db=db)
    ages_features = get_features_from_redis(input_X, column="bd", name="ages_features", db=db)
    language_features = get_features_from_redis(input_X, column="language", name="ls_features", db=db)
    gender_features = get_features_from_redis(input_X, column="gender", name="gender_features", db=db)
    composer_features = get_features_from_redis(input_X, column="composer", name="composer_features", db=db)
    lyrs_features = get_features_from_redis(input_X, column="lyricist", name="lyrs_features", db=db)
    sns_features = get_features_from_redis(input_X, column="source_screen_name", name="sns_features", db=db)
    stabs_features = get_features_from_redis(input_X, column="source_system_tab", name="stabs_features", db=db)
    stypes_features = get_features_from_redis(input_X, column="source_type", name="stypes_features", db=db)
    regs_features = get_features_from_redis(input_X, column="registered_via", name="regs_features", db=db)
    # Predict from all 19 feature groups, listed in the order they were fetched.
    return music_predict(model,
                         [user_latent_features, song_latent_features, user_cluster_features, song_cluster_features,
                          artist_cluster_features, user_features, song_features, artist_features, genre_features,
                          city_features, ages_features, language_features, gender_features, composer_features,
                          lyrs_features, sns_features, stabs_features, stypes_features,
                          regs_features])

# Two untimed calls on the first 100 rows before the measured run.
# NOTE(review): presumably these let willump_execute finish its one-time setup
# so that cost is not included in the timing below -- confirm.
music_eval_pipeline(test_X.iloc[0:100], full_model)
music_eval_pipeline(test_X.iloc[0:100], full_model)

# Time the pipeline over the whole test set and report throughput.
time_start = time.time()
preds = music_eval_pipeline(test_X, full_model)
time_elapsed = time.time() - time_start
print("Elapsed Time %fs Num Rows %d Throughput %f rows/sec" %
      (time_elapsed, len(test_X), len(test_X) / time_elapsed))

# music_score is declared as (true_y, pred_y): ground-truth labels go first,
# predictions second. Passing them the other way round scores the labels
# against the predictions and reports a different number.
print("AUC Score: %f" % music_score(test_y, preds))