Hello World!

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import sklearn
from lightgbm import LGBMClassifier
import redis
from willump.evaluation.willump_executor import willump_execute
# If the kernel was started inside a directory whose path ends with "notebooks",
# step up one level so the relative path used below resolves.
if (os.getcwd().endswith("notebooks")):
    os.chdir("..")
# Put tests/benchmark_scripts at the front of the import path; this has to run
# before the music_utils imports below (presumably that is where the module
# lives -- confirm against the repository layout).
sys.path.insert(0, os.path.abspath("tests/benchmark_scripts"))
# Echo the working directory so the saved output records where the cell ran.
print(os.getcwd())
import music_utils
from music_utils import get_features_from_redis

/mnt/c/Users/Peter/Documents/GitHub/Willump-Simple


In [6]:
# Redis client pointed at localhost; every other connection setting is left at
# the client's default.
db = redis.StrictRedis(host="localhost")
# The loader receives the redis handle and returns four values, unpacked here
# as train/validation inputs and train/validation labels.
train_X, valid_X, train_y, valid_y = music_utils.load_music_dataset(db)
# Starts empty; passed to @willump_execute as train_cascades_dict when the
# training pipeline is defined.
train_cascades_dict = {}

In [7]:
def music_train(y, X_list):
    """Fit an LGBMClassifier on the side-by-side concatenation of X_list.

    Args:
        y: Training labels, handed unchanged to ``LGBMClassifier.fit``.
        X_list: Frames joined column-wise with ``pd.concat(..., axis=1)``.

    Returns:
        Whatever ``LGBMClassifier.fit`` returns for the fitted classifier.
    """
    combined = pd.concat(X_list, axis=1)
    # Keep only columns named in the global FEATURES, in FEATURES order.
    selected_columns = [name for name in FEATURES if name in combined.columns]
    hyperparams = dict(
        n_jobs=1,
        learning_rate=0.1,
        num_leaves=(2 ** 8),
        max_depth=15,
        metric="auc",
    )
    classifier = LGBMClassifier(**hyperparams)
    return classifier.fit(combined[selected_columns], y)


def music_predict(model, X_list):
    """Predict with ``model`` on the column-wise concatenation of X_list.

    Args:
        model: Object exposing ``predict``.
        X_list: Frames joined column-wise with ``pd.concat(..., axis=1)``.

    Returns:
        ``model.predict`` applied to the FEATURES-filtered frame, or an empty
        float32 array when the frame has no rows.
    """
    combined = pd.concat(X_list, axis=1)
    # Restrict to columns named in the global FEATURES, in FEATURES order.
    selected = combined[[name for name in FEATURES if name in combined.columns]]
    # A zero-row batch never reaches the model.
    if len(selected) != 0:
        return model.predict(selected)
    return np.zeros(0, dtype=np.float32)


def music_predict_proba(model, X_list):
    """Return the second probability column predicted by ``model`` for X_list.

    Args:
        model: Object exposing ``predict_proba`` that returns a 2-D array.
        X_list: Frames joined column-wise with ``pd.concat(..., axis=1)``.

    Returns:
        Column 1 of ``model.predict_proba`` on the FEATURES-filtered frame, or
        an empty float32 array when the frame has no rows.
    """
    X = pd.concat(X_list, axis=1)
    # Restrict to columns named in the global FEATURES, in FEATURES order.
    X = X[[f for f in FEATURES if f in X.columns]]
    # Same empty-batch guard as music_predict: a zero-row frame is answered
    # directly instead of being passed to the model.
    if len(X) == 0:
        return np.zeros(0, dtype=np.float32)
    return model.predict_proba(X)[:, 1]


def music_score(true_y, pred_y):
    """Return the area under the ROC curve of ``pred_y`` against ``true_y``.

    Args:
        true_y: Ground-truth labels; label 1 is treated as the positive class.
        pred_y: Scores or predictions to rank against ``true_y``.

    Returns:
        The value of ``metrics.auc`` over the curve from ``metrics.roc_curve``.
    """
    # The notebook's import cell only runs `import sklearn`, which never binds
    # the bare name `metrics`; importing it here avoids a NameError when the
    # function is called.
    from sklearn import metrics

    fpr, tpr, _ = metrics.roc_curve(true_y, pred_y, pos_label=1)
    return metrics.auc(fpr, tpr)

In [8]:
# Training pipeline: fetches nineteen feature groups through
# get_features_from_redis and trains a model on them with music_train.
#
# The decorator is given the train / predict / predict_proba / score callables
# defined in this notebook plus the module-level train_cascades_dict.
# NOTE(review): the decorator may rely on the exact statement layout of this
# body (one assignment per feature group, then a single return) -- confirm
# before restructuring; documentation is kept as `#` comments for that reason.
#
#   input_X: passed to every get_features_from_redis call together with the
#            name of the column to use and the name of the feature set.
#   input_y: forwarded to music_train as the labels.
#   Returns whatever music_train returns.
@willump_execute(train_function=music_train,
                 predict_function=music_predict,
                 predict_proba_function=music_predict_proba,
                 score_function=music_score,
                 train_cascades_dict=train_cascades_dict)
def music_train_pipeline(input_X, input_y):
    # Lookups by user ("msno") and song ("song_id"); each of these two columns
    # is used again further down with a different feature-set name.
    user_latent_features = get_features_from_redis(input_X, column="msno", name="features_uf", db=db)
    song_latent_features = get_features_from_redis(input_X, column="song_id", name="features_sf", db=db)
    # Lookups by the three "cluster_*_25" columns.
    user_cluster_features = get_features_from_redis(input_X, column="cluster_msno_25", name="uc_features", db=db)
    song_cluster_features = get_features_from_redis(input_X, column="cluster_song_id_25", name="sc_features", db=db)
    artist_cluster_features = get_features_from_redis(input_X, column="cluster_artist_name_25", name="ac_features", db=db)
    # Second lookups on "msno" and "song_id", then one lookup per remaining column.
    user_features = get_features_from_redis(input_X, column="msno", name="us_features", db=db)
    song_features = get_features_from_redis(input_X, column="song_id", name="ss_features", db=db)
    artist_features = get_features_from_redis(input_X, column="artist_name", name="as_features", db=db)
    genre_features = get_features_from_redis(input_X, column="genre_max", name="gs_features", db=db)
    city_features = get_features_from_redis(input_X, column="city", name="cs_features", db=db)
    ages_features = get_features_from_redis(input_X, column="bd", name="ages_features", db=db)
    language_features = get_features_from_redis(input_X, column="language", name="ls_features", db=db)
    gender_features = get_features_from_redis(input_X, column="gender", name="gender_features", db=db)
    composer_features = get_features_from_redis(input_X, column="composer", name="composer_features", db=db)
    lyrs_features = get_features_from_redis(input_X, column="lyricist", name="lyrs_features", db=db)
    sns_features = get_features_from_redis(input_X, column="source_screen_name", name="sns_features", db=db)
    stabs_features = get_features_from_redis(input_X, column="source_system_tab", name="stabs_features", db=db)
    stypes_features = get_features_from_redis(input_X, column="source_type", name="stypes_features", db=db)
    regs_features = get_features_from_redis(input_X, column="registered_via", name="regs_features", db=db)
    # All nineteen groups go to music_train in the order they were fetched;
    # music_train concatenates them column-wise before fitting.
    return music_train(input_y,
                       [user_latent_features, song_latent_features, user_cluster_features, song_cluster_features,
                        artist_cluster_features, user_features, song_features, artist_features, genre_features,
                        city_features, ages_features, language_features, gender_features, composer_features,
                        lyrs_features, sns_features, stabs_features, stypes_features,
                        regs_features])

In [9]:
# compute_features receives the training inputs and the redis handle and
# returns two values; the second is printed below. In the saved output it is a
# dict mapping each feature-group name to a cost of 1.0.
train_X_features, feature_costs = music_utils.compute_features(train_X, db)
print(feature_costs)

{'user_latent_features': 1.0, 'song_latent_features': 1.0, 'user_cluster_features': 1.0, 'song_cluster_features': 1.0, 'artist_cluster_features': 1.0, 'user_features': 1.0, 'song_features': 1.0, 'artist_features': 1.0, 'genre_features': 1.0, 'city_features': 1.0, 'ages_features': 1.0, 'language_features': 1.0, 'gender_features': 1.0, 'composer_features': 1.0, 'lyrs_features': 1.0, 'sns_features': 1.0, 'stabs_features': 1.0, 'stypes_features': 1.0, 'regs_features': 1.0}
