## Demo

In [1]:
# Import Packages
import os
import sys
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
import sys
import os
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import to_timestamp
import time
from pyspark.sql import functions as fn
from pyspark.ml import feature, regression, Pipeline

import datetime
from pyspark.sql.functions import year, month, dayofmonth, hour, minute, second
from pyspark.mllib.stat import Statistics
import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from pyspark.sql.types import FloatType
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName('demo').getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

from pyspark.ml import Pipeline
from pyspark.ml import feature
from pyspark.ml import classification
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
import ipywidgets
from ipywidgets import widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import display
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Function that Loads model based on user input

In [2]:
# Function to load the saved models in saved_models folder to make prediction on new data point
def predict(device_id_options,
    label_id_options,
    app_id_options,
    event_id_options,
    is_active_options,
    device_model_options,
    phone_brand_options,
    town_options,
    country_options,
    age_group_options,
    time_of_day_options,
    category_options,
    model):
    data = {'device_id': [device_id_options.value], 
        'label_id': [label_id_options.value],
        'app_id': [app_id_options.value],
        'event_id': [event_id_options.value],
        'is_active': [is_active_options.value],
        'device_model': [device_model_options.value],
        'phone_brand': [phone_brand_options.value],
        'town': [town_options.value],
        'country': [country_options.value],
        'age_group': [age_group_options.value],
        'time_of_day': [time_of_day_options.value],
        'category': [category_options.value]
       }

    df = pd.DataFrame(data)
    ddf = spark.createDataFrame(df)

    float_columns = ['device_id', 'app_id', 'label_id', 'event_id']
    int_columns = ['is_active']
    string_columns = ['phone_brand', 'device_model', 'town', 'country', 'category', 'time_of_day', 'age_group']

    ddf = ddf.select(*(col(c).cast("float").alias(c) for c in float_columns), \
                                                     *(col(c).cast("int").alias(c) for c in int_columns), \
                                                     *(col(c).alias(c) for c in string_columns))

    if model == 'logistic':
        model_load = Pipeline.load("../saved_models/logistic")
    if model == 'random_forest':
        model_load = Pipeline.load("../saved_models/random_forest")
    if model == 'gbt':
        model_load = Pipeline.load("../saved_models/GBT")
    
    train_data = spark.read.format('csv').option('header', 'true').load('../modeled_data/spark_data3.csv')

    prediction = model_load.fit(train_data).transform(ddf)

    return prediction.select('prediction').take(1)

In [3]:
# Load options
data = spark.read.format('csv').option('header', 'true').load('../modeled_data/spark_data3.csv')
data = data.toPandas()

device_id_options = widgets.Dropdown(description = "device_id", options = data.device_id.unique(), value = None)
label_id_options = widgets.Dropdown(description = "label_id", options = data.label_id.unique(), value = None)
app_id_options = widgets.Dropdown(description = "app_id", options = data.app_id.unique(), value = None)
event_id_options = widgets.Dropdown(description = "event_id", options = data.event_id.unique(), value = None)
is_active_options = widgets.Dropdown(description = "is_active", options = data.is_active.unique(), value = None)
device_model_options = widgets.Dropdown(description = "device_model", options = data.device_model.unique(), value = None)
phone_brand_options = widgets.Dropdown(description = "phone_brand", options = data.phone_brand.unique(), value = None)
town_options = widgets.Dropdown(description = "town", options = data.town.unique(), value = None)
country_options = widgets.Dropdown(description = "country", options = data.country.unique(), value = None)
age_group_options = widgets.Dropdown(description = "age_group", options = data.age_group.unique(), value = None)
time_of_day_options = widgets.Dropdown(description = "time_of_day", options = data.time_of_day.unique(), value = None)
category_options = widgets.Dropdown(description = "category", options = data.category.unique(), value = None)

## User Interface

In [4]:
device_id_options
label_id_options
app_id_options
event_id_options
is_active_options
device_model_options
phone_brand_options
town_options
country_options
age_group_options
time_of_day_options
category_options


from IPython.display import display
button1 = widgets.Button(description="Logistic Regression Prediction", button_style='danger')
display(button1)

button2 = widgets.Button(description="Random Forest Prediction", button_style='danger')
display(button2)

button3 = widgets.Button(description="Gradient Boosting Prediction", button_style='danger')
display(button3)

def logistic_regression_predict(b):
    value = predict(device_id_options,
    label_id_options,
    app_id_options,
    event_id_options,
    is_active_options,
    device_model_options,
    phone_brand_options,
    town_options,
    country_options,
    age_group_options,
    time_of_day_options,
    category_options,
    "logistic")
    
    gender = None
    
    if value[0]["prediction"] == 1.0:
        gender = "Female"
    else:
        gender = "Male"
    
    print('Logistic Regression Predicted Result for the selection is - ', gender)
        
def random_forest_predict(b):
    value = predict(device_id_options,
    label_id_options,
    app_id_options,
    event_id_options,
    is_active_options,
    device_model_options,
    phone_brand_options,
    town_options,
    country_options,
    age_group_options,
    time_of_day_options,
    category_options,
    "random_forest")
    
    gender = None
    
    if value[0]["prediction"] == 1.0:
        gender = "Female"
    else:
        gender = "Male"
    
    print('Random Forest Predicted Result for the selection is -  - ', gender)
        
        
def gradient_boost_predict(b):
    value = predict(device_id_options,
    label_id_options,
    app_id_options,
    event_id_options,
    is_active_options,
    device_model_options,
    phone_brand_options,
    town_options,
    country_options,
    age_group_options,
    time_of_day_options,
    category_options,
    "gbt")
    
    gender = None
    
    if value[0]["prediction"] == 1.0:
        gender = "Female"
    else:
        gender = "Male"

    
    print('Gradient Boost Predicted Result for the selection is - ', gender)

button1.on_click(random_forest_predict)
button2.on_click(logistic_regression_predict)
button3.on_click(gradient_boost_predict)


Dropdown(description='device_id', options=('-6754902882206380496', '5416618857406916680', '1698428484639625968…

Dropdown(description='label_id', options=('704', '209', '303', '172', '190', '730', '262', '548', '163', '302'…

Dropdown(description='app_id', options=('-145658454112781034', '2460654806659045896', '8693964245073640147', '…

Dropdown(description='event_id', options=('35963', '1773101', '37174', '801589', '401759', '2703917', '1975583…

Dropdown(description='is_active', options=('0', '1'), value=None)

Dropdown(description='device_model', options=('超级手机1', 'Galaxy S5', '荣耀6 Plus', 'X6 D', '红米1S', '荣耀7', 'MI 4',…

Dropdown(description='phone_brand', options=('lshi', 'samsung', 'Huawei', 'vivo', 'Xiaomi', 'meizu', 'Coolpad_…

Dropdown(description='town', options=('Chongqing', 'Henan', 'Shanxi', 'None', 'Hebei', 'Guangdong Sheng', 'Und…

Dropdown(description='country', options=('China', 'Undefined', 'Canada', 'Myanmar (Burma)', 'Kazakhstan', 'Aus…

Dropdown(description='age_group', options=('26-35', '22-26', '35-50', '59-90', '17-22', '50-59', '0-17'), valu…

Dropdown(description='time_of_day', options=('morning', 'evening', 'midnight', 'noon'), value=None)

Dropdown(description='category', options=('industry', 'social', 'other', 'travel', 'utilities', 'finance', 'ec…

Button(button_style='danger', description='Logistic Regression Prediction', style=ButtonStyle())

Button(button_style='danger', description='Random Forest Prediction', style=ButtonStyle())

Button(button_style='danger', description='Gradient Boosting Prediction', style=ButtonStyle())

Random Forest Predicted Result for the selection is -  -  Female
Logistic Regression Predicted Result for the selection is -  Male
Gradient Boost Predicted Result for the selection is -  Female
Random Forest Predicted Result for the selection is -  -  Female
Logistic Regression Predicted Result for the selection is -  Male
Gradient Boost Predicted Result for the selection is -  Female
