In [None]:
import os
# Host of the Splice Machine JDBC endpoint; the setup cell reads it back
# through os.environ['JDBC_HOST'] when it builds the connection URL.
os.environ.update({'JDBC_HOST': 'jrtest01-splice-hregion'})


In [None]:
# setup-- 
import os
import pyspark
from splicemachine.spark.context import PySpliceContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
# Raises KeyError if JDBC_HOST has not been set (the first cell sets it).
jdbc_host = os.environ['JDBC_HOST']

conf = pyspark.SparkConf()
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-running this cell does not fail with "Cannot run multiple SparkContexts".
sc = pyspark.SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Alternative JDBC URL shape, kept for reference:
#   jdbc:splice://{FRAMEWORKNAME}-proxy.marathon.mesos:1527/splicedb;user=splice;password=admin
splicejdbc = f'jdbc:splice://{jdbc_host}:1527/splicedb;user=splice;password=admin'

# Splice Machine context bound to the Spark session and the JDBC URL above.
splice = PySpliceContext(spark, splicejdbc)


In [None]:
%%sql
%defaultDatasource jdbc:splice://jrtest01-splice-hregion:1527/splicedb;user=splice;password=admin

<link rel="stylesheet" href="https://doc.splicemachine.com/zeppelin/css/zepstyles2.css" />

# ETL Pipeline Example

This notebook presents a simple example of an ETL pipeline that reads a dataset from a public URL, performs simple transformations, and inserts data into a Splice Machine database.

### Extract
First we'll extract some data from a public URL, containing historical data about Telco customers and whether they discontinued service.  This is a good dataset for demonstrating machine learning algorithms for predicting customer churn:


In [None]:

import urllib.request
from io import StringIO
import csv
from pyspark.sql.types import *

#get file
url = "https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_Fn-UseC_-Telco-Customer-Churn.csv?cm_mc_uid=93701979699314920973859&cm_mc_sid_50200000=1492097385&cm_mc_sid_52640000=1492097385"

# urllib2 only exists in Python 2; urllib.request is the Python 3 equivalent.
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    # csv.reader needs text, but the response yields bytes, so decode first.
    # 'utf-8-sig' also drops a leading byte-order mark if one is present.
    # Assumes the file is UTF-8 encoded -- TODO confirm against the source.
    telcoCsv = response.read().decode('utf-8-sig')

# Lazy row iterator over the downloaded CSV text; consumed by the Transform cell.
telcoText = csv.reader(StringIO(telcoCsv))


### Transform

Next we'll do some transformation of the data stream.  In this case we are doing some simple data cleansing and type conversions, and adding an additional column whose value is a function of other data in the existing columns:

In [None]:

#grab header and rows
# next(reader) is the Python 3 spelling; reader.next() only exists in Python 2.
header = [name.upper() for name in next(telcoText)]

#handle some missing values
# Keep only rows whose column 19 is non-blank, so float() below cannot fail on
# an empty or whitespace-only value.
rows = [row for row in telcoText if row[19].strip()]

# Type conversions, by column position (0-based):
#   2 -> bool (a '0'/'1' flag), 5 -> int, 18 and 19 -> float
for row in rows:
    row[2] = bool(int(row[2]))
    row[5] = int(row[5])
    row[18] = float(row[18])
    row[19] = float(row[19])

#get fields for schema
# Header names were upper-cased above, so the lookup keys must be upper-case
# too; any column not listed here stays a string.
typed_columns = {
    'SENIORCITIZEN': BooleanType(),
    'TENURE': IntegerType(),
    'MONTHLYCHARGES': DoubleType(),
    'TOTALCHARGES': DoubleType(),
}
fields = [
    StructField(col, typed_columns.get(col, StringType()), True)
    for col in header
]
#create schema
schema = StructType(fields)

#create DF
telcoDF = sc.parallelize(rows).toDF(schema)

In [None]:


# Print a preview of the transformed DataFrame (show()'s default arguments,
# written out explicitly).
telcoDF.show(n=20, truncate=True)

### Load

Now that we have the data we want to load into the database, we'll create the target table and load the DataFrame directly into it using the Native Spark DataSource:

In [None]:
%%sql 

CREATE TABLE DS.etl_example (
    CUSTOMERID       VARCHAR(150),
    gender           VARCHAR(50),
    SeniorCitizen    BOOLEAN,
    Partner          VARCHAR(150),
    Dependents       VARCHAR(150),
    tenure           INT,
    PhoneService     VARCHAR(150),
    MultipleLines    VARCHAR(150),
    InternetService  VARCHAR(150),
    OnlineSecurity   VARCHAR(150),
    OnlineBackup     VARCHAR(150),
    DeviceProtection VARCHAR(150),
    TechSupport      VARCHAR(150),
    StreamingTV      VARCHAR(150),
    StreamingMovies  VARCHAR(150),
    Contract         VARCHAR(150),
    PaperlessBilling VARCHAR(150),
    PaymentMethod    VARCHAR(150),
    MonthlyCharges   DOUBLE,
    TotalCharges     DOUBLE,
    Churn            VARCHAR(150)
);

In [None]:


# Build the Splice context the same way the setup cell does: Spark session
# first, then the JDBC URL. The previous call passed an undefined `sqlContext`
# and a hard-coded localhost URL instead of the configured JDBC_HOST.
splice = PySpliceContext(spark, splicejdbc)

# Insert the transformed DataFrame into the table created in the previous cell.
splice.insert(telcoDF, 'DS.etl_example')

In [None]:
%%sql 
SELECT * FROM DS.etl_example;

## Where to Go Next
To finish this class, please complete the exercises in the [*Exercises for This Class*](./j.%20Exercises.ipynb) notebook.
