##### Available data in raw form

In [0]:
%sql
-- Create the demo database if it is missing, rooted at the Delta data folder.
CREATE DATABASE IF NOT EXISTS demo
LOCATION "dbfs:/FileStore/code-templates/delta_data"

In [0]:
%sql
-- Confirm that the demo database is registered.
SHOW DATABASES;

databaseName
default
demo


In [0]:
%sql
-- List the tables currently registered in the demo database.
SHOW TABLES FROM demo;
-- To preview a table, uncomment one of the queries below:
-- SELECT * FROM demo.titanic_raw;
-- SELECT * FROM demo.his_tvr;

database,tableName,isTemporary
demo,boston,False
demo,cancer,False
demo,his_tvr,False
demo,titanic_raw,False


In [0]:
%sql
-- Uncomment to drop the table before re-creating it:
-- DROP TABLE IF EXISTS demo.his_tvr

In [0]:
# Remove the stored data files of a table (uncomment to run)
# dbutils.fs.rm('dbfs:/FileStore/code-templates/delta_data/eda_data',recurse=True)

In [0]:
# List the contents of the folder that holds the sample CSV inputs.
dbutils.fs.ls("dbfs:/FileStore/code-templates/data")

In [0]:
# Imports
import pandas as pd

##### AutoML
* Sample data for AutoML example
* dbfs:/FileStore/code-templates/data/Titanic_Raw.csv

In [0]:
# Read the CSV with pandas first so the dtypes pandas infers are carried
# over to the Spark DataFrame.
automl_data_pd = pd.read_csv("/dbfs/FileStore/code-templates/data/Titanic_Raw.csv")
automl_data = spark.createDataFrame(automl_data_pd)

# Overwrite any previous export so re-running this cell does not fail
# because the target path already exists.
automl_data.write.format("delta").mode("overwrite").save("dbfs:/FileStore/code-templates/delta_data/automl_data")

# Register the table only if it is not there yet; a plain CREATE TABLE
# raises on the second run of this cell.
spark.sql("CREATE TABLE IF NOT EXISTS demo.Titanic_Raw USING DELTA LOCATION 'dbfs:/FileStore/code-templates/delta_data/automl_data'")

##### EDA
* Sample data for EDA example
* dbfs:/FileStore/code-templates/data/his_tvr.csv

In [0]:
# Read the CSV with pandas first so the dtypes pandas infers are carried
# over to the Spark DataFrame.
eda_data_pd = pd.read_csv("/dbfs/FileStore/code-templates/data/his_tvr.csv", low_memory=False)

# Drop the columns that are not wanted in the EDA table.
eda_data_pd = eda_data_pd.drop(columns = ['Unnamed: 0', 'Rch´000 {Av(Wg)}', 'user_rating'])

# Parse the two date columns so they are stored as timestamps, not strings.
eda_data_pd["release_date"] = pd.to_datetime(eda_data_pd['release_date'])
eda_data_pd["Date"] = pd.to_datetime(eda_data_pd['Date'])
eda_data = spark.createDataFrame(eda_data_pd)

# Spaces are not allowed in delta table header names, so replace them with
# underscores. Renaming all columns in one toDF call avoids leaking a
# notebook-global loop variable named `col`.
eda_data = eda_data.toDF(*[name.replace(" ", "_") for name in eda_data.columns])

eda_data.write.format("delta").mode("overwrite").save("dbfs:/FileStore/code-templates/delta_data/eda_data")

# Register the table only if it is not there yet; a plain CREATE TABLE
# raises on the second run of this cell.
spark.sql("CREATE TABLE IF NOT EXISTS demo.his_tvr USING DELTA LOCATION 'dbfs:/FileStore/code-templates/delta_data/eda_data'")

##### Model Eval
* Sample data for Model_Eval example
* sklearn: load_boston, load_breast_cancer

In [0]:
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this import (and therefore the whole cell) needs scikit-learn < 1.2.
from sklearn.datasets import load_boston, load_breast_cancer


def _save_sklearn_dataset_as_delta(dataset_obj, table_name):
    """Write a scikit-learn dataset to Delta and register it in the demo database.

    Args:
        dataset_obj: Object returned by a scikit-learn ``load_*`` function; its
            'data', 'feature_names' and 'target' entries are read.
        table_name: Name used both for the folder under the Delta data root
            and for the table created as ``demo.<table_name>``.

    Returns:
        The Spark DataFrame that was written (features plus a "target" column).
    """
    dataset_pd = pd.DataFrame(dataset_obj['data'], columns=dataset_obj['feature_names'])
    dataset_pd["target"] = dataset_obj['target']
    dataset_sdf = spark.createDataFrame(dataset_pd)

    # Spaces are not allowed in delta table header names
    dataset_sdf = dataset_sdf.toDF(*[name.replace(" ", "_") for name in dataset_sdf.columns])

    delta_path = "dbfs:/FileStore/code-templates/delta_data/" + table_name
    # Overwrite and IF NOT EXISTS keep the cell re-runnable: without them the
    # second run fails on the existing path and on the existing table.
    dataset_sdf.write.format("delta").mode("overwrite").save(delta_path)
    spark.sql("CREATE TABLE IF NOT EXISTS demo." + table_name + " USING DELTA LOCATION '" + delta_path + "'")
    return dataset_sdf


## Cancer Data: Binary Classification
cancer_obj = load_breast_cancer()
cancer = _save_sklearn_dataset_as_delta(cancer_obj, "cancer")

## Boston Data: Regression
boston_obj = load_boston()
boston = _save_sklearn_dataset_as_delta(boston_obj, "boston")

In [0]:
# Preview the first rows of the pandas frame read from Titanic_Raw.csv.
automl_data_pd.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
