# Load data to a dataframe
- Download AdultCensusIncome.csv from [here](https://amldockerdatasets.azureedge.net/AdultCensusIncome.csv).
- Create a diretory named /spark_ml. Upload AdultCensusIncome.csv to /spark_ml

In [1]:
import os
import pandas as pd

datafile = "/spark_ml/AdultCensusIncome.csv"

# Read and Load data
# Create a Spark dataframe out of the csv file.
data_all = spark.read.format('csv').options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true').load(datafile)
print("({}, {})".format(data_all.count(), len(data_all.columns)))

#Replace "-" with "_" in column names
columns_new = [col.replace("-", "_") for col in data_all.columns]
data_all = data_all.toDF(*columns_new)
data_all.printSchema() #human-readable format

df = pd.DataFrame(data_all.take(10))
print(df.to_string())


(32561, 15)
root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)

   0                 1       2          3   4                      5                  6              7      8       9      10  11  12             13     14
0  39         State-gov   77516  Bachelors  13          Never-married       Adm-clerical  Not-in-family  White    Male   2174   0  40  United-States  <=50K
1  50  Self-emp-not-inc   83311  Bachelors 

# Data Preparation
- Choose the feature that we want to use to build the model
- Split the data set as training and test
- Write traning and data set as AdultCensusIncomeTrain and AdultCensusIncomeTest to /spark_ml directory



In [1]:
# Choose feature columns and the label column.
label = "income"
xvars = ["age", "hours_per_week"] #all numeric

print("label = {}".format(label))
print("features = {}".format(xvars))

select_cols = xvars
select_cols.append(label)
data = data_all.select(select_cols)

# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

print("train ({}, {})".format(train.count(), len(train.columns)))
print("test ({}, {})".format(test.count(), len(test.columns)))




label = income
features = ['age', 'hours_per_week']
train (24469, 3)
test (8092, 3)

# Data Persistence
- Save the training and test set as ORC data for persistance
- Will use the persisted data to build model


In [1]:
# Write the train and test data sets to intermediate storage
# Write the train and test data sets to intermediate storage
train_data_path = "/spark_ml/AdultCensusIncomeTrain"
test_data_path = "/spark_ml/AdultCensusIncomeTest"

train.write.mode('overwrite').orc(train_data_path)
test.write.mode('overwrite').orc(test_data_path)
print("train and test datasets saved to {} and {}".format(train_data_path, test_data_path))

train and test datasets saved to /spark_ml/AdultCensusIncomeTrain and /spark_ml/AdultCensusIncomeTest