In [1]:
#set environment
import os
import sys
 
# Location of the Spark 2 client installation and of the Python libraries shipped with it.
spark_home = "/usr/hdp/current/spark2-client"
pylib = spark_home + "/python/lib"
# Use /usr/bin/python2.7 here if you want to use Python 2
python_bin = "/usr/local/anaconda/bin/python"

os.environ.update({
    "SPARK_HOME": spark_home,
    "PYLIB": pylib,
    "PYSPARK_PYTHON": python_bin,
    "PYSPARK_DRIVER_PYTHON": python_bin,
})
# Put the PySpark and py4j archives at the front of the module search path
# (pyspark.zip first, the py4j archive second).
sys.path[:0] = [pylib + "/pyspark.zip", pylib + "/py4j-0.10.4-src.zip"]

In [2]:
#import Sparksession driver
from pyspark.sql import SparkSession
# Obtain a session via getOrCreate(), registered under this application name.
spark = (
    SparkSession.builder
    .appName("Regression on life expectancy")
    .getOrCreate()
)

In [18]:
#Loading the data
# Treat the first CSV line as the header and let Spark infer each column's type.
df = spark.read.options(header=True, inferSchema=True).csv('data/LFED.csv')
df.show()

+---------------+---------------+-------------+-------+----------------------+----------+-------+----+-----------------+-----+-----------------+----------+--------+------------------+-----------+-------------------+------------------+-------------------------------+---------+----------------+-----------------+
|Life_expectancy|Adult_mortality|Infant_deaths|Alcohol|Percentage_expenditure|HepatitisB|Measles| BMI|Under_five_deaths|Polio|Total_expenditure|Diphtheria|HIV/AIDS|               GDP| Population|Thinness_1-19_years|Thinness_5-9_years|Income_composition_of_resources|Schooling|Status_Developed|Status_Developing|
+---------------+---------------+-------------+-------+----------------------+----------+-------+----+-----------------+-----+-----------------+----------+--------+------------------+-----------+-------------------+------------------+-------------------------------+---------+----------------+-----------------+
|           65.0|          263.0|           62|   0.01|         

In [19]:
# Preview only the first five rows.
df.show(n=5)

+---------------+---------------+-------------+-------+----------------------+----------+-------+----+-----------------+-----+-----------------+----------+--------+------------------+-----------+-------------------+------------------+-------------------------------+---------+----------------+-----------------+
|Life_expectancy|Adult_mortality|Infant_deaths|Alcohol|Percentage_expenditure|HepatitisB|Measles| BMI|Under_five_deaths|Polio|Total_expenditure|Diphtheria|HIV/AIDS|               GDP| Population|Thinness_1-19_years|Thinness_5-9_years|Income_composition_of_resources|Schooling|Status_Developed|Status_Developing|
+---------------+---------------+-------------+-------+----------------------+----------+-------+----+-----------------+-----+-----------------+----------+--------+------------------+-----------+-------------------+------------------+-------------------------------+---------+----------------+-----------------+
|           65.0|          263.0|           62|   0.01|         

In [20]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- Life_expectancy: double (nullable = true)
 |-- Adult_mortality: double (nullable = true)
 |-- Infant_deaths: integer (nullable = true)
 |-- Alcohol: double (nullable = true)
 |-- Percentage_expenditure: double (nullable = true)
 |-- HepatitisB: double (nullable = true)
 |-- Measles: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- Under_five_deaths: integer (nullable = true)
 |-- Polio: double (nullable = true)
 |-- Total_expenditure: double (nullable = true)
 |-- Diphtheria: double (nullable = true)
 |-- HIV/AIDS: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- Thinness_1-19_years: double (nullable = true)
 |-- Thinness_5-9_years: double (nullable = true)
 |-- Income_composition_of_resources: double (nullable = true)
 |-- Schooling: double (nullable = true)
 |-- Status_Developed: integer (nullable = true)
 |-- Status_Developing: integer (nullable = true)



In [21]:
#Check for missing values
# Count the nulls of all columns in one aggregation, so the data is scanned once
# instead of running a separate filter/count job for every column.
from pyspark.sql import functions as F

null_counts = df.select(
    # when(...) yields 1 for null cells and null otherwise; count() ignores nulls,
    # so each aggregate is the number of null cells (0 when there are none).
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()
# The aggregated columns are in df.columns order, so pair them positionally.
for name, n_null in zip(df.columns, null_counts):
    print("no. of cells in column", name, "with null values:", n_null)

no. of cells in column Life_expectancy with null values: 0
no. of cells in column Adult_mortality with null values: 0
no. of cells in column Infant_deaths with null values: 0
no. of cells in column Alcohol with null values: 0
no. of cells in column Percentage_expenditure with null values: 0
no. of cells in column HepatitisB with null values: 0
no. of cells in column Measles with null values: 0
no. of cells in column BMI with null values: 0
no. of cells in column Under_five_deaths with null values: 0
no. of cells in column Polio with null values: 0
no. of cells in column Total_expenditure with null values: 0
no. of cells in column Diphtheria with null values: 0
no. of cells in column HIV/AIDS with null values: 0
no. of cells in column GDP with null values: 0
no. of cells in column Population with null values: 0
no. of cells in column Thinness_1-19_years with null values: 0
no. of cells in column Thinness_5-9_years with null values: 0
no. of cells in column Income_composition_of_resource

In [22]:
#all the independent variables need to be packed into one column of vector type
from pyspark.ml.feature import VectorAssembler
# Every predictor column; 'Life_expectancy' is the label and is kept out of the vector.
feature_cols = [
    'Adult_mortality',
    'Infant_deaths',
    'Alcohol',
    'Percentage_expenditure',
    'HepatitisB',
    'Measles',
    'BMI',
    'Under_five_deaths',
    'Polio',
    'Total_expenditure',
    'Diphtheria',
    'HIV/AIDS',
    'GDP',
    'Population',
    'Thinness_1-19_years',
    'Thinness_5-9_years',
    'Income_composition_of_resources',
    'Schooling',
    'Status_Developed',
    'Status_Developing',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the assembled vector and the label for modelling.
assembled = assembler.transform(df)
feature_vec = assembled.select('features', 'Life_expectancy')
feature_vec.show(5)

+--------------------+---------------+
|            features|Life_expectancy|
+--------------------+---------------+
|[263.0,62.0,0.01,...|           65.0|
|[271.0,64.0,0.01,...|           59.9|
|[268.0,66.0,0.01,...|           59.9|
|[272.0,69.0,0.01,...|           59.5|
|[275.0,71.0,0.01,...|           59.2|
+--------------------+---------------+
only showing top 5 rows



In [24]:
# Split the data into train and test sets
# Weights: 75% of the rows for training, 25% for testing, with a fixed seed of 0.
train_data, test_data = feature_vec.randomSplit(weights=[0.75, 0.25], seed=0)

# Random Forest Regression

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor

# Random forest configuration: 110 trees, depth capped at 15, fixed seed.
model = RandomForestRegressor(
    featuresCol="features",
    labelCol='Life_expectancy',
    numTrees=110,
    maxDepth=15,
    minInfoGain=0.001,
    seed=0,
)
rfModel = model.fit(train_data)

# Evaluation of the model: score the held-out test rows and report R^2.
predictions = rfModel.transform(test_data)
evaluator = RegressionEvaluator(metricName='r2', labelCol='Life_expectancy')
evaluator.evaluate(predictions)

0.962063706844703

# Hyper-parameter tuning

In [27]:
#Grid Search
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Base estimator for the search; maxDepth and numTrees are left unset here
# because the parameter grid below supplies them.
model = RandomForestRegressor(labelCol='Life_expectancy', featuresCol="features",
                        minInfoGain=0.001, seed=0)
# 3 x 3 grid: every combination of tree depth and forest size is evaluated.
paramGrid = (ParamGridBuilder()
             .addGrid(model.maxDepth, [13, 14, 15])
             .addGrid(model.numTrees, [100, 110, 120])
             .build())

# Create 4-fold CrossValidator
# seed=0 pins the fold assignment, matching the seed used for the split and the
# estimator, so the averaged metrics are the same on every run.
cv = CrossValidator(estimator=model, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=4, seed=0)

cvModel = cv.fit(train_data)

In [28]:
#Best Model Params
# Pair each grid point's average cross-validation metric with its parameter map,
# then pick the pair with the highest metric.
score_params_list = [
    (avg_metric, param_map)
    for avg_metric, param_map in zip(cvModel.avgMetrics, cvModel.getEstimatorParamMaps())
]
max(score_params_list, key=lambda pair: pair[0])

(0.9541684082316708,
 {Param(parent='RandomForestRegressor_4e07838b217be1075ed6', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 15,
  Param(parent='RandomForestRegressor_4e07838b217be1075ed6', name='numTrees', doc='Number of trees to train (>= 1).'): 100})