# In [None]:
# Session bootstrap. findspark.init() is called with the local Spark install
# directory *before* pyspark is imported; keep this order.
# NOTE(review): presumably findspark is what makes `pyspark` importable in this
# environment -- confirm before reordering or merging these imports.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Single shared session for the whole script, registered under the
# application name 'exploratory_analysis'.
spark = SparkSession.builder.appName('exploratory_analysis').getOrCreate()
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
import pyspark.sql.functions as SparkFunctions
from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, NaiveBayes, DecisionTreeClassifier, LogisticRegression)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

# Load the cleaned dataset. The reader is only told about the header row and is
# not asked to infer a schema, so the numeric columns are cast explicitly below.
csv_reader = spark.read.format("csv").options(header="true")
cleaned_data = csv_reader.load("cleaned_data.csv")
cleaned_data.printSchema()

# (column name, target Spark type), applied in this order.
column_casts = [
    ("_c0", IntegerType()),
    ("year", IntegerType()),
    ("age_group_code", IntegerType()),
    ("average_weekly_income", FloatType()),
    ("expenditure_per_person", FloatType()),
    ("average_sale_price", FloatType()),
    ("hpi", FloatType()),
    ("cpi", FloatType()),
    ("annual_savings", FloatType()),
    ("downpayment_capacity", FloatType()),
    ("loan_to_value_ratio", FloatType()),
    ("affordability", IntegerType()),
]
for column_name, target_type in column_casts:
    # Replacing a column under its own name keeps its position in the frame.
    converted = cleaned_data[column_name].cast(target_type)
    cleaned_data = cleaned_data.withColumn(column_name, converted)

# Show the schema after casting, then preview the data.
cleaned_data.printSchema()
cleaned_data.show()

# Categorical string columns: each is first mapped to a numeric index column
# ("<name>_index") and that index is then one-hot encoded ("<name>_vec").
categorical_columns = ["region_code", "regions"]

indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in categorical_columns
]
encoders = [
    OneHotEncoder(inputCol=name + "_index", outputCol=name + "_vec")
    for name in categorical_columns
]

# Individual stage names, in the same order as `categorical_columns`.
region_code_indexer, regions_indexer = indexers
region_code_encoder, regions_encoder = encoders

# Combine all predictor columns into one vector column named "features".
#
# The label column 'affordability' is deliberately NOT part of the inputs:
# the classifiers in this script are trained with labelCol="affordability",
# so assembling it into the feature vector would hand the answer to the
# model (target leakage) and make every reported metric meaningless.
#
# NOTE(review): '_c0' looks like a row index carried over from the CSV export
# rather than a real predictor -- confirm and drop it from this list if so.
assembler = VectorAssembler(
    inputCols=[
        '_c0',
        'year',
        'region_code_vec',
        'regions_vec',
        'age_group_code',
        'average_weekly_income',
        'expenditure_per_person',
        'average_sale_price',
        'hpi',
        'cpi',
        'annual_savings',
        'downpayment_capacity',
        'loan_to_value_ratio',
    ],
    outputCol="features",
)

# Classifiers. All three read the assembled "features" vector, learn the
# "affordability" label and write their output to "prediction".
model_columns = dict(
    featuresCol="features",
    labelCol="affordability",
    predictionCol="prediction",
)
log_reg_model = LogisticRegression(**model_columns)
des_tree_model = DecisionTreeClassifier(**model_columns)
rand_for_model = RandomForestClassifier(**model_columns)
# Naive Bayes is currently disabled:
#nav_bay_model = NaiveBayes(**model_columns)

# One pipeline per classifier: index -> one-hot encode -> assemble -> classify.
feature_stages = indexers + encoders + [assembler]
pipeline_log_reg = Pipeline(stages=feature_stages + [log_reg_model])
pipeline_des_tree = Pipeline(stages=feature_stages + [des_tree_model])
pipeline_rand_for = Pipeline(stages=feature_stages + [rand_for_model])
#pipeline_nav_bay = Pipeline(stages=feature_stages + [nav_bay_model])

# Split into training (60%) and testing (40%) sets.
# A fixed seed makes the split reproducible; without it every run trains and
# evaluates on different rows, so metrics cannot be compared between runs.
SPLIT_SEED = 42
train_data, test_data = cleaned_data.randomSplit([0.6, 0.4], seed=SPLIT_SEED)

def _fit_and_predict(pipeline):
    """Fit ``pipeline`` on ``train_data`` and score ``test_data``.

    Shows the label next to the prediction for a preview of the scored rows,
    then returns ``(fitted_pipeline, scored_test_data)``.
    """
    fitted = pipeline.fit(train_data)
    scored = fitted.transform(test_data)
    scored.select('affordability', 'prediction').show()
    return fitted, scored


fit_model_log_reg, results_log_reg = _fit_and_predict(pipeline_log_reg)

fit_model_des_tree, results_des_tree = _fit_and_predict(pipeline_des_tree)

fit_model_rand_for, results_rand_for = _fit_and_predict(pipeline_rand_for)

# Naive Bayes is currently disabled:
#fit_model_nav_bay = pipeline_nav_bay.fit(train_data)
#results_nav_bay = fit_model_nav_bay.transform(test_data)
#results_nav_bay.select('affordability','prediction', 'features').show()

def _make_evaluator(metric_name):
    """Return an evaluator comparing 'affordability' with 'prediction' on ``metric_name``."""
    return MulticlassClassificationEvaluator(
        labelCol="affordability",
        predictionCol="prediction",
        metricName=metric_name,
    )


# One evaluator per reported metric.
eval_accuracy = _make_evaluator("accuracy")
eval_precision = _make_evaluator("weightedPrecision")
eval_recall = _make_evaluator("weightedRecall")
eval_f1 = _make_evaluator("f1")

def _evaluate_and_print(evaluator, predictions, caption, trailer=""):
    """Score ``predictions`` with ``evaluator`` and print the result.

    The printed line is ``caption`` + score + ``trailer``; the score is
    returned so the caller can keep it.
    """
    score = evaluator.evaluate(predictions)
    print(caption + str(score) + trailer)
    return score


# Accuracy, followed by the complementary test error (1 - accuracy).
accuracy_log_reg = _evaluate_and_print(
    eval_accuracy, results_log_reg, "Logistic Regression Accuracy: ")
print("Logistic Regression Test Error: " + str(1.0 - accuracy_log_reg) + "\n")
accuracy_des_tree = _evaluate_and_print(
    eval_accuracy, results_des_tree, "Decision Trees Accuracy: ")
print("Decision Trees Test Error: " + str(1.0 - accuracy_des_tree) + "\n")
accuracy_rand_for = _evaluate_and_print(
    eval_accuracy, results_rand_for, "Random Forest Accuracy: ")
print("Random Forest Test Error: " + str(1.0 - accuracy_rand_for) + "\n")
# Naive Bayes is currently disabled:
#accuracy_nav_bay = _evaluate_and_print(eval_accuracy, results_nav_bay, "Naive Bayes Accuracy: ")
#print("Naive Bayes Test Error: " + str(1.0 - accuracy_nav_bay))

# Weighted precision.
precision_log_reg = _evaluate_and_print(
    eval_precision, results_log_reg, "Logistic Regression Precision: ")
precision_des_tree = _evaluate_and_print(
    eval_precision, results_des_tree, "Decision Trees Precision: ")
precision_rand_for = _evaluate_and_print(
    eval_precision, results_rand_for, "Random Forest Precision: ", "\n")
#precision_nav_bay = _evaluate_and_print(eval_precision, results_nav_bay, "Naive Bayes Precision: ")

# Weighted recall.
recall_log_reg = _evaluate_and_print(
    eval_recall, results_log_reg, "Logistic Regression Recall: ")
recall_des_tree = _evaluate_and_print(
    eval_recall, results_des_tree, "Decision Trees Recall: ")
recall_rand_for = _evaluate_and_print(
    eval_recall, results_rand_for, "Random Forest Recall: ", "\n")
#recall_nav_bay = _evaluate_and_print(eval_recall, results_nav_bay, "Naive Bayes Recall: ")

# F1 score.
f1score_log_reg = _evaluate_and_print(
    eval_f1, results_log_reg, "Logistic Regression F1Score: ")
f1score_des_tree = _evaluate_and_print(
    eval_f1, results_des_tree, "Decision Trees F1Score: ")
f1score_rand_for = _evaluate_and_print(
    eval_f1, results_rand_for, "Random Forest F1Score: ", "\n")
#f1score_nav_bay = _evaluate_and_print(eval_f1, results_nav_bay, "Naive Bayes F1Score: ")

def _tally_predictions(predictions, total_caption):
    """Count scored rows and the rows whose prediction equals the label.

    Prints ``total_caption`` + row count, then "Total Correct: " + the number
    of matching rows. Returns ``(label_and_prediction_rows, matching_rows,
    row_count, matching_count)``.
    """
    pairs = predictions.select('affordability', 'prediction')
    matches = pairs.filter(pairs['affordability'] == pairs['prediction'])
    row_count = pairs.count()
    print(total_caption + str(row_count))
    match_count = matches.count()
    print("Total Correct: " + str(match_count) + "\n")
    return pairs, matches, row_count, match_count


(total_results_log_reg, correct_results_log_reg,
 count_records_log_reg, count_correct_records_log_reg) = _tally_predictions(
    results_log_reg, "Number of Records for Logistic Regression: ")

(total_results_des_tree, correct_results_des_tree,
 count_records_des_tree, count_correct_records_des_tree) = _tally_predictions(
    results_des_tree, "Number of Records for Decision Trees: ")

(total_results_rand_for, correct_results_rand_for,
 count_records_rand_for, count_correct_records_rand_for) = _tally_predictions(
    results_rand_for, "Number of Records for Random Forest: ")

# Naive Bayes is currently disabled:
#total_results_nav_bay = results_nav_bay.select('affordability','prediction')
#correct_results_nav_bay = total_results_nav_bay.filter(total_results_nav_bay['affordability'] == total_results_nav_bay['prediction'])
#count_records_nav_bay = total_results_nav_bay.count()
#print("Number of Records for Naive Bayes: " + str(count_records_nav_bay))
#count_correct_records_nav_bay = correct_results_nav_bay.count()
#print("Total Correct: " + str(count_correct_records_nav_bay))
