# SPARK DF LAB TUTORIALS
In this notebook I use my own data set to follow the INFOSYS lab tutorials

# Initial Setup


In [1]:
# Following code is taken from Infosys Labs.
# This section must be included at the beginning of each new notebook. Remember to change the app name.
# If you're using VirtualBox, change the path below to '/home/user/spark-2.1.1-bin-hadoop2.7'.
# findspark.init() runs before pyspark is imported; presumably it makes the Spark
# install at this path importable — confirm against the findspark documentation.
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# NOTE(review): the app name is still 'missing'; given the reminder above it looks like it was never changed.
spark = SparkSession.builder.appName('missing').getOrCreate()

# Data was taken from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008
# header=True takes column names from the first CSV row; inferSchema=True asks Spark to infer column types.
df = spark.read.csv('dataset/diabetic_data.csv',header=True,inferSchema=True)

# Spark DF Basics
 

In [None]:
# Check data and structure of data.
# show() prints the leading rows of the DataFrame as a text table.
df.show()

# head(1) returns the first row (as a one-element list) — it is not a random row.
print(df.head(1))

# Print schema allows us to visualise the data structure at a high level.
df.printSchema()


In [None]:
# Use the describe method to get some general statistics on our data too.
# Describing every column at once is left disabled:
# df.describe().show()

print('General patient stats')
patient_stats = df.select('race', 'gender', 'weight','time_in_hospital','num_lab_procedures','num_procedures','num_medications').describe()
patient_stats.show()

print('Stats of average patient visits for previous year')
# This will be a bit skewed, can be avoided by only selecting unique accounts.
# show() only prints and returns None, so keep the statistics DataFrame in
# dstats and display it in a separate step.
dstats = df.select('number_outpatient', 'number_emergency', 'number_inpatient').describe()
dstats.show()


In [None]:
# Show Frequency and order

In [None]:
# Number of records in each age group.
m = 'age'
age_counts = df.groupBy(m).count()

# Ordered by the age group itself.
age_counts.orderBy('age').show()

# Ordered by frequency, largest group first.
age_counts.orderBy('count', ascending=False).show()

In [None]:
# Ten patients with the largest number of records in the data set.
m = 'patient_nbr'
records_per_patient = df.groupBy(m).count()
# Highest record count first.
records_per_patient.orderBy('count', ascending=False).show(10)

In [None]:
# Patients with more than 20 record entries (the filter is strict: count > 20).
# Relies on m being set by an earlier cell.
frequent_patients = df.groupBy(m).count().filter("count>20").orderBy('count', ascending=False)
frequent_patients.show()
# How many patients pass the filter.
print(frequent_patients.count())

In [None]:
# The patient with the most record entries.
# NOTE(review): the ID and the demographics line are hard-coded, presumably
# copied from the output of an earlier cell — confirm they still match the data.
patient_id = 88785891
print('Patient ID:', patient_id)
print('Caucasian, Female')

# Restrict to this patient's rows, then show a handful of clinical columns.
patient_rows = df.filter("patient_nbr={}".format(patient_id))
patient_rows.select('weight','time_in_hospital','num_lab_procedures','num_procedures','num_medications').show(10)

# interchangeable
# df.select('weight','time_in_hospital','num_lab_procedures','num_procedures','num_medications').filter("patient_nbr=88785891").show()


In [None]:
# Same patient, now including 'readmitted' and sorted by weight and then
# time_in_hospital, both descending.
patient_id = 88785891
print('Patient ID:', patient_id)
print('Caucasian, Female')

patient_view = df.select('weight','time_in_hospital','num_lab_procedures','num_procedures','num_medications','readmitted').filter("patient_nbr={}".format(patient_id))
patient_view.orderBy(['weight','time_in_hospital'], ascending=[False, False]).show()

In [None]:
# Same patient: how many of their records fall on each time_in_hospital value.
patient_id = 88785891
print('Patient ID:', patient_id)
print('Caucasian, Female')

patient_rows = df.filter("patient_nbr = {}".format(patient_id))
patient_rows.groupBy('time_in_hospital').count().orderBy('time_in_hospital').show()

In [None]:
# Keys that occur exactly once in the data set.
# Relies on m from an earlier cell (presumably still 'patient_nbr' — verify run order).
records_per_key = df.groupBy(m).count()
single_entries = records_per_key.filter("count=1")
# Print how many there are, then display the leading rows.
print(single_entries.count())
single_entries.show()

In [None]:
from pyspark.sql.functions import format_number, col
# Record count per age group, sorted by age group.
x = 'age'
# ds = df.select('admission_type_id','time_in_hospital','number_emergency').show()
age_frequency = df.groupBy(x).count()
age_frequency.orderBy(x).show()

### Cleaning up


In [None]:
from pyspark.sql.functions import format_number, col
# Group by age and present the mean of the numeric columns for each group.
# (The variable name still says "job"; presumably a leftover from the tutorial.)
grouped_by_age = df.groupBy('age')
group_job_df = grouped_by_age.mean()
group_job_df.show()


In [None]:
from pyspark.sql.functions import format_number, col
# Group by age, take the mean of the numeric columns, then keep a readable subset.
group_job_df = df.groupBy('age').mean()

# Every derived column gets an explicit alias; without one the formatted
# columns would be headed by the raw format_number(...) expression.
group_job_df = group_job_df.select(
    'age',
    col('avg(time_in_hospital)').alias('Days in Hospital'),
    format_number('avg(num_lab_procedures)', 2).alias('Lab Procedures'),
    format_number('avg(num_procedures)', 2).alias('Procedures'),
)
group_job_df.show()


# Spark DF - Data Cleaning
Second part, let's go!

In [None]:
# Let's see how many rows of data we originally have.
print("Total data points:", df.count())

# na.drop() removes every row that contains a null value. Reuse the resulting
# DataFrame for the count instead of building it a second time, and label the
# second figure so it cannot be confused with the original total.
dropped_df = df.na.drop()
print("Data points after dropping nulls:", dropped_df.count())


In [None]:
## how to get a value

In [None]:
from pyspark.sql.functions import mean, count
print('How to get the actual value')
# show() only prints and returns None, so its result is not stored.
count_df = df.select(count('time_in_hospital'))
count_df.show()

# Unwrap step by step: collect() gives a list of Row objects, [0] picks the
# first Row, and [0] on that Row picks the scalar. Each stage gets its own
# name so hdays_count only ever holds the final number.
count_rows = count_df.collect()
print(count_rows)
count_row = count_rows[0]
print(count_row)
hdays_count = count_row[0]
print(hdays_count)


# Same unwrapping for the mean.
mean_rows = df.select(mean('time_in_hospital')).collect()
print(mean_rows)
mean_row = mean_rows[0]
print(mean_row)
hdays_mean = mean_row[0]
print(hdays_mean)

In [None]:
print("Finding number of Caucasians")

print('demographic')
r='race'
# Records per race value. show() returns None, so nothing is assigned here.
df.groupBy(r).count().show()

# count() over the race column for the whole data set.
df.select(count(r)).show()

# Restrict to Caucasian rows and unwrap the count:
# list of Row -> first Row -> scalar. raceCount only ever holds the number.
caucasian_rows = df.filter("race = 'Caucasian'").select(count(r)).collect()
print(caucasian_rows)
caucasian_row = caucasian_rows[0]
print(caucasian_row)
raceCount = caucasian_row[0]
print(raceCount)

In [None]:
# Count of records whose race is 'Caucasian', pulled out as a plain value.
caucasian_only = df.filter("race = 'Caucasian'")
white = caucasian_only.select(count(r)).collect()[0][0]
print('Caucasian count:',white)
# Ratio against hdays_count, which an earlier cell sets to the time_in_hospital count.
caucasian_share = white/hdays_count
print(caucasian_share)

# Misc code I will use later

In [None]:
# print the ranges of each attribute
def print_range(n, frame=None):
    """Print the number of distinct values in column ``n`` and its frequency table.

    Args:
        n: Name of the column to summarise.
        frame: DataFrame to inspect. Defaults to the notebook-level ``df`` so
            existing ``print_range(column)`` calls behave as before.
    """
    if frame is None:
        # Fall back to the notebook's global DataFrame.
        frame = df
    distinct_values = frame.select(n).distinct().count()
    print(n, "count:", distinct_values, "unique")
    # One row per distinct value with the number of records carrying it.
    frame.groupBy(n).count().show()

# Distinct-value count and frequency table for every column.
for i in df.columns:
    print_range(i)

# show() only prints and returns None, so keep the grouped DataFrame in
# patients and display it in a separate step.
patients = df.groupBy('patient_nbr').count()
patients.show()

# Record counts for each combination of change / diabetesMed / readmitted.
df.groupBy(['change','diabetesMed','readmitted']).count().orderBy('change').show()

In [5]:
# Records per patient, largest count first.
visit_counts = df.groupBy('patient_nbr').count()
visit_counts.orderBy('count', ascending=False).show()

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:44192)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:44192)