# PySpark Data Analysis Tutorial

In [0]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

In [0]:
# Reuse the active Spark session if there is one; otherwise create it.
spark = SparkSession.builder.appName('Spark DataFrames').getOrCreate()

# Shared reader settings: first line is the header, column types are inferred.
csv_options = dict(header=True, inferSchema=True)

crime = spark.read.csv(path='dbfs:/FileStore/crime.csv', **csv_options)
schools = spark.read.csv(path='dbfs:/FileStore/schools.csv', **csv_options)
college = spark.read.csv(path='dbfs:/FileStore/college.csv', **csv_options)
rankings = spark.read.csv(path='dbfs:/FileStore/rankings.csv', **csv_options)

In [0]:
# Peek at the first crime row to see the column layout.
crime.show(1)

+--------------+-------------------------------------+--------------------------------------+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|
+--------------+-------------------------------------+--------------------------------------+
|Albuquerque NM|                                965.8|                                6073.2|
+--------------+-------------------------------------+--------------------------------------+
only showing top 1 row



In [0]:
# Preview the first college record.
display(college.head(1))


Institution_Name,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"


In [0]:
# Preview the first schools record; its key column is named School.
display(schools.head(1))

School,Historically_Black,Men_Only,Women_Only
Alabama A & M University,1,0,0


In [0]:
# Rename the college key column so it carries the same name (School) as the
# key column in the schools table.
collegedf = college.withColumnRenamed('Institution_Name','School')

In [0]:
# Confirm the first column is now called School.
display(collegedf.head(1))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"


## Join on 1 Column

In [0]:
# Left join: every college row is kept, matched to schools by name where possible.
# Because the join is written as an expression (not as a shared column name),
# the result carries the School column from BOTH tables.
school_match = schools.School == collegedf.School
joined = collegedf.join(schools, on=school_match, how="left")

In [0]:
def show_number_columns(tbl1, tbl2, joinTbl):
    """Print the column counts of two input tables and of their join.

    Each argument only needs a ``columns`` attribute that supports ``len()``.
    Nothing is returned; the report is written to stdout.
    """
    labels = ('Table 1', 'Table 2', 'Joined Table')
    # Count everything up front so nothing is printed if any lookup fails.
    counts = [len(tbl.columns) for tbl in (tbl1, tbl2, joinTbl)]
    report = ''.join(f'{label} = {count}\n' for label, count in zip(labels, counts))
    print(report)

In [0]:
# Compare column counts before and after the join.
show_number_columns(collegedf, schools, joined)

Table 1 = 57
Table 2 = 4
Joined Table = 61



In [0]:
# Inspect the first joined row.
display(joined.head(1))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors,School.1,Historically_Black,Men_Only,Women_Only
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience",Alabama A & M University,1,0,0


## Join on Multiple Columns
**Problem:** `crime` stores city and state in a single column (`City State`), but we want to join it with `collegedf`, which stores them in separate `City` and `State` columns.
* **Solution 1:** Concatenate the city and state columns in `collegedf` and join on the combined value
* **Solution 2:** Split the combined column in `crime` and join on multiple columns

This can be done either way, but we'll focus on Solution 2 since it will be more useful in the future.

In [0]:
# Reminder of the crime layout: city and state share the "City State" column.
crime.show(1)

+--------------+-------------------------------------+--------------------------------------+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|
+--------------+-------------------------------------+--------------------------------------+
|Albuquerque NM|                                965.8|                                6073.2|
+--------------+-------------------------------------+--------------------------------------+
only showing top 1 row



In [0]:
# Split "City State" into separate City and State columns.
# Splitting on every space breaks multi-word city names
# ("Los Angeles CA" -> City "Los", State "Angeles"), so split only on the LAST
# space: the lookahead requires that no further whitespace follows it.
city_state_parts = f.split(f.col("City State"), r" (?=\S+$)")

(
    crime
    .withColumn("City", city_state_parts.getItem(0))
    .withColumn("State", city_state_parts.getItem(1))
    .show(2)
)


+--------------+-------------------------------------+--------------------------------------+-----------+-----+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|       City|State|
+--------------+-------------------------------------+--------------------------------------+-----------+-----+
|Albuquerque NM|                                965.8|                                6073.2|Albuquerque|   NM|
|    Anaheim CA|                                363.7|                                2872.3|    Anaheim|   CA|
+--------------+-------------------------------------+--------------------------------------+-----------+-----+
only showing top 2 rows



In [0]:
# Derive the join keys from "City State":
#   cities - everything before the trailing space + two-letter state code
#   states - the last two characters of the column
city_state = f.col('City State')

crimedf = (
    crime
    .withColumn("cities", f.regexp_extract(city_state, '(.*) [a-zA-Z]{2}$', 1))
    .withColumn('states', city_state.substr(-2, 2))
)

In [0]:
# Spot-check five distinct city names to confirm they were split out correctly.
# take(5) fetches only five rows; collect()[0:5] would first pull every
# distinct city back to the driver and then slice the list.
crimedf.select('cities').distinct().take(5)

Out[50]: [Row(cities='Tyler'),
 Row(cities='Worcester'),
 Row(cities='Charleston'),
 Row(cities='Corona'),
 Row(cities='Springfield')]

In [0]:
# The combined column is redundant now that cities/states exist, so drop it.
crimedf = crimedf.drop('City State')
crimedf.show(2)

+-------------------------------------+--------------------------------------+-----------+------+
|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|     cities|states|
+-------------------------------------+--------------------------------------+-----------+------+
|                                965.8|                                6073.2|Albuquerque|    NM|
|                                363.7|                                2872.3|    Anaheim|    CA|
+-------------------------------------+--------------------------------------+-----------+------+
only showing top 2 rows



In [0]:
# Look at the college side of the join: it has separate City and State columns.
display(collegedf.head(2))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457.0,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"
University of Alabama at Birmingham,Birmingham,AL,Birmingham AL,35294-0110,www.uab.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",33.505697,-86.799345,,0.5814,640,660,,28,12047,0.5786,0.2626,0.0309,0.0598,0.0028,0.0004,0.0387,0.0179,0.0083,0.2578,22684,,8040,18368,10517,0.8918,0.5309,,0.8161,,0.4286,,0.574557316,0.346713892,0.74059293,0.548339118,0.427613192,0.0222668,0.318956871,0.658776329,0.63909074,0.105086641,0.003167505,0.341223671,4-year,12/1/1965,"Medieval and Renaissance Studies, Computer and Information Science, Nursing"


### Join College and Crime DataFrames

In [0]:
# Match colleges to crime rows on city AND state together.
same_city = collegedf.City == crimedf.cities
same_state = collegedf.State == crimedf.states

# No join type is passed, so Spark's default (inner) join is used.
multi_join = collegedf.join(crimedf, on=same_city & same_state)
display(multi_join.head(2))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors,"Violent Crime Rate Per 100,000 People","Property Crime Rate Per 100,000 People",cities,states
University of Alabama at Birmingham,Birmingham,AL,Birmingham AL,35294-0110,www.uab.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",33.505697,-86.799345,,0.5814,640.0,660.0,,28.0,12047,0.5786,0.2626,0.0309,0.0598,0.0028,0.0004,0.0387,0.0179,0.0083,0.2578,22684,,8040,18368,10517,0.8918,0.5309,,0.8161,,0.4286,,0.574557316,0.346713892,0.74059293,0.548339118,0.427613192,0.0222668,0.318956871,0.658776329,0.63909074,0.105086641,0.003167505,0.341223671,4-year,12/1/1965,"Medieval and Renaissance Studies, Computer and Information Science, Nursing",1517.8,6934.1,Birmingham,AL
Amridge University,Montgomery,AL,Montgomery AL,36117-3553,www.amridgeuniversity.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",32.362609,-86.17401,Churches of Christ,,,,,,293,0.157,0.2355,0.0068,0.0,0.0,0.0034,0.0,0.0,0.5973,0.5392,13380,,7180,7180,3857,1.0,0.25,,0.0909,,0.0,,0.333333333,0.144208038,0.679245283,0.375,0.773648649,0.05,0.4625,0.4875,0.648648649,0.236486487,0.040540541,0.5125,4-year,3/26/1987,"Journalism, English, Business Administration/Management",395.7,5768.9,Montgomery,AL


In [0]:
# Compare column counts before and after the multi-column join.
show_number_columns(collegedf, crimedf, multi_join)

Table 1 = 57
Table 2 = 4
Joined Table = 61



In [0]:
# Keep only a handful of columns to shrink the data set, and give the two
# crime-rate columns shorter names.
# NOTE: this rebinds multi_join, so re-running this cell on its own fails -
# the long crime-rate column names no longer exist after the first run.
violent_crime = f.col('Violent Crime Rate Per 100,000 People').alias('violent_crime_100k')
property_crime = f.col('Property Crime Rate Per 100,000 People').alias('property_crime_100k')

multi_join = multi_join.select(
    'State', 'City', 'School', 'In-state_Tuition', 'Avg_Fac_Sal',
    violent_crime, property_crime,
)

In [0]:
# Check the trimmed, renamed columns.
multi_join.show(1)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 1 row



## Write to Parquet

In [0]:
# Output directory used by the write/read cells that follow.
outPath = 'dbfs:/FileStore/college_crime'



In [0]:
# Persist the joined data as snappy-compressed parquet, replacing whatever
# was previously stored at outPath.
parquet_writer = multi_join.write.mode('overwrite').option('compression', 'snappy')
parquet_writer.parquet(outPath)

In [0]:
# List the files the parquet write left in the output directory.
display(
    dbutils.fs.ls(outPath)
)

path,name,size,modificationTime
dbfs:/FileStore/college_crime/_SUCCESS,_SUCCESS,0,1665515585000
dbfs:/FileStore/college_crime/_committed_2752579953311011263,_committed_2752579953311011263,220,1665515584000
dbfs:/FileStore/college_crime/_committed_893389491160514121,_committed_893389491160514121,111,1665515545000
dbfs:/FileStore/college_crime/_started_2752579953311011263,_started_2752579953311011263,0,1665515581000
dbfs:/FileStore/college_crime/_started_893389491160514121,_started_893389491160514121,0,1665515544000
dbfs:/FileStore/college_crime/part-00000-tid-2752579953311011263-06bfe378-5e59-4427-b40d-056081f4e238-18-1-c000.snappy.parquet,part-00000-tid-2752579953311011263-06bfe378-5e59-4427-b40d-056081f4e238-18-1-c000.snappy.parquet,45624,1665515583000


#### Read in data from parquet

In [0]:
# A parquet dataset is essentially a DataFrame stored on disk.
# Read it back through a glob instead of a hard-coded part-file name: that
# name embeds a per-write transaction id (see the directory listing), so it
# changes every time the write cell is re-run. The glob also skips the
# non-parquet outputs that later cells write beneath outPath.
readTbl = (
  spark
  .read.parquet(f'{outPath}/*.parquet')
  )

In [0]:
# Verify the data read back from parquet.
display(readTbl.head(2))

State,City,School,In-state_Tuition,Avg_Fac_Sal,violent_crime_100k,property_crime_100k
AL,Birmingham,University of Alabama at Birmingham,8040,10517,1517.8,6934.1
AL,Montgomery,Amridge University,7180,3857,395.7,5768.9


## Save to Table

In [0]:
# Save the joined data as a table named college_crime_data, replacing any
# existing table of that name, so it can be queried with SQL.
multi_join.write.mode('overwrite').saveAsTable('college_crime_data')

#### Read From Table

In [0]:
%sql
-- First three distinct (state, city, school) combinations, ordered by state.
-- Only STATE is sorted on, so the order of rows within a state is not defined.
SELECT DISTINCT STATE, CITY, SCHOOL
FROM college_crime_data
ORDER BY STATE ASC
LIMIT 3

STATE,CITY,SCHOOL
AK,Anchorage,University of Alaska Anchorage
AK,Anchorage,Alaska Pacific University
AL,Mobile,University of Mobile


## Save to csv

In [0]:
# Write the results as tab-separated text with a header row.
# coalesce(1) merges the data into a single partition before writing.
results_path = outPath + "/results.csv"
multi_join.coalesce(1).write.csv(path=results_path, mode='overwrite', sep='\t', header=True)

In [0]:
# Read the tab-separated results back to confirm the round trip.
# The format is already chosen by calling .csv(), so only the separator and
# header flag need to be passed as reader options.
(
    spark
    .read
    .options(sep='\t', header=True)
    .csv(f'{outPath}/results.csv')
).show(5)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
|   AL|Montgomery|  Amridge University|            7180|       3857|             395.7|             5768.9|
|   AL|Huntsville|University of Ala...|            9842|       9463|             923.3|             5041.6|
|   AL|Montgomery|Alabama State Uni...|            9220|       7952|             395.7|             5768.9|
|   AL|Tuscaloosa|The University of...|           10470|       9802|             519.7|             4728.6|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 5 rows



In [0]:
# Write the results as CSV with a header row, using the csv() writer method.
# repartition(1) moves all rows into a single partition before writing.
csvFile = f'{outPath}/college_crime_results.csv'

single_partition = multi_join.repartition(1)
single_partition.write.csv(path=csvFile, mode='overwrite', header=True)

In [0]:
# Read the CSV back, taking column names from its first line.
csv_df = spark.read.option('header', True).csv(csvFile)
csv_df.show(4)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
|   AL|Montgomery|  Amridge University|            7180|       3857|             395.7|             5768.9|
|   AL|Huntsville|University of Ala...|            9842|       9463|             923.3|             5041.6|
|   AL|Montgomery|Alabama State Uni...|            9220|       7952|             395.7|             5768.9|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 4 rows



In [0]:
# Same rows via display(), which shows the full School names untruncated.
display(csv_df.head(4))

State,City,School,In-state_Tuition,Avg_Fac_Sal,violent_crime_100k,property_crime_100k
AL,Birmingham,University of Alabama at Birmingham,8040,10517,1517.8,6934.1
AL,Montgomery,Amridge University,7180,3857,395.7,5768.9
AL,Huntsville,University of Alabama in Huntsville,9842,9463,923.3,5041.6
AL,Montgomery,Alabama State University,9220,7952,395.7,5768.9
