# PySpark Data Analysis Tutorial

In [0]:
from pyspark.sql import SparkSession

In [0]:
# Reuse the active session if there is one; otherwise create one named 'Spark DataFrames'.
spark = SparkSession.builder.appName('Spark DataFrames').getOrCreate()

# Every input file has a header row; column types are inferred from the data.
csv_options = dict(header=True, inferSchema=True)

crime = spark.read.csv(path='dbfs:/FileStore/crime.csv', **csv_options)
schools = spark.read.csv(path='dbfs:/FileStore/schools.csv', **csv_options)
college = spark.read.csv(path='dbfs:/FileStore/college.csv', **csv_options)
rankings = spark.read.csv(path='dbfs:/FileStore/rankings.csv', **csv_options)

In [0]:
# Preview the first crime row to check the column layout.
crime.show(1)

+--------------+-------------------------------------+--------------------------------------+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|
+--------------+-------------------------------------+--------------------------------------+
|Albuquerque NM|                                965.8|                                6073.2|
+--------------+-------------------------------------+--------------------------------------+
only showing top 1 row



In [0]:
# Peek at the first college record to see which columns are available.
display(college.head(1))


Institution_Name,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"


In [0]:
# Peek at the first schools record; its name column is `School`.
display(schools.head(1))

School,Historically_Black,Men_Only,Women_Only
Alabama A & M University,1,0,0


In [0]:
# Rename the college name column to `School` so it matches the key column name in `schools`.
collegedf = college.withColumnRenamed('Institution_Name','School')

In [0]:
# Confirm the rename: the first column should now be `School`.
display(collegedf.head(1))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"


## Join on 1 Column

In [0]:
# Left join: keep every college row and attach the school flags where the names match.
# Joining on an expression (rather than a column name) keeps both `School` columns in the result.
same_school = collegedf.School == schools.School
joined = collegedf.join(schools, on=same_school, how="left")

In [0]:
def show_number_columns(tbl1, tbl2, joinTbl):
    """Print the number of columns in two input tables and in their join.

    Args:
        tbl1: First input table; only its ``columns`` attribute is read.
        tbl2: Second input table; only its ``columns`` attribute is read.
        joinTbl: Result of joining the two; only its ``columns`` attribute is read.

    Returns:
        None. The three counts are printed, one per line, followed by a blank line.
    """
    labelled_tables = (
        ('Table 1', tbl1),
        ('Table 2', tbl2),
        ('Joined Table', joinTbl),
    )
    # Each entry ends with a newline, so print() leaves a trailing blank line.
    report = ''.join(
        f'{label} = {len(table.columns)}\n' for label, table in labelled_tables
    )
    print(report)

In [0]:
# Compare column counts before and after the join on `School`.
show_number_columns(collegedf, schools, joined)

Table 1 = 57
Table 2 = 4
Joined Table = 61



In [0]:
# Inspect one joined row; the columns from `schools` follow the college columns.
display(joined.head(1))

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors,School.1,Historically_Black,Men_Only,Women_Only
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470,470,457,19,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667,,9366,17496,7028,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience",Alabama A & M University,1,0,0


## Join on Multiple Columns
**Problem:**  Here, crime has city and state in a single column, but I'd like to merge it with college, which has them separated
* **Solution 1:**  Concatenate city and state in collegedf and join on the combined column
* **Solution 2:**  Split the columns in crime and join on multiple columns

This can be done either way, but we'll focus on Solution 2 since it will be more useful in the future.

In [0]:
# Look at the crime data again: city and state share the single `City State` column.
crime.show(1)

+--------------+-------------------------------------+--------------------------------------+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|
+--------------+-------------------------------------+--------------------------------------+
|Albuquerque NM|                                965.8|                                6073.2|
+--------------+-------------------------------------+--------------------------------------+
only showing top 1 row



In [0]:
# split column
import pyspark.sql.functions as f

# Naive approach: split `City State` on single spaces and take the first two pieces.
# This is only correct for single-word city names; a value such as 'Los Angeles CA'
# would yield City='Los' and State='Angeles'.
city_state_parts = f.split(f.col("City State"), " ")

(
    crime
    .withColumn("City", city_state_parts.getItem(0))
    .withColumn("State", city_state_parts.getItem(1))
    .show(2)
)


+--------------+-------------------------------------+--------------------------------------+-----------+-----+
|    City State|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|       City|State|
+--------------+-------------------------------------+--------------------------------------+-----------+-----+
|Albuquerque NM|                                965.8|                                6073.2|Albuquerque|   NM|
|    Anaheim CA|                                363.7|                                2872.3|    Anaheim|   CA|
+--------------+-------------------------------------+--------------------------------------+-----------+-----+
only showing top 2 rows



In [0]:
city_state = f.col('City State')

# City: everything before the final space and the trailing two-letter state code.
city_name = f.regexp_extract(city_state, '(.*) [a-zA-Z]{2}$', 1)
# State: the last two characters of the combined value.
state_code = city_state.substr(-2, 2)

crimedf = crime.withColumn("cities", city_name).withColumn('states', state_code)

In [0]:
# Show 5 unique cities to ensure they are split out correctly.
# take(5) fetches only five rows rather than collecting every distinct city
# to the driver and slicing the resulting list.
crimedf.select('cities').distinct().take(5)

Out[50]: [Row(cities='Tyler'),
 Row(cities='Worcester'),
 Row(cities='Charleston'),
 Row(cities='Corona'),
 Row(cities='Springfield')]

In [0]:
# The combined column is redundant now that `cities` and `states` exist, so drop it.
combined_column = f.col('City State')
crimedf = crimedf.drop(combined_column)

# Check the remaining columns.
crimedf.show(2)

+-------------------------------------+--------------------------------------+-----------+------+
|Violent Crime Rate Per 100,000 People|Property Crime Rate Per 100,000 People|     cities|states|
+-------------------------------------+--------------------------------------+-----------+------+
|                                965.8|                                6073.2|Albuquerque|    NM|
|                                363.7|                                2872.3|    Anaheim|    CA|
+-------------------------------------+--------------------------------------+-----------+------+
only showing top 2 rows



In [0]:
# Review the college rows that will be matched against crimedf on City and State.
display(collegedf)

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors
Alabama A & M University,Normal,AL,Normal AL,35762,www.aamu.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.783368,-86.568502,,0.8738,470.0,470.0,457.0,19.0,4616,0.0256,0.9129,0.0076,0.0019,0.0024,0.0017,0.0401,0.0065,0.0013,0.0877,22667.0,,9366,17496,7028.0,0.7354,0.2749,,0.5769,,0.3091,,0.336676218,0.111891892,0.563870968,0.286221591,0.602008788,0.016422083,0.34940601,0.634171908,0.564030132,0.009102323,0.003138732,0.365828092,4-year,12/12/1965,"Computer Engineering, Aviation, Neuroscience"
University of Alabama at Birmingham,Birmingham,AL,Birmingham AL,35294-0110,www.uab.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",33.505697,-86.799345,,0.5814,640.0,660.0,,28.0,12047,0.5786,0.2626,0.0309,0.0598,0.0028,0.0004,0.0387,0.0179,0.0083,0.2578,22684.0,,8040,18368,10517.0,0.8918,0.5309,,0.8161,,0.4286,,0.574557316,0.346713892,0.74059293,0.548339118,0.427613192,0.0222668,0.318956871,0.658776329,0.63909074,0.105086641,0.003167505,0.341223671,4-year,12/1/1965,"Medieval and Renaissance Studies, Computer and Information Science, Nursing"
Amridge University,Montgomery,AL,Montgomery AL,36117-3553,www.amridgeuniversity.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",32.362609,-86.17401,Churches of Christ,,,,,,293,0.157,0.2355,0.0068,0.0,0.0,0.0034,0.0,0.0,0.5973,0.5392,13380.0,,7180,7180,3857.0,1.0,0.25,,0.0909,,0.0,,0.333333333,0.144208038,0.679245283,0.375,0.773648649,0.05,0.4625,0.4875,0.648648649,0.236486487,0.040540541,0.5125,4-year,3/26/1987,"Journalism, English, Business Administration/Management"
University of Alabama in Huntsville,Huntsville,AL,Huntsville AL,35899,www.uah.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.724557,-86.640449,,0.7628,660.0,680.0,,31.0,6346,0.7148,0.1131,0.0411,0.0414,0.012,0.0,0.0181,0.0303,0.0292,0.1746,22059.0,,9842,20612,9463.0,0.6374,0.4867,,0.8255,,0.5455,,0.640888889,0.362745098,0.816953317,0.513170732,0.374633738,0.019823789,0.29030837,0.689867841,0.476349937,0.100460444,PrivacySuppressed,0.310132159,4-year,12/1/1965,"Islamic Studies, Applied Mathematics, Architecture"
Alabama State University,Montgomery,AL,Montgomery AL,36104-0271,www.alasu.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",32.364317,-86.295677,,0.459,460.0,460.0,,19.0,4704,0.0138,0.9337,0.0111,0.0028,0.0013,0.0004,0.0111,0.0159,0.01,0.0727,19242.0,,9220,16156,7952.0,0.6368,0.2165,,0.6262,,0.3846,,0.244680851,0.10538201,0.491967872,0.290640394,0.614616613,0.01882461,0.324609734,0.656565657,0.61341853,0.00798722,PrivacySuppressed,0.343434343,4-year,12/1/1965,"Biology, Psychology, Neuroscience"
The University of Alabama,Tuscaloosa,AL,Tuscaloosa AL,35487-0166,www.ua.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Small (population less than 100,000)",33.211875,-87.545978,,0.5259,610.0,620.0,600.0,31.0,31663,0.7841,0.1037,0.0437,0.0118,0.0036,0.0009,0.0297,0.0192,0.0033,0.0819,28422.0,,10470,26950,9802.0,0.7378,0.6871,,0.8627,,0.4,,0.657238365,0.415342298,0.804500703,0.558691207,0.261546724,0.008160984,0.217551705,0.774287311,0.615252417,0.053705693,0.003974221,0.225712689,4-year,12/1/1965,"Chemical Engineering, Applied Mathematics, Aerospace Engineering"
Central Alabama Community College,Alexander City,AL,Alexander City AL,35010,www.cacc.edu,Yes,Predominantly associate's-degree granting,Associate degree,Public,Town: Distant (in urban cluster more than 10 miles and up to 35 miles from an urbanized area),32.92478,-85.945266,,,,,,,1492,0.6877,0.2802,0.0127,0.002,0.004,0.0007,0.0067,0.002,0.004,0.3733,13868.0,,4380,7890,5960.0,0.4623,,0.2153,,0.6173,,0.3469,0.27027027,0.187068966,0.5,0.413105413,0.607476636,0.049676026,0.5,0.450323974,0.603738318,0.096261682,PrivacySuppressed,0.549676026,2-year,2/14/1969,"Finance, Latin American Studies, Islamic Studies"
Auburn University at Montgomery,Montgomery,AL,Montgomery AL,36117-3596,www.aum.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",32.36736,-86.177544,,0.7659,495.0,495.0,,24.0,4171,0.5126,0.3627,0.0141,0.0247,0.006,0.001,0.0319,0.0412,0.0058,0.2592,19255.0,,9640,20710,7251.0,0.9585,0.2261,,0.6667,,0.4808,,0.58493353,0.297530864,0.711256118,0.447007931,0.489226249,0.023031602,0.358864489,0.61810391,0.69294809,0.072967679,0.00489716,0.38189609,4-year,1/1/1968,"Electrical Engineering, Epidemiology, Philosophy"
Auburn University,Auburn,AL,Auburn AL,36849,www.auburn.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Small (population less than 100,000)",32.599378,-85.488258,,0.8054,620.0,640.0,620.0,30.0,22095,0.8285,0.0673,0.0335,0.0252,0.0052,0.0003,0.0128,0.0214,0.0059,0.0831,29794.0,,10696,28840,9945.0,0.8673,0.7501,,0.9101,,0.75,,0.769965278,0.607278241,0.843283582,0.797552837,0.252037368,0.006561411,0.166085708,0.827352881,0.531504671,0.022261976,0.002385212,0.172647119,4-year,9/8/1987,"Psychology, Theology, English"
Birmingham Southern College,Birmingham,AL,Birmingham AL,35254,www.bsc.edu/,Yes,Predominantly bachelor's-degree granting,Bachelor's degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",33.513774,-86.850552,United Methodist,0.4839,610.0,570.0,,28.0,1289,0.7921,0.1171,0.0217,0.0489,0.007,0.0,0.0109,0.0,0.0023,0.0054,48510.0,,34448,34448,7233.0,0.6226,0.6821,,0.8209,,,,0.767955801,0.654471545,PrivacySuppressed,PrivacySuppressed,0.235294118,PrivacySuppressed,PrivacySuppressed,0.827505828,0.520361991,PrivacySuppressed,PrivacySuppressed,0.172494173,4-year,8/23/1976,"Neuroscience, Architecture, Nutrition"


### Join College and Crime DataFrames

In [0]:
# A college matches a crime row only when both the city and the state agree.
same_city = collegedf.City == crimedf.cities
same_state = collegedf.State == crimedf.states

# No join type is given, so this is an inner join: colleges whose city/state pair
# has no row in crimedf are left out of the result.
multi_join = collegedf.join(crimedf, on=same_city & same_state)
display(multi_join)

School,City,State,City State,Zip,URL,Main_Campus,Predominant_Ugrad_Deg,Highest_Deg,Control,Locale,Lat,Long,Religious_Affiliation,Adm_Rate,SAT_R_75,SAT_M_75,SAT_W_75,ACT_CUM_75,Undergrad_Enrollment,Percent_White,Percent_Black,Percent_Hisp,Percent_Asian,Percent_AIAN,Percent_NHPI,Percent_2OrMore,Percent_NRA,Percent_UNKN,Percent_Part-time,Avg_Cost_Academic_Year,Avg_Cost_Program_Year,In-state_Tuition,Out-of-state_Tuition,Avg_Fac_Sal,Percent_Full-time_Fac,CompletionRate_150_4,CompletionRate_150_L4,RetentionRate_FT4,RetentionRate_FTL4,RetentionRate_PT4,RetentionRate_PTL4,Compl_Repay_1yr_Rate,Noncom_Repay_1yr_Rate,Compl_Repay_7yr_Rate,Noncom_Repay_7yr_Rate,Low_Inc_Aid,Parent_Ed_MS,Parent_Ed_HS,Parent_Ed_PS,Percent_Female,Percent_Male,Percent_Veterans,Percent_First_Gen,Level of institution,TIV_Approval_Date,Top3Majors,"Violent Crime Rate Per 100,000 People","Property Crime Rate Per 100,000 People",cities,states
University of Alabama at Birmingham,Birmingham,AL,Birmingham AL,35294-0110,www.uab.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",33.505697,-86.799345,,0.5814,640.0,660.0,,28.0,12047,0.5786,0.2626,0.0309,0.0598,0.0028,0.0004,0.0387,0.0179,0.0083,0.2578,22684.0,,8040,18368,10517.0,0.8918,0.5309,,0.8161,,0.4286,,0.574557316,0.346713892,0.74059293,0.548339118,0.427613192,0.0222668,0.318956871,0.658776329,0.63909074,0.105086641,0.003167505,0.341223671,4-year,12/1/1965,"Medieval and Renaissance Studies, Computer and Information Science, Nursing",1517.8,6934.1,Birmingham,AL
Amridge University,Montgomery,AL,Montgomery AL,36117-3553,www.amridgeuniversity.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",32.362609,-86.17401,Churches of Christ,,,,,,293,0.157,0.2355,0.0068,0.0,0.0,0.0034,0.0,0.0,0.5973,0.5392,13380.0,,7180,7180,3857.0,1.0,0.25,,0.0909,,0.0,,0.333333333,0.144208038,0.679245283,0.375,0.773648649,0.05,0.4625,0.4875,0.648648649,0.236486487,0.040540541,0.5125,4-year,3/26/1987,"Journalism, English, Business Administration/Management",395.7,5768.9,Montgomery,AL
University of Alabama in Huntsville,Huntsville,AL,Huntsville AL,35899,www.uah.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",34.724557,-86.640449,,0.7628,660.0,680.0,,31.0,6346,0.7148,0.1131,0.0411,0.0414,0.012,0.0,0.0181,0.0303,0.0292,0.1746,22059.0,,9842,20612,9463.0,0.6374,0.4867,,0.8255,,0.5455,,0.640888889,0.362745098,0.816953317,0.513170732,0.374633738,0.019823789,0.29030837,0.689867841,0.476349937,0.100460444,PrivacySuppressed,0.310132159,4-year,12/1/1965,"Islamic Studies, Applied Mathematics, Architecture",923.3,5041.6,Huntsville,AL
Alabama State University,Montgomery,AL,Montgomery AL,36104-0271,www.alasu.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",32.364317,-86.295677,,0.459,460.0,460.0,,19.0,4704,0.0138,0.9337,0.0111,0.0028,0.0013,0.0004,0.0111,0.0159,0.01,0.0727,19242.0,,9220,16156,7952.0,0.6368,0.2165,,0.6262,,0.3846,,0.244680851,0.10538201,0.491967872,0.290640394,0.614616613,0.01882461,0.324609734,0.656565657,0.61341853,0.00798722,PrivacySuppressed,0.343434343,4-year,12/1/1965,"Biology, Psychology, Neuroscience",395.7,5768.9,Montgomery,AL
The University of Alabama,Tuscaloosa,AL,Tuscaloosa AL,35487-0166,www.ua.edu/,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Small (population less than 100,000)",33.211875,-87.545978,,0.5259,610.0,620.0,600.0,31.0,31663,0.7841,0.1037,0.0437,0.0118,0.0036,0.0009,0.0297,0.0192,0.0033,0.0819,28422.0,,10470,26950,9802.0,0.7378,0.6871,,0.8627,,0.4,,0.657238365,0.415342298,0.804500703,0.558691207,0.261546724,0.008160984,0.217551705,0.774287311,0.615252417,0.053705693,0.003974221,0.225712689,4-year,12/1/1965,"Chemical Engineering, Applied Mathematics, Aerospace Engineering",519.7,4728.6,Tuscaloosa,AL
Auburn University at Montgomery,Montgomery,AL,Montgomery AL,36117-3596,www.aum.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Midsize (population of at least 100,000 but less than 250,000)",32.36736,-86.177544,,0.7659,495.0,495.0,,24.0,4171,0.5126,0.3627,0.0141,0.0247,0.006,0.001,0.0319,0.0412,0.0058,0.2592,19255.0,,9640,20710,7251.0,0.9585,0.2261,,0.6667,,0.4808,,0.58493353,0.297530864,0.711256118,0.447007931,0.489226249,0.023031602,0.358864489,0.61810391,0.69294809,0.072967679,0.00489716,0.38189609,4-year,1/1/1968,"Electrical Engineering, Epidemiology, Philosophy",395.7,5768.9,Montgomery,AL
Auburn University,Auburn,AL,Auburn AL,36849,www.auburn.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Public,"City: Small (population less than 100,000)",32.599378,-85.488258,,0.8054,620.0,640.0,620.0,30.0,22095,0.8285,0.0673,0.0335,0.0252,0.0052,0.0003,0.0128,0.0214,0.0059,0.0831,29794.0,,10696,28840,9945.0,0.8673,0.7501,,0.9101,,0.75,,0.769965278,0.607278241,0.843283582,0.797552837,0.252037368,0.006561411,0.166085708,0.827352881,0.531504671,0.022261976,0.002385212,0.172647119,4-year,9/8/1987,"Psychology, Theology, English",240.9,3502.8,Auburn,AL
Birmingham Southern College,Birmingham,AL,Birmingham AL,35254,www.bsc.edu/,Yes,Predominantly bachelor's-degree granting,Bachelor's degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",33.513774,-86.850552,United Methodist,0.4839,610.0,570.0,,28.0,1289,0.7921,0.1171,0.0217,0.0489,0.007,0.0,0.0109,0.0,0.0023,0.0054,48510.0,,34448,34448,7233.0,0.6226,0.6821,,0.8209,,,,0.767955801,0.654471545,PrivacySuppressed,PrivacySuppressed,0.235294118,PrivacySuppressed,PrivacySuppressed,0.827505828,0.520361991,PrivacySuppressed,PrivacySuppressed,0.172494173,4-year,8/23/1976,"Neuroscience, Architecture, Nutrition",1517.8,6934.1,Birmingham,AL
South University-Montgomery,Montgomery,AL,Montgomery AL,36116,"www.southuniversity.edu/montgomery#location=Montgomery,%20AL",No,Predominantly bachelor's-degree granting,Graduate degree,Private for-profit,"City: Midsize (population of at least 100,000 but less than 250,000)",32.342684,-86.216488,,,,,,,394,0.297,0.6447,0.0203,0.0102,0.0025,0.0102,0.0,0.0,0.0152,0.3959,26892.0,,17306,17306,4887.0,0.2125,0.075,,0.25,,0.0,,0.386992275,0.156818496,0.60945946,0.308683645,0.656954488,0.051853111,0.420435226,0.527711663,0.762866865,0.236707784,0.016234227,0.472288337,4-year,5/8/1998,"Biomedical Engineering, Aviation, Photography",395.7,5768.9,Montgomery,AL
Faulkner University,Montgomery,AL,Montgomery AL,36109-3390,www.faulkner.edu,Yes,Predominantly bachelor's-degree granting,Graduate degree,Private nonprofit,"City: Midsize (population of at least 100,000 but less than 250,000)",32.384181,-86.21641,Churches of Christ,0.4538,570.0,550.0,510.0,23.0,2297,0.4101,0.5037,0.0222,0.0044,0.0048,0.0026,0.02,0.0244,0.0078,0.2255,30557.0,,20130,20130,5723.0,0.4647,0.3348,,0.5448,,0.5294,,0.350850077,0.198701299,0.566666667,0.485641026,0.574178935,0.031309904,0.398722045,0.569968051,0.638731597,0.167610419,PrivacySuppressed,0.430031949,4-year,12/2/1968,"Psychology, Civil Engineering, Culinary Arts",395.7,5768.9,Montgomery,AL


In [0]:
# Compare column counts before and after the college/crime join.
show_number_columns(collegedf, crimedf, multi_join)

Table 1 = 57
Table 2 = 4
Joined Table = 61



In [0]:
# select some columns to shrink the data set
multi_join = multi_join.select('State', 'City', 'School', 'In-state_Tuition', 'Avg_Fac_Sal', 
                               f.col('Violent Crime Rate Per 100,000 People').alias('violent_crime_100k'), 
                               f.col('Property Crime Rate Per 100,000 People').alias('property_crime_100k')  )

In [0]:
# Check the trimmed result and the renamed crime-rate columns.
multi_join.show(1)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 1 row



## Write to Parquet

In [0]:
# Output location on DBFS; the Parquet dataset and the CSV exports below are written under it.
outPath = 'dbfs:/FileStore/college_crime'



In [0]:
# Replace whatever is already at outPath with a snappy-compressed Parquet dataset.
multi_join.write.parquet(outPath, mode='overwrite', compression='snappy')

In [0]:
# List what the write produced: marker files plus the Parquet part file(s).
written_files = dbutils.fs.ls(outPath)
display(written_files)

path,name,size,modificationTime
dbfs:/FileStore/college_crime/_SUCCESS,_SUCCESS,0,1665515585000
dbfs:/FileStore/college_crime/_committed_2752579953311011263,_committed_2752579953311011263,220,1665515584000
dbfs:/FileStore/college_crime/_committed_893389491160514121,_committed_893389491160514121,111,1665515545000
dbfs:/FileStore/college_crime/_started_2752579953311011263,_started_2752579953311011263,0,1665515581000
dbfs:/FileStore/college_crime/_started_893389491160514121,_started_893389491160514121,0,1665515544000
dbfs:/FileStore/college_crime/part-00000-tid-2752579953311011263-06bfe378-5e59-4427-b40d-056081f4e238-18-1-c000.snappy.parquet,part-00000-tid-2752579953311011263-06bfe378-5e59-4427-b40d-056081f4e238-18-1-c000.snappy.parquet,45624,1665515583000


#### Read in data from parquet

In [0]:
# A Parquet dataset is basically a stored DataFrame: the schema is saved with the data.
# Read the dataset directory rather than a single part file. Part-file names embed a
# per-write id, so a hard-coded file name no longer exists once the write cell
# (mode 'overwrite') has been run again.
readTbl = (
  spark
  .read.parquet(outPath)
  )

In [0]:
# Verify the Parquet round trip by showing the first two rows read back.
display(readTbl.head(2))

State,City,School,In-state_Tuition,Avg_Fac_Sal,violent_crime_100k,property_crime_100k
AL,Birmingham,University of Alabama at Birmingham,8040,10517,1517.8,6934.1
AL,Montgomery,Amridge University,7180,3857,395.7,5768.9


## Save to Table

In [0]:
# Persist the result as the table `college_crime_data`, replacing it if it already exists.
multi_join.write.saveAsTable('college_crime_data', mode='overwrite')

#### Read From Table

In [0]:
%sql
-- List each distinct state/city/school combination from the saved table.
-- Rows are ordered by state only; the order within a state is not specified.
SELECT DISTINCT STATE, CITY, SCHOOL
FROM college_crime_data
ORDER BY STATE ASC

STATE,CITY,SCHOOL
AK,Anchorage,University of Alaska Anchorage
AK,Anchorage,Alaska Pacific University
AL,Huntsville,University of Alabama in Huntsville
AL,Montgomery,Huntingdon College
AL,Montgomery,Auburn University at Montgomery
AL,Tuscaloosa,The University of Alabama
AL,Mobile,University of South Alabama
AL,Huntsville,Oakwood University
AL,Birmingham,Jefferson State Community College
AL,Birmingham,Samford University


## Save to csv

In [0]:
# coalesce(1) reduces the data to one partition so a single part file is written.
# Spark creates a directory named results.csv; the tab-separated data is the part file inside it.
tsv_path = outPath + "/results.csv"

multi_join.coalesce(1).write.csv(path=tsv_path, mode='overwrite', sep='\t', header=True)

In [0]:
# Read the tab-separated export back and show the first rows.
# The source format is already fixed by calling .csv(); the previous `format='csv'`
# entry passed through .options() is not a CSV reader option and had no effect.
(
    spark
    .read
    .csv(f'{outPath}/results.csv', sep='\t', header=True)
).show(5)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
|   AL|Montgomery|  Amridge University|            7180|       3857|             395.7|             5768.9|
|   AL|Huntsville|University of Ala...|            9842|       9463|             923.3|             5041.6|
|   AL|Montgomery|Alabama State Uni...|            9220|       7952|             395.7|             5768.9|
|   AL|Tuscaloosa|The University of...|           10470|       9802|             519.7|             4728.6|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 5 rows



In [0]:
# Comma-separated export under the same output location.
csvFile = outPath + '/college_crime_results.csv'

# repartition(1) puts all rows in one partition so a single part file is written.
(
    multi_join
    .repartition(1)
    .write
    .mode('overwrite')
    .option('header', True)
    .csv(csvFile)
)

In [0]:
# Read the export back; without inferSchema every column is loaded as a string.
csv_df = spark.read.option('header', True).csv(csvFile)
csv_df.show(n=4)

+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|State|      City|              School|In-state_Tuition|Avg_Fac_Sal|violent_crime_100k|property_crime_100k|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
|   AL|Birmingham|University of Ala...|            8040|      10517|            1517.8|             6934.1|
|   AL|Montgomery|  Amridge University|            7180|       3857|             395.7|             5768.9|
|   AL|Huntsville|University of Ala...|            9842|       9463|             923.3|             5041.6|
|   AL|Montgomery|Alabama State Uni...|            9220|       7952|             395.7|             5768.9|
+-----+----------+--------------------+----------------+-----------+------------------+-------------------+
only showing top 4 rows



In [0]:
# Render the same four rows with display() for a table view.
display(csv_df.head(4))

State,City,School,In-state_Tuition,Avg_Fac_Sal,violent_crime_100k,property_crime_100k
AL,Birmingham,University of Alabama at Birmingham,8040,10517,1517.8,6934.1
AL,Montgomery,Amridge University,7180,3857,395.7,5768.9
AL,Huntsville,University of Alabama in Huntsville,9842,9463,923.3,5041.6
AL,Montgomery,Alabama State University,9220,7952,395.7,5768.9
