## Reading, Writing, and Validating Data Using PySpark

In [1]:
# Bring in PySpark and start (or reuse) the session used by the rest of the notebook
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("ReadWriteVal")
    .getOrCreate()
)
spark

In [2]:
#To know the number of cores I'm working with
# defaultParallelism is the public API for this; in local mode it equals the
# number of cores Spark was given. Counting the keys of
# getExecutorMemoryStatus() through the private _jsc handle would instead give
# the number of registered executors, not cores.
cores = spark.sparkContext.defaultParallelism
print("Number of cores:", cores)

Number of cores: 1


In [4]:
# Load the PGA Tour CSV: first line supplies the column names, types are inferred by Spark
path = "Datasets_intro/"
data = spark.read.options(header=True, inferSchema=True).csv(path + "pga_tour_historical.csv")

In [6]:
# View the first 5 rows of the dataframe; limit(5) is applied before toPandas(),
# so only those rows are converted to a pandas DataFrame for display
data.limit(5).toPandas()

Unnamed: 0,Player Name,Season,Statistic,Variable,Value
0,Robert Garrigus,2010,Driving Distance,Driving Distance - (ROUNDS),71
1,Bubba Watson,2010,Driving Distance,Driving Distance - (ROUNDS),77
2,Dustin Johnson,2010,Driving Distance,Driving Distance - (ROUNDS),83
3,Brett Wetterich,2010,Driving Distance,Driving Distance - (ROUNDS),54
4,J.B. Holmes,2010,Driving Distance,Driving Distance - (ROUNDS),100


In [7]:
# Print the schema to check the type Spark inferred for each column
data.printSchema()

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: string (nullable = true)



In [8]:
# describe() only builds a summary DataFrame; with no action such as .show(),
# this cell displays just that DataFrame's column signature, not the statistics
data.describe()

DataFrame[summary: string, Player Name: string, Season: string, Statistic: string, Variable: string, Value: string]

In [10]:
#Editing the schema during the read in
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# One nullable StructField per column, built from (name, type) pairs
data_schema = [
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ("Player Name", StringType),
        ("Season", IntegerType),
        ("Statistic", StringType),
        ("Variable", StringType),
        ("Value", IntegerType),
    )
]

In [11]:
# Wrap the field list in a StructType so it can be handed to the reader as a schema
final_struct = StructType(data_schema)

In [13]:
# Re-read the CSV with the explicit schema. header=True is needed here as well:
# the file's first line holds the column names, and without this flag Spark
# would parse that line as a data row.
path = "Datasets_intro/"
data = spark.read.csv(path+'pga_tour_historical.csv', schema=final_struct, header=True)

In [14]:
# Print the schema again to confirm the explicitly supplied column types were applied
data.printSchema()

root
 |-- Player Name: string (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Statistic: string (nullable = true)
 |-- Variable: string (nullable = true)
 |-- Value: integer (nullable = true)



In [15]:
# Summary statistics restricted to the Value column
data.describe('Value').show()

+-------+------------------+
|summary|             Value|
+-------+------------------+
|  count|           1657247|
|   mean|12494.388998743096|
| stddev| 157274.7567357075|
|    min|              -178|
|    max|           3564954|
+-------+------------------+



In [16]:
# Count, minimum and maximum for two variables (Season and Value)
(
    data
    .select('Season', 'Value')
    .summary('count', 'min', 'max')
    .show()
)

+-------+-------+-------+
|summary| Season|  Value|
+-------+-------+-------+
|  count|2740403|1657247|
|    min|   2010|   -178|
|    max|   2018|3564954|
+-------+-------+-------+

