In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession for this notebook, or reuse one that is already
# running (getOrCreate returns the existing session when there is one).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Location of the JSON input. Spark accepts either a single text file or a
# directory of text files here.
path = "items.json"

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

# Read the dataset; the schema is inferred from the JSON records.
df = spark.read.json(path)

# Print the inferred schema as a tree.
df.printSchema()

root
 |-- age: string (nullable = true)
 |-- ast: string (nullable = true)
 |-- blk: string (nullable = true)
 |-- date: string (nullable = true)
 |-- drb: string (nullable = true)
 |-- fg: string (nullable = true)
 |-- fg3: string (nullable = true)
 |-- fg3_pct: string (nullable = true)
 |-- fg3a: string (nullable = true)
 |-- fg_pct: string (nullable = true)
 |-- fga: string (nullable = true)
 |-- ft: string (nullable = true)
 |-- ft_pct: string (nullable = true)
 |-- fta: string (nullable = true)
 |-- game_location: string (nullable = true)
 |-- game_result: string (nullable = true)
 |-- game_score: string (nullable = true)
 |-- gs: string (nullable = true)
 |-- mp: string (nullable = true)
 |-- name: string (nullable = true)
 |-- opp_id: string (nullable = true)
 |-- orb: string (nullable = true)
 |-- pf: string (nullable = true)
 |-- pts: string (nullable = true)
 |-- stl: string (nullable = true)
 |-- tm: string (nullable = true)
 |-- tov: string (nullable = true)
 |-- trb: string (nullable = true)

In [27]:
# Display the first row of df as a quick look at the loaded records.
df.head()

Row(age='26-133', ast='0', blk='0', date='1994-11-04', drb='0', fg='0', fg3='0', fg3_pct=None, fg3a='0', fg_pct=None, fga='0', ft='0', ft_pct=None, fta='0', game_location=None, game_result='W (+18)', game_score='0.9', gs='0', mp='3:00', name='Alaa_Abdelnaby', opp_id='PHO', orb='1', pf='2', pts='0', stl='1', tm='SAC', tov='0', trb='1')

In [28]:
from datetime import datetime
from pyspark.sql.functions import col, to_date, udf
from pyspark.sql.types import DateType

# Python-UDF date parser, retained in case another cell references `func`.
# It now passes nulls through instead of raising TypeError from strptime(None, ...).
func = udf(
    lambda x: datetime.strptime(x, '%Y-%m-%d') if x is not None else None,
    DateType(),
)

# Convert the string `date` column to a real DateType column with Spark's
# built-in to_date rather than the Python UDF:
#   * null inputs stay null instead of failing the job,
#   * parsing runs inside the JVM, avoiding per-row Python serialization.
# Note: with ANSI mode off, strings that do not match the pattern become null
# rather than raising.
df = df.withColumn('date', to_date(col('date'), 'yyyy-MM-dd'))

In [29]:
# Display the first row of df again to inspect the value held in `date`.
df.head()

Row(age='26-133', ast='0', blk='0', date=datetime.date(1994, 11, 4), drb='0', fg='0', fg3='0', fg3_pct=None, fg3a='0', fg_pct=None, fga='0', ft='0', ft_pct=None, fta='0', game_location=None, game_result='W (+18)', game_score='0.9', gs='0', mp='3:00', name='Alaa_Abdelnaby', opp_id='PHO', orb='1', pf='2', pts='0', stl='1', tm='SAC', tov='0', trb='1')

In [40]:
# Imported here so the cell runs on a fresh kernel without relying on names
# left over from earlier interactive sessions.
from pyspark.sql.functions import col, lit

# Keep only games played strictly after this calendar date.
# (The previous version also built an unused `date_from` list from
# ("2018-08-01"), which is a plain string rather than a tuple, so it iterated
# over single characters. The filter that actually ran used 2018-09-01; that
# cutoff is preserved here.)
CUTOFF_DATE = "2018-09-01"

# Compare against a date literal directly; no unix_timestamp round-trip needed.
df2 = df.filter(col("date") > lit(CUTOFF_DATE).cast("date"))



In [39]:
# Display the first row of the filtered DataFrame df2.
df2.head()

Row(age='25-076', ast='0', blk='0', date=datetime.date(2018, 10, 16), drb='2', fg='3', fg3='2', fg3_pct='.333', fg3a='6', fg_pct='.375', fga='8', ft='0', ft_pct=None, fta='0', game_location='@', game_result='L (-8)', game_score='3.4', gs='0', mp='23:28', name='Álex_Abrines', opp_id='GSW', orb='0', pf='2', pts='8', stl='0', tm='OKC', tov='0', trb='2')

In [42]:
# Spark's CSV writer produces a *directory* of part files, not a single file.
# Use a directory name that cannot collide with a single-file export called
# 'games2018+.csv'.
#   mode='overwrite' : lets the cell be re-run (the default mode errors when the
#                      path already exists).
#   header=True      : writes the column names, which Spark omits by default.
df2.write.csv('games2018+_spark_csv', mode='overwrite', header=True)

In [44]:
# Collect the filtered rows to the driver as a pandas DataFrame and save them
# as one CSV file. index=False keeps pandas' auto-generated row index out of
# the file; it is not part of the data and would otherwise appear as an
# unnamed first column.
df2.toPandas().to_csv('games2018+.csv', index=False)
