# Install Mamba

Install Mambaforge by following the instructions at https://github.com/conda-forge/miniforge#mambaforge

# Install JupyterLab and PySpark

After cloning this git repo, you can recreate the environment by running the following commands:
  
    mamba create -n projectkobe jupyterlab -c conda-forge
    mamba activate projectkobe
    mamba install openjdk pyspark findspark
    jupyter lab


# Start PySpark Context

In [1]:
import findspark
findspark.init()  # must run before `import pyspark` so the Spark install is found

import pyspark

# getOrCreate() reuses an already-running context, so re-executing this cell
# no longer fails with "Cannot run multiple SparkContexts at once".
spark_conf = pyspark.SparkConf().setAppName("projectKobe")
sc = pyspark.SparkContext.getOrCreate(conf=spark_conf)

22/10/05 20:39:57 WARN Utils: Your hostname, Sanjees-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.215 instead (on interface en0)
22/10/05 20:39:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 20:39:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Download NBA Data (JSON Format)

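Download the file from the link below and save it as `nbadata.json` in the notebook's working directory; the next cell reads it through that relative path.

https://drive.google.com/file/d/1wPqikhwb9psAClb08IMhBr0-m9Iev87t/view?usp=sharing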

# Load JSON File

In [15]:
from pyspark.sql import SparkSession

# SQLContext is deprecated since Spark 3.0; SparkSession is the supported entry
# point. getOrCreate() attaches to the SparkContext (`sc`) that is already running.
spark = SparkSession.builder.getOrCreate()

# Every field is inferred as a string. The schema also contains a
# `_corrupt_record` column, which Spark adds when some input lines could not be
# parsed as JSON records -- NOTE(review): confirm which lines of the file these are.
df = spark.read.json("nbadata.json")
df.printSchema()
df.show()



root
 |-- _corrupt_record: string (nullable = true)
 |-- age: string (nullable = true)
 |-- ast: string (nullable = true)
 |-- blk: string (nullable = true)
 |-- date: string (nullable = true)
 |-- drb: string (nullable = true)
 |-- fg: string (nullable = true)
 |-- fg3: string (nullable = true)
 |-- fg3_pct: string (nullable = true)
 |-- fg3a: string (nullable = true)
 |-- fg_pct: string (nullable = true)
 |-- fga: string (nullable = true)
 |-- ft: string (nullable = true)
 |-- ft_pct: string (nullable = true)
 |-- fta: string (nullable = true)
 |-- game_location: string (nullable = true)
 |-- game_result: string (nullable = true)
 |-- game_score: string (nullable = true)
 |-- gs: string (nullable = true)
 |-- mp: string (nullable = true)
 |-- name: string (nullable = true)
 |-- opp_id: string (nullable = true)
 |-- orb: string (nullable = true)
 |-- pf: string (nullable = true)
 |-- pts: string (nullable = true)
 |-- stl: string (nullable = true)
 |-- tm: string (nullable = true)
 |-

                                                                                

# Filter by player name

In [17]:
# Discard the parse-error column, then keep only the rows for Kobe Bryant.
without_corrupt = df.drop('_corrupt_record')
df = without_corrupt.where(without_corrupt["name"] == "Kobe_Bryant")

# Sort by age

In [22]:
# Use the Column API so this cell needs no import: `asc` is only imported in a
# later cell, which made this cell raise NameError on a fresh kernel.
# `age` is a string column, so the order is lexicographic; that matches
# chronological order only while values stay zero-padded like "18-070" -- verify.
df.sort(df["age"].asc()).show()

+------+----+----+----------+----+----+----+-------+----+------+----+----+------+----+-------------+-----------+----------+----+-----+-----------+------+----+----+----+----+---+----+----+
|   age| ast| blk|      date| drb|  fg| fg3|fg3_pct|fg3a|fg_pct| fga|  ft|ft_pct| fta|game_location|game_result|game_score|  gs|   mp|       name|opp_id| orb|  pf| pts| stl| tm| tov| trb|
+------+----+----+----------+----+----+----+-------+----+------+----+----+------+----+-------------+-----------+----------+----+-----+-----------+------+----+----+----+----+---+----+----+
|18-070|null|null|1996-11-01|null|null|null|   null|null|  null|null|null|  null|null|         null|    W (+14)|      null|null| null|Kobe_Bryant|   PHO|null|null|null|null|LAL|null|null|
|18-072|   0|   1|1996-11-03|   1|   0|   0|   null|   0|  .000|   1|   0|  null|   0|         null|     W (+6)|      -1.1|   0| 6:22|Kobe_Bryant|   MIN|   0|   1|   0|   0|LAL|   1|   1|
|18-074|   0|   0|1996-11-05|   0|   0|   0|   null|   0|  .

# Sort by PTS

In [24]:
from pyspark.sql.functions import asc, desc
from pyspark.sql.types import IntegerType

# `pts` is loaded as a string; cast it to an integer so the sort is numeric
# rather than lexicographic, then list the 100 highest-scoring games.
pts_as_int = df["pts"].cast(IntegerType())
df = df.withColumn("pts", pts_as_int)
df.orderBy(desc("pts")).show(100)

+------+---+---+----------+---+---+---+-------+----+------+---+---+------+---+-------------+-----------+----------+---+-----+-----------+------+---+---+---+---+---+---+---+
|   age|ast|blk|      date|drb| fg|fg3|fg3_pct|fg3a|fg_pct|fga| ft|ft_pct|fta|game_location|game_result|game_score| gs|   mp|       name|opp_id|orb| pf|pts|stl| tm|tov|trb|
+------+---+---+----------+---+---+---+-------+----+------+---+---+------+---+-------------+-----------+----------+---+-----+-----------+------+---+---+---+---+---+---+---+
|27-152|  2|  1|2006-01-22|  4| 28|  7|   .538|  13|  .609| 46| 18|  .900| 20|         null|    W (+18)|      63.5|  1|41:56|Kobe_Bryant|   TOR|  2|  1| 81|  3|LAL|  3|  6|
|28-205|  3|  0|2007-03-16|  6| 23|  8|   .667|  12|  .590| 39| 11|  .917| 12|         null|     W (+5)|      50.9|  1|49:58|Kobe_Bryant|   POR|  1|  3| 65|  3|LAL|  2|  7|
|27-119|  0|  0|2005-12-20|  5| 18|  4|   .400|  10|  .581| 31| 22|  .880| 25|         null|    W (+22)|      49.7|  1|32:53|Kobe_Bryan

# Get average PTS

In [28]:
from pyspark.sql.functions import avg

# Aggregate over the whole frame: a single row holding the mean of `pts`.
mean_points = df.agg(avg("pts"))
mean_points.show()

+------------------+
|          avg(pts)|
+------------------+
|24.994799405646358|
+------------------+



# Count number of games with non-null STL

In [35]:
from pyspark.sql.functions import count

# count(column) skips nulls, so this is the number of rows with a recorded `stl`.
games_with_stl = df.agg(count("stl"))
games_with_stl.show()

+----------+
|count(stl)|
+----------+
|      1346|
+----------+

