In [0]:
#Spark DataFrames Practice

In [0]:
#1.) Start a simple Spark session

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('test').getOrCreate()

In [0]:
#2.) Load the Walmart dataset

df = spark.read.table('walmart_stock_1_csv')
df.show()

+-------------------+------------------+------------------+------------------+------------------+--------+------------------+
|               Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+--------+------------------+
|2012-01-03 00:00:00|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|
|2012-01-04 00:00:00|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|
|2012-01-05 00:00:00|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|
|2012-01-06 00:00:00|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51.45922|
|2012-01-09 00:00:00|         59.029999|         59.549999|         58.919998|             59.18| 6679300|51.616215000

In [0]:
#3.) What are the column names?

df.columns

Out[4]: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [0]:
#4.) What does the schema look like?

df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [0]:
#5.) Print out the first 5 rows

df.head(5)

Out[6]: [Row(Date=datetime.datetime(2012, 1, 3, 0, 0), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996),
 Row(Date=datetime.datetime(2012, 1, 4, 0, 0), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475),
 Row(Date=datetime.datetime(2012, 1, 5, 0, 0), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539),
 Row(Date=datetime.datetime(2012, 1, 6, 0, 0), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922),
 Row(Date=datetime.datetime(2012, 1, 9, 0, 0), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)]

In [0]:
#6.) Use describe() to learn about dataset

df.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

In [0]:
#Bonus question: Too many decimals for mean and std-dev in describe() function; format numbers to 
# just show 2 decimals

In [0]:
# from pyspark.sql.functions import format_number, countDistinct, avg, stddev

# # df.select(stddev('Sales').alias('STD'))
# describe_table[1:]

df.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



In [0]:
from pyspark.sql.functions import format_number

describe_table = df.describe()
describe_table.select(describe_table['summary'],
                      format_number(describe_table['Open'].cast('float'),2).alias('Open'),
                      format_number(describe_table['High'].cast('float'),2).alias('High'),
                      format_number(describe_table['Low'].cast('float'),2).alias('Low'),
                      format_number(describe_table['Close'].cast('float'),2).alias('Close'),
                      format_number(describe_table['Volume'].cast('float'),2).alias('Volume'),
                      format_number(describe_table['Adj Close'].cast('float'),2).alias('Adj Close')
).show()

+-------+--------+--------+--------+--------+-------------+---------+
|summary|    Open|    High|     Low|   Close|       Volume|Adj Close|
+-------+--------+--------+--------+--------+-------------+---------+
|  count|1,258.00|1,258.00|1,258.00|1,258.00|     1,258.00| 1,258.00|
|   mean|   72.36|   72.84|   71.92|   72.39| 8,222,093.50|    67.24|
| stddev|    6.77|    6.77|    6.74|    6.76| 4,519,781.00|     6.72|
|    min|   56.39|   57.06|   56.30|   56.42| 2,094,900.00|    50.36|
|    max|   90.80|   90.97|   89.25|   90.47|80,898,096.00|    84.91|
+-------+--------+--------+--------+--------+-------------+---------+



In [0]:
#Bonus Part 2: Create new dataframe with column called HV Ratio --> High Price / Volume for a day, just show new column

In [0]:
df.withColumn('HV Ratio',df['High']/df['Volume']).show()

+-------------------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|               Date|              Open|              High|               Low|             Close|  Volume|         Adj Close|            HV Ratio|
+-------------------+------------------+------------------+------------------+------------------+--------+------------------+--------------------+
|2012-01-03 00:00:00|         59.970001|         61.060001|         59.869999|         60.330002|12668800|52.619234999999996|4.819714653321546E-6|
|2012-01-04 00:00:00|60.209998999999996|         60.349998|         59.470001|59.709998999999996| 9593300|         52.078475|6.290848613094555E-6|
|2012-01-05 00:00:00|         59.349998|         59.619999|         58.369999|         59.419998|12768200|         51.825539|4.669412994783916E-6|
|2012-01-06 00:00:00|         59.419998|         59.450001|         58.869999|              59.0| 8069400|          51

In [0]:
#Bonus Part 3: Which day had the highest price?

In [0]:
df.orderBy(df['High'].desc()).head(1)[0][0]

Out[12]: datetime.datetime(2015, 1, 13, 0, 0)

In [0]:
#Bonus Part 4: What is the mean of the Close column?
df.agg({'Close':'mean'}).show()
#Bonus Part 5: What are the min, max of Volume column?
df.agg({'Volume':'min'}).show()
df.agg({'Volume':'max'}).show()


+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+

+-----------+
|min(Volume)|
+-----------+
|    2094900|
+-----------+

+-----------+
|max(Volume)|
+-----------+
|   80898100|
+-----------+



In [0]:
#Bonus Part 6: How many days was Close lower than 60 dollars?

In [0]:
result = df.filter(df['Close']<60)
num_rows = result.count()
num_rows


Out[14]: 81

In [0]:
# What %age of time was High greater than 80 dollars?
# # of days high > 80 / total number of days in dataset
num = df.filter(df['High']>80).count()
denom = df.count()
(num/denom)*100


Out[15]: 9.141494435612083

In [0]:
#Correlation between High and volume

In [0]:
from pyspark.sql.functions import corr
df.select(corr('High','Volume')).show()


+-------------------+
| corr(High, Volume)|
+-------------------+
|-0.3384326061737161|
+-------------------+



In [0]:
#Max high per year?

In [0]:
from pyspark.sql.functions import year, month
newdf = df.withColumn("Year",year(df['Date']))
result = newdf.groupBy("Year").max().select(['Year','max(High)'])
result.show()

+----+---------+
|Year|max(High)|
+----+---------+
|2015|90.970001|
|2013|81.370003|
|2014|88.089996|
|2012|77.599998|
|2016|75.190002|
+----+---------+



In [0]:
#Average close for each month?
newdf = df.withColumn("Month",month(df['Date']))
result = newdf.groupBy("Month").avg().select(['Month','avg(Close)']).orderBy('Month')
result.show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+

