# Walmart Stock Analysis

Walmart Stock market data from the years 2012-2017.

#### Importing spark and loading dependencies

In [83]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark

#### Starting a simple Spark Session

In [84]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('walmart_stock').getOrCreate()

#### Loading the Walmart Stock CSV File, having Spark infer the data types.

In [85]:
walmart_df = spark.read.csv('walmart_stock.csv', header=True, inferSchema=True)

In [86]:
# print out number of rows and columns
print((walmart_df.count(), len(walmart_df.columns)))

(1258, 7)


#### Printing the clomn names

In [87]:
walmart_df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

#### print the schema

In [88]:
walmart_df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



#### Print out the first 5 columns.

In [89]:
walmart_df.head(5)

[Row(Date=datetime.datetime(2012, 1, 3, 0, 0), Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996),
 Row(Date=datetime.datetime(2012, 1, 4, 0, 0), Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475),
 Row(Date=datetime.datetime(2012, 1, 5, 0, 0), Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539),
 Row(Date=datetime.datetime(2012, 1, 6, 0, 0), Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922),
 Row(Date=datetime.datetime(2012, 1, 9, 0, 0), Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)]

#### Use describe() to learn about the DataFrame.

In [90]:
walmart_df.describe().show()

+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|summary|              Open|             High|              Low|            Close|           Volume|        Adj Close|
+-------+------------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|  count|              1258|             1258|             1258|             1258|             1258|             1258|
|   mean| 72.35785375357709|72.83938807631165| 71.9186009594594|72.38844998012726|8222093.481717011|67.23883848728146|
| stddev|  6.76809024470826|6.768186808159218|6.744075756255496|6.756859163732991|  4519780.8431556|6.722609449996857|
|    min|56.389998999999996|        57.060001|        56.299999|        56.419998|          2094900|        50.363689|
|    max|         90.800003|        90.970001|            89.25|        90.470001|         80898100|84.91421600000001|
+-------+------------------+-----------------+--

#### Format the columns of walmart_df.describe() to 2 significant digits

In [91]:
# printing the schema of walmart_df.describe()
walmart_df.describe().printSchema()

root
 |-- summary: string (nullable = true)
 |-- Open: string (nullable = true)
 |-- High: string (nullable = true)
 |-- Low: string (nullable = true)
 |-- Close: string (nullable = true)
 |-- Volume: string (nullable = true)
 |-- Adj Close: string (nullable = true)



> All the columns are of type string. We would need to cast the columns in order to do so.
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.Column.cast

In [92]:
from pyspark.sql.functions import format_number

In [93]:
# store the description in a var
walmart_desc = walmart_df.describe()

In [94]:
# cast the result
walmart_desc.select(walmart_desc['summary'],
                   format_number(walmart_desc['Open'].cast('float'),2).alias('Open'),
                   format_number(walmart_desc['High'].cast('float'),2).alias('High'),
                   format_number(walmart_desc['Low'].cast('float'),2).alias('Low'),
                   format_number(walmart_desc['Close'].cast('float'),2).alias('Close'),
                   walmart_desc['Volume'].cast('int').alias('Volume')
                   ).show()

+-------+--------+--------+--------+--------+--------+
|summary|    Open|    High|     Low|   Close|  Volume|
+-------+--------+--------+--------+--------+--------+
|  count|1,258.00|1,258.00|1,258.00|1,258.00|    1258|
|   mean|   72.36|   72.84|   71.92|   72.39| 8222093|
| stddev|    6.77|    6.77|    6.74|    6.76| 4519780|
|    min|   56.39|   57.06|   56.30|   56.42| 2094900|
|    max|   90.80|   90.97|   89.25|   90.47|80898100|
+-------+--------+--------+--------+--------+--------+



#### Create a new dataframe with a column called HV Ratio that is the ratio of the High Price versus volume of stock traded for a day.

In [95]:
walmart_df_hv_ratio = walmart_df.withColumn('HV Ratio', (walmart_df['High'] / walmart_df['Volume']))
walmart_df_hv_ratio.select('HV Ratio').show()

+--------------------+
|            HV Ratio|
+--------------------+
|4.819714653321546E-6|
|6.290848613094555E-6|
|4.669412994783916E-6|
|7.367338463826307E-6|
|8.915604778943901E-6|
|8.644477436914568E-6|
|9.351828421515645E-6|
| 8.29141562102703E-6|
|7.712212102001476E-6|
|7.071764823529412E-6|
|1.015495466386981E-5|
|6.576354146362592...|
| 5.90145296180676E-6|
|8.547679455011844E-6|
|8.420709512685392E-6|
|1.041448341728929...|
|8.316075414862431E-6|
|9.721183814992126E-6|
|8.029436027707578E-6|
|6.307432259386365E-6|
+--------------------+
only showing top 20 rows



#### What day had the Peak High in Price?

In [96]:
# order by high column and take the head
walmart_df.orderBy(walmart_df['High'].desc()).head(1)

[Row(Date=datetime.datetime(2015, 1, 13, 0, 0), Open=90.800003, High=90.970001, Low=88.93, Close=89.309998, Volume=8215400, Adj Close=83.825448)]

In [97]:
# index the object above to get the date time
walmart_df.orderBy(walmart_df['High'].desc()).head(1)[0][0]

datetime.datetime(2015, 1, 13, 0, 0)

#### What is the mean of the Close column?

In [98]:
from pyspark.sql.functions import mean
walmart_df.select(mean('Close')).show()

+-----------------+
|       avg(Close)|
+-----------------+
|72.38844998012726|
+-----------------+



#### What is the max and min of the Volume column?

In [99]:
from pyspark.sql.functions import max, min

In [100]:
walmart_df.select(max('Volume'), min('Volume')).show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|   80898100|    2094900|
+-----------+-----------+



#### How many days was the Close lower than 60 dollars?

In [101]:
# filter the data less than 60 and count
walmart_df.filter('Close < 60').count()

81

In [102]:
walmart_df.filter(walmart_df['Close'] < 60).count()

81

In [103]:
# pyspark has count function as well
from pyspark.sql.functions import count
walmart_df.filter(walmart_df['Close'] < 60).select(count('Close')).show()

+------------+
|count(Close)|
+------------+
|          81|
+------------+



#### What percentage of the time was the High greater than 80 dollars ?
#### In other words, (Number of Days High>80)/(Total Days in the dataset)

In [104]:
(walmart_df.filter(walmart_df['High']>80).count() / walmart_df.count())*100 #multiply 100 for percent

9.141494435612083

#### What is the Pearson correlation between High and Volume?
http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameStatFunctions.corr

In [105]:
# import the correlation fucntion from pyspark
from pyspark.sql.functions import corr

In [106]:
walmart_df.select(corr(walmart_df['High'], walmart_df['Volume'])).show()

+-------------------+
| corr(High, Volume)|
+-------------------+
|-0.3384326061737161|
+-------------------+



#### What is the max High per year?

In [107]:
# import year
from pyspark.sql.functions import year

In [108]:
walmart_df_year = walmart_df.withColumn('Year', year(walmart_df['Date']))
# display first five rows of new year column
walmart_df_year.select('Year').head(5)

[Row(Year=2012),
 Row(Year=2012),
 Row(Year=2012),
 Row(Year=2012),
 Row(Year=2012)]

In [109]:
walmart_df_year_max = walmart_df_year.groupBy('Year').max()
# display head
walmart_df_year_max.head()

Row(Year=2015, max(Open)=90.800003, max(High)=90.970001, max(Low)=89.25, max(Close)=90.470001, max(Volume)=80898100, max(Adj Close)=84.91421600000001, max(Year)=2015)

In [110]:
walmart_df_year_max.select('Year', 'max(High)').show()

+----+---------+
|Year|max(High)|
+----+---------+
|2015|90.970001|
|2013|81.370003|
|2014|88.089996|
|2012|77.599998|
|2016|75.190002|
+----+---------+



#### What is the average Close for each Calendar Month?
#### In other words, across all the years, what is the average Close price for Jan,Feb, Mar, etc... Your result will have a value for each of these months. 

In [111]:
# import month
from pyspark.sql.functions import month

In [112]:
walmart_df_month = walmart_df.withColumn('Month', month('Date'))
# display first five rows of new month column
walmart_df_month.select('Month').head(5)

[Row(Month=1), Row(Month=1), Row(Month=1), Row(Month=1), Row(Month=1)]

In [113]:
walmart_df_month_avg = walmart_df_month.select(['Month', 'Close']).groupBy('Month').mean()
# show head
walmart_df_month_avg.head()

Row(Month=12, avg(Month)=12.0, avg(Close)=72.84792478301885)

In [114]:
walmart_df_month_avg.select(['Month', 'avg(Close)']).orderBy('Month').show()

+-----+-----------------+
|Month|       avg(Close)|
+-----+-----------------+
|    1|71.44801958415842|
|    2|  71.306804443299|
|    3|71.77794377570092|
|    4|72.97361900952382|
|    5|72.30971688679247|
|    6| 72.4953774245283|
|    7|74.43971943925233|
|    8|73.02981855454546|
|    9|72.18411785294116|
|   10|71.57854545454543|
|   11| 72.1110893069307|
|   12|72.84792478301885|
+-----+-----------------+



---