In [108]:
import numpy as np 
import pandas as pd
from IPython.core.display import display
from datetime import datetime 

In [121]:
# Read the split-adjusted price table from the input directory
df = pd.read_csv("../input/prices-split-adjusted.csv")

# Preview the first 20 rows whose symbol is anything other than WLTW
display(df[df["symbol"].ne("WLTW")].head(20))

Unnamed: 0,date,symbol,open,close,low,high,volume
251,2010-01-04,A,22.453504,22.389128,22.267525,22.62518,3815500.0
252,2010-01-04,AAL,4.84,4.77,4.66,4.94,9837300.0
253,2010-01-04,AAP,40.700001,40.380001,40.360001,41.040001,1701700.0
254,2010-01-04,AAPL,30.49,30.572857,30.34,30.642857,123432400.0
255,2010-01-04,ABC,26.290001,26.629999,26.139999,26.690001,2455900.0
256,2010-01-04,ABT,26.000339,26.129884,25.870792,26.177866,10829000.0
257,2010-01-04,ACN,41.52,42.07,41.5,42.200001,3650100.0
258,2010-01-04,ADBE,36.650002,37.09,36.650002,37.299999,4710200.0
259,2010-01-04,ADI,31.790001,31.67,31.610001,32.189999,2102700.0
260,2010-01-04,ADM,31.48,31.469999,31.33,31.84,3472500.0


In [129]:
# Keep one date cell as it was before conversion (a plain string on the first run)
a = df.loc[1, "date"]

# Make the dates into actual datetime values for easier comparison.
# pd.to_datetime converts the whole column at once and leaves an
# already-converted column untouched, so this cell can be re-run safely
# (mapping datetime.strptime over the column raises on the second run because
# the values are no longer strings).
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

# Sanity check: show one converted value and its type
print(df.loc[1, "date"])
print(type(df.loc[1, "date"]))

2016-01-06 00:00:00
<class 'pandas._libs.tslib.Timestamp'>


In [160]:
def stock(name, start="2010-01-01", end="2016-12-30", data=None):
    '''
    Returns the rows for stock name whose date lies between start and end.

    Both bounds are inclusive: a row is kept when start <= date <= end.
    Start and end are strings in the format YYYY-MM-DD.

    name  -- value to match in the "symbol" column
    start -- first date to keep
    end   -- last date to keep
    data  -- frame to filter; it must have a "symbol" column and a "date"
             column that can be compared with datetime objects. When omitted,
             the notebook-level df is used.

    Raises ValueError (from datetime.strptime) if start or end is not in the
    YYYY-MM-DD format.
    '''
    if data is None:
        data = df  # fall back to the frame loaded earlier in the notebook

    first_day = datetime.strptime(start, "%Y-%m-%d")
    last_day = datetime.strptime(end, "%Y-%m-%d")

    # Narrow to the symbol first, then build the date mask from that same
    # narrowed frame so the mask and the frame it indexes always share an index
    # (and the date comparison only touches this symbol's rows).
    df_stock = data.loc[data["symbol"] == name]
    in_range = df_stock["date"].between(first_day, last_day)  # inclusive on both ends

    return df_stock.loc[in_range]

def variance(name, start="2010-01-01", end="2016-12-30"):
    '''
    Shortcut for variance_n_day with a sampling interval of one day.

    name is the stock symbol; start and end are strings in the format
    YYYY-MM-DD and are forwarded unchanged, as is the result.
    '''
    return variance_n_day(name, n=1, start=start, end=end)

def variance_n_day(name, n, start="2010-01-01", end="2016-12-30"):
    '''
    Returns the variance of a stock's "open" price, sampled every n days.

    The rows come from stock(name, start, end). n is read as a number of
    calendar days and turned into a row step by removing 2 days for every full
    7, since the stock market doesn't operate on the weekends. Every step-th
    row is kept and the population variance (divide by the number of values,
    ddof=0) of the "open" column of those rows is returned.

    NOTE(review): this is the variance of the open prices themselves, not of
    n-day returns -- confirm that is the intended quantity.

    Start and end are strings in the format YYYY-MM-DD.

    Returns a float; nan when no rows are selected.
    Raises ValueError if n is smaller than 1.
    '''
    if n < 1:
        # a zero step would otherwise fail later with an unhelpful slicing error
        raise ValueError("n must be at least 1, got %r" % (n,))

    # calendar days -> trading-day row step (drop 2 weekend days per full week)
    step = n - (n // 7) * 2

    # positional stepping: keep rows 0, step, 2*step, ...
    sampled = stock(name, start, end).iloc[::step]

    # Series.var returns a scalar, so no one-element Series has to be coerced to float
    return float(sampled["open"].var(ddof=0))
    
# Variance over the whole default date range for a few tickers
for ticker in ("AAPL", "YHOO"):
    print(variance(ticker))
print(variance("MSFT"), "\n")

# AAPL again, sampled at increasingly wide intervals (in days)
for interval in (1, 30, 180, 365):
    print(variance_n_day("AAPL", interval))
print(variance_n_day("AAPL", 1825), "\n")

# AAPL one calendar year at a time: Jan 1 of each year to Jan 1 of the next
for year in range(2010, 2017):
    year_start = "%s-01-01" % (year)
    year_end = "%s-01-01" % (year + 1)
    print(variance("AAPL", year_start, year_end))

802.6431658467568
130.17379566317456
116.49085877328788 

802.6431658467568
815.7518385684572
859.7336310452996
853.7132680829372
2107.7280212964874 

28.577140537271692
13.709338286821026
92.69630601034486
41.16472368021979
178.37970910846576
58.870941366235606
57.32103636444055
