In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# Walk the Kaggle input tree, echoing each file path and collecting it in list_fn
# so later cells can load the files by position.
list_fn = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for file_path in (os.path.join(dirname, name) for name in filenames):
        print(file_path)
        list_fn.append(file_path)
        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Create the two DataFrames from the files discovered under the input directory.
# os.walk() yields files in an arbitrary, filesystem-dependent order, so which CSV
# sits at list_fn[0] vs list_fn[1] is not guaranteed between environments.
if len(list_fn) < 2:
    raise IndexError(
        f"expected at least two input files in list_fn, found {len(list_fn)}"
    )

df_stocks = pd.read_csv(list_fn[0])
df_prices = pd.read_csv(list_fn[1])

# Later cells read df_prices['date'] (per-day price rows), so if only the first
# frame carries a 'date' column the two files came back in the opposite order.
if 'date' in df_stocks.columns and 'date' not in df_prices.columns:
    df_stocks, df_prices = df_prices, df_stocks

In [None]:
# Display df_stocks as the cell's output (a bare final expression is rendered by Jupyter).
df_stocks

### df_stocks provides the ticker symbol, exchange, full name, sector, and industry

In [None]:
# Display df_prices as the cell's output (a bare final expression is rendered by Jupyter).
df_prices

### df_prices shows the ticker symbol, date, open, close, adjusted closing price, low, high, and volume for each day

#### <span style="color:blue"> adjusted closing price = the closing price adjusted for the stock's dividends, stock splits, and new stock offerings. The adjusted closing price reflects the change in stock value caused by these corporate actions, such as new offerings from the corporation</span>

#### <span style="color:red"> volume = how much of a given financial asset has traded in a period of time</span>

In [None]:
# df_stocks analysis
print(df_stocks.keys())
# Numeric columns, selected through the public select_dtypes API rather than the
# private DataFrame._get_numeric_data(), which may change between pandas releases.
stocks_num_cols = df_stocks.select_dtypes(include='number').columns
print(stocks_num_cols)  # no numeric columns in df_stocks

# df_prices analysis
print(df_prices.keys())
num_cols = df_prices.select_dtypes(include='number').columns
print(num_cols)  # open,close,adj_close,low,high,volume are num_cols

# Every remaining column is treated as categorical. Building the list from the
# frame's own column order keeps the result identical on every run (a set
# difference has no defined order).
cat_cols = [col for col in df_prices.columns if col not in num_cols]
print(cat_cols)  # date,ticker are cat_cols

### df_stocks has 5 attributes
### df_prices has 8 attributes, 2 of which are categorical and 6 are numerical


## Look for missing data

In [None]:
# Missing-value report: for each frame print the per-column null count
# (largest first) followed by the same figures as a percentage of the row count.
frames_to_check = (("df_stocks", df_stocks), ("df_prices", df_prices))
for position, (label, frame) in enumerate(frames_to_check):
    if position:
        print()  # blank line separating the two reports
    missing_counts = frame.isnull().sum().sort_values(ascending=False)
    print(label)
    print(missing_counts)
    print()
    print("% data missing")
    print(missing_counts / len(frame) * 100)

### df_stocks has 1440 sector and industry values missing 
### df_prices has 0 values missing

In [None]:
# Keep only the df_stocks rows that have a null in at least one column, then display them.
has_missing = df_stocks.isna().any(axis='columns')
df_null = df_stocks.loc[has_missing]
df_null



In [None]:
# Number of distinct values per column (a missing value counts as one distinct value).
print("df_stocks")
for column in df_stocks.columns:
    print(column, ": ", df_stocks[column].nunique(dropna=False))
print()
print("df_prices")
for column in cat_cols:  # cat_cols holds the categorical columns of df_prices
    print(column, ": ", df_prices[column].nunique(dropna=False))

#### the number of unique tickers in df_stocks and df_prices does not match
#### we need to create a df with the tickers that appear in both df_stocks and df_prices

In [None]:
# Find the tickers present in both frames.
# A set has no defined iteration order (and string hashing differs between kernel
# sessions), so sort the intersection to get the same ticker order on every run.
# key=str keeps the sort safe if a non-string value ever appears among the tickers.
common_tickers = sorted(set(df_stocks.ticker) & set(df_prices.ticker), key=str)
print(len(common_tickers))

#all tickers in df_prices are in df_stocks

### Now let's look at df_prices and the % growth in closing price vs adjusted closing price of every ticker in each decade

In [None]:
# Periods to look at (both end dates inclusive):
# 1970-1980 = 1
# 1981-1990 = 2
# 1991-2000 = 3
# 2001-2010 = 4
# 2011-2018 = 5
PERIOD_BOUNDS = {
    1: ('1970-01-01', '1980-12-31'),
    2: ('1981-01-01', '1990-12-31'),
    3: ('1991-01-01', '2000-12-31'),
    4: ('2001-01-01', '2010-12-31'),
    5: ('2011-01-01', '2018-12-31'),
}

# Convert the date column to datetime so it can be compared against date bounds.
df_prices['date'] = pd.to_datetime(df_prices['date'])


def _rows_in_period(prices, start, end):
    """Return the rows of `prices` whose 'date' lies in [start, end], both ends included."""
    # Series.between is inclusive on both sides by default, so a row dated exactly
    # on the first day of a period is kept (a strict `>` on the start would drop it).
    return prices.loc[prices['date'].between(start, end)]


# One frame per period of interest.
df_prices1 = _rows_in_period(df_prices, *PERIOD_BOUNDS[1])
df_prices2 = _rows_in_period(df_prices, *PERIOD_BOUNDS[2])
df_prices3 = _rows_in_period(df_prices, *PERIOD_BOUNDS[3])
df_prices4 = _rows_in_period(df_prices, *PERIOD_BOUNDS[4])
df_prices5 = _rows_in_period(df_prices, *PERIOD_BOUNDS[5])




In [None]:
# Print each per-period price frame in chronological order.
for period_prices in (df_prices1, df_prices2, df_prices3, df_prices4, df_prices5):
    print(period_prices)

In [None]:
# Show the tickers found in both df_stocks and df_prices.
print(common_tickers)


### Look at 1970-1980 data

In [None]:
# Hold the tickers in a Series labelled 1..N, so label i matches the 1-based
# subplot position used by the plotting cells (common_tickers[i]).
# The labels are assigned outright instead of shifted with `index += 1`, so
# re-running this cell leaves them at 1..N rather than moving them again.
common_tickers = pd.Series(list(common_tickers))
common_tickers.index = pd.RangeIndex(start=1, stop=len(common_tickers) + 1)

In [None]:
# Inspect AAPL's dates and closing prices in the first period slice.
# A cell renders only its final expression, so both columns are selected together;
# as two separate expressions the first result was computed and then discarded.
df_prices1.loc[df_prices1['ticker'] == 'AAPL', ['date', 'close']]

In [None]:
import matplotlib.pyplot as plt

# Tickers that actually have rows in the first period slice, and how many there are.
total_tickers = pd.unique(df_prices1['ticker'])
total_tickers_num = len(total_tickers)

# Closing price per ticker for the first period, one panel per ticker.
columns = 5
rows = 76
# The grid has room for columns*rows panels. Slicing caps the ticker list at that
# size and also copes with a shorter list, which the fixed-range label lookup
# (common_tickers[i]) could not do without raising KeyError.
tickers_to_plot = list(common_tickers)[:columns * rows]

fig = plt.figure(figsize=(30, 400))
for i, ticker in enumerate(tickers_to_plot, start=1):
    ax = fig.add_subplot(rows, columns, i)
    # Filter once per ticker and reuse the rows for both axes.
    ticker_rows = df_prices1.loc[df_prices1['ticker'] == ticker]
    ax.plot(ticker_rows['date'], ticker_rows['close'], label=ticker)
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend(loc='best')
fig.savefig('df_prices1.png')
    
    



In [None]:
# Closing price per ticker for the second period, one panel per ticker.
columns = 5
rows = 76
# The grid has room for columns*rows panels. Slicing caps the ticker list at that
# size and also copes with a shorter list, which the fixed-range label lookup
# (common_tickers[i]) could not do without raising KeyError.
tickers_to_plot = list(common_tickers)[:columns * rows]

fig = plt.figure(figsize=(30, 400))
for i, ticker in enumerate(tickers_to_plot, start=1):
    ax = fig.add_subplot(rows, columns, i)
    # Filter once per ticker and reuse the rows for both axes.
    ticker_rows = df_prices2.loc[df_prices2['ticker'] == ticker]
    ax.plot(ticker_rows['date'], ticker_rows['close'], label=ticker)
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend(loc='best')

In [None]:
# Closing price per ticker for the third period, one panel per ticker.
columns = 5
rows = 76
# The grid has room for columns*rows panels. Slicing caps the ticker list at that
# size and also copes with a shorter list, which the fixed-range label lookup
# (common_tickers[i]) could not do without raising KeyError.
tickers_to_plot = list(common_tickers)[:columns * rows]

fig = plt.figure(figsize=(30, 400))
for i, ticker in enumerate(tickers_to_plot, start=1):
    ax = fig.add_subplot(rows, columns, i)
    # Filter once per ticker and reuse the rows for both axes.
    ticker_rows = df_prices3.loc[df_prices3['ticker'] == ticker]
    ax.plot(ticker_rows['date'], ticker_rows['close'], label=ticker)
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend(loc='best')

In [None]:
# Closing price per ticker for the fourth period, one panel per ticker.
columns = 5
rows = 76
# The grid has room for columns*rows panels. Slicing caps the ticker list at that
# size and also copes with a shorter list, which the fixed-range label lookup
# (common_tickers[i]) could not do without raising KeyError.
tickers_to_plot = list(common_tickers)[:columns * rows]

fig = plt.figure(figsize=(30, 400))
for i, ticker in enumerate(tickers_to_plot, start=1):
    ax = fig.add_subplot(rows, columns, i)
    # Filter once per ticker and reuse the rows for both axes.
    ticker_rows = df_prices4.loc[df_prices4['ticker'] == ticker]
    ax.plot(ticker_rows['date'], ticker_rows['close'], label=ticker)
    ax.tick_params(axis='x', labelrotation=45)
    ax.legend(loc='best')