## Importing libraries and dependencies

In [1]:
# Ignoring Future Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import datetime as dt
import os

import pandas as pd
import pandas_datareader as web
# yfinance is where we will pull our data from
import yfinance as yf
from yahoo_fin import stock_info as si

%matplotlib inline
import matplotlib.pyplot as plt 
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and newer
# releases no longer accept the bare "seaborn" name, so use whichever one exists.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

## Pulling the data

In [2]:
# Load stocks_spy_data.csv: the first two rows form the (field, ticker) column
# header and the first column is parsed as the date index
stocks = pd.read_csv(
    "stocks_spy_data.csv",
    header=[0, 1],
    index_col=[0],
    parse_dates=[0],
)

In [3]:
# Preview the first five rows of the raw frame
stocks.head(n=5)

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010-01-04,20.434929,4.496877,38.432693,6.544687,,22.101801,8.74,19.700306,33.235332,37.09,...,4741400.0,2670400.0,27809100.0,1051400.0,,2962274.0,805872.0,168800.0,3974600.0,
2010-01-05,20.212959,5.005957,38.204258,6.556002,,21.944105,8.53,19.541143,33.440731,37.700001,...,5644300.0,4321400.0,30174700.0,763400.0,,3298757.0,1769643.0,168800.0,5605500.0,
2010-01-06,20.141132,4.798553,38.537392,6.451722,,21.736618,8.4,19.649664,33.79623,37.619999,...,2738800.0,2164500.0,35044700.0,1595100.0,,4178981.0,1315619.0,385300.0,12615200.0,
2010-01-07,20.115025,4.939965,38.527885,6.439793,,21.388027,8.4,19.812443,33.764637,36.889999,...,2388500.0,3041700.0,27192100.0,1096100.0,,2452472.0,1734005.0,183600.0,24716800.0,
2010-01-08,20.108498,4.845692,38.680153,6.482608,,21.62042,8.23,19.913734,33.630333,36.689999,...,1539800.0,1599100.0,24891800.0,803600.0,,3772392.0,2213985.0,266500.0,6903000.0,


## Data Analysis

In [4]:
# Keep only the columns that have at least 3085 non-missing daily values,
# which removes the tickers whose history starts after 2010
stocks = stocks.dropna(axis="columns", thresh=3085)

In [5]:
# Show the filtered frame; the rendered table is truncated to its first and last rows
stocks

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABC,ABMD,ABT,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XRAY,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2010-01-04,20.434929,4.496877,38.432693,6.544687,22.101801,8.740000,19.700306,33.235332,37.090000,23.390224,...,348017.0,1832400.0,4741400.0,2670400.0,27809100.0,1051400.0,2962274.0,805872.0,168800.0,3974600.0
2010-01-05,20.212959,5.005957,38.204258,6.556002,21.944105,8.530000,19.541143,33.440731,37.700001,23.353294,...,339523.0,1724500.0,5644300.0,4321400.0,30174700.0,763400.0,3298757.0,1769643.0,168800.0,5605500.0
2010-01-06,20.141132,4.798553,38.537392,6.451722,21.736618,8.400000,19.649664,33.796230,37.619999,23.308983,...,585049.0,2254400.0,2738800.0,2164500.0,35044700.0,1595100.0,4178981.0,1315619.0,385300.0,12615200.0
2010-01-07,20.115025,4.939965,38.527885,6.439793,21.388027,8.400000,19.812443,33.764637,36.889999,23.124338,...,328689.0,1420700.0,2388500.0,3041700.0,27192100.0,1096100.0,2452472.0,1734005.0,183600.0,24716800.0
2010-01-08,20.108498,4.845692,38.680153,6.482608,21.620420,8.230000,19.913734,33.630333,36.689999,23.257278,...,279388.0,1310000.0,1539800.0,1599100.0,24891800.0,803600.0,3772392.0,2213985.0,266500.0,6903000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-04-04,134.809998,18.230000,211.919998,178.440002,154.750000,325.700012,118.408058,343.447357,468.809998,164.500000,...,474900.0,3722900.0,2171400.0,2082300.0,20869400.0,1036400.0,1980400.0,1349700.0,492300.0,1220400.0
2022-04-05,133.259995,17.840000,211.729996,175.059998,155.759995,322.660004,119.005676,341.802246,458.579987,159.979996,...,922700.0,2985100.0,2807100.0,2979400.0,21753100.0,1410600.0,2010900.0,2187100.0,456700.0,1384200.0
2022-04-06,132.710007,17.379999,212.830002,171.830002,158.139999,313.760010,119.463852,339.249847,444.329987,160.279999,...,782700.0,3245900.0,2926500.0,3350600.0,24768200.0,1706000.0,1683100.0,2057700.0,442300.0,1568200.0
2022-04-07,135.929993,16.950001,217.580002,172.139999,162.919998,317.519989,122.880234,340.755341,452.720001,161.250000,...,542000.0,3800800.0,3437600.0,3228700.0,26122000.0,1240400.0,2008900.0,1800200.0,464500.0,1093500.0


In [6]:
# Count the missing values in every (field, ticker) column
stocks.isnull().sum()

Adj Close  A       4
           AAL     4
           AAP     4
           AAPL    4
           ABC     4
                  ..
Volume     XRAY    4
           YUM     4
           ZBH     4
           ZBRA    4
           ZION    4
Length: 2652, dtype: int64

In [7]:
# Remove every date on which at least one column has no value
# (most probably trading did not happen on those days)
stocks = stocks.dropna(axis="index", how="any")

In [8]:
# Re-count the missing values per column to confirm that none remain
stocks.isnull().sum()

Adj Close  A       0
           AAL     0
           AAP     0
           AAPL    0
           ABC     0
                  ..
Volume     XRAY    0
           YUM     0
           ZBH     0
           ZBRA    0
           ZION    0
Length: 2652, dtype: int64

In [9]:
# Persist the cleaned frame (remaining tickers, complete days only) as a CSV file
updated_stocks_csv = "stocks_spy_updated.csv"
stocks.to_csv(updated_stocks_csv)

In [10]:
# Collect the tickers that survived the thresh = 3085 filter
ticker_symbols = stocks['Adj Close'].columns.tolist()
# Wrap them in a single-column frame (column label 0) and save it without the index
tickers = pd.DataFrame(ticker_symbols)
tickers.to_csv("tickers_updated.csv", index=None)

In [11]:
# Plain list of ticker symbols for use in the for loops below.
# It is rebuilt straight from the column index so that this cell can be re-run
# safely: the previous version indexed `tickers[0].values`, which only works
# while `tickers` is still the DataFrame written to tickers_updated.csv.
# The former `ticker = [item for sublist in tickers for item in sublist]` line
# was removed: it iterated over each ticker *string*, producing a list of single
# characters that was never used (`ticker` is rebound by the loops below).
tickers = stocks['Adj Close'].columns.tolist()

In [12]:
# Split the combined frame into one date-by-ticker table per price field and
# save every table to its own CSV file
field_tables = {}
for field_name, csv_name in [
    ('Adj Close', "stocks_adj_close.csv"),
    ('Close', "stocks_close.csv"),
    ('Volume', "stocks_volume.csv"),
    ('High', "stocks_high.csv"),
    ('Low', "stocks_low.csv"),
    ('Open', "stocks_open.csv"),
]:
    field_tables[field_name] = stocks[field_name][:]
    field_tables[field_name].to_csv(csv_name)

# Expose each table under its own name for the cells that follow
stocks_adj_close = field_tables['Adj Close']
stocks_close = field_tables['Close']
stocks_volume = field_tables['Volume']
stocks_high = field_tables['High']
stocks_low = field_tables['Low']
stocks_open = field_tables['Open']

In [13]:
# Column names used for the per-ticker files, in the order they follow 'Adj Close'
columns = [
    'Close',
    'High',
    'Low',
    'Open',
    'Volume',
    'Percent Change',
]

In [14]:
# Creating csv files for individual stocks with first column to be Adj Close.
# to_csv does not create missing folders, so make sure the output directory
# exists before the first write (no-op when it is already there).
os.makedirs("stock_data", exist_ok=True)
for ticker in tickers:
    df = stocks_adj_close[ticker]
    df.to_csv(f'stock_data/{ticker}.csv')

In [15]:
# Merging all the columns for individual tickers.
# Each per-ticker file starts with only the adjusted close; the remaining price
# fields are merged in on the 'Date' index and a daily percent change is derived.
price_fields = columns[:-1]  # every entry of `columns` except the derived 'Percent Change'
for ticker in tickers:
    df = pd.read_csv(f"stock_data/{ticker}.csv", index_col=[0], parse_dates=[0])
    df = df.rename(columns={ticker: "Adj Close"})
    for field in price_fields:
        # stocks[field][ticker] is a Series named after the ticker. Merging several
        # Series that share one name forces pandas to add '_x'/'_y' suffixes, which
        # collide on the fourth merge (a FutureWarning on older pandas, a MergeError
        # on pandas 2.x). Naming the Series after its field keeps the columns unique.
        field_series = stocks[field][ticker].rename(field)
        df = pd.merge(df, field_series, left_on='Date', right_on='Date', how='right')
    # Day-over-day relative change of the adjusted close (first row is NaN)
    df['Percent Change'] = df['Adj Close'].pct_change()
    df.to_csv(f'stock_data/{ticker}.csv')

In [16]:
# Rename the columns at positions 1-5 of every per-ticker file to the first
# five names in `columns` (the last entry of `columns` is not applied here)
for ticker in tickers:
    csv_path = f"stock_data/{ticker}.csv"
    df = pd.read_csv(csv_path, index_col=[0], parse_dates=[0])
    for position, new_name in enumerate(columns[:-1], start=1):
        df = df.rename(columns={df.columns[position]: new_name})
    df.to_csv(csv_path)

In [18]:
# The stocks trade at very different price levels, so rebase them before
# comparing price fluctuations: divide every row by the first row and
# multiply by 100, so each ticker starts at 100 and later values read as a
# percentage of its starting price
first_row = stocks_adj_close.iloc[0]
norm_adj_close = stocks_adj_close.div(first_row, axis="columns").mul(100)
norm_adj_close.head()

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABC,ABMD,ABT,ACN,ADBE,ADI,...,WTW,WY,WYNN,XEL,XOM,XRAY,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-05,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2010-01-06,99.644649,95.856863,100.871981,98.409389,99.054475,98.475966,100.555344,101.063073,99.787794,99.810256,...,101.609874,98.894275,98.688316,100.19204,100.864298,100.658856,99.285102,99.967677,99.231303,108.695638
2010-01-07,99.515485,98.681724,100.847098,98.227433,97.465935,98.475966,101.388349,100.968597,97.851455,99.019598,...,101.31032,98.45202,100.79589,99.759997,100.547353,101.976474,99.25651,102.260978,96.750523,120.869532
2010-01-08,99.483195,96.798505,101.245659,98.880502,98.524958,96.482999,101.9067,100.566979,97.320949,99.588855,...,101.198014,97.545313,100.073696,99.807988,100.144003,101.976474,99.285102,100.113036,96.436057,118.913043
2010-01-11,99.547776,94.915249,100.249196,98.008209,99.621792,94.724504,102.425051,100.472469,96.047741,99.019598,...,102.208892,98.186655,99.749443,100.768138,101.267622,103.294186,101.000834,102.32554,96.750523,119.637663


In [19]:
# Add the normalized adjusted close as a new column in every per-ticker file
for ticker in tickers:
    csv_path = f"stock_data/{ticker}.csv"
    df = pd.read_csv(csv_path, index_col=[0], parse_dates=[0])
    df = df.assign(**{'Normalized Adj Close': norm_adj_close[ticker]})
    df.to_csv(csv_path)