In [258]:
import yfinance as yf
import pandas as pd
import numpy as np
import time
import requests
from bs4 import BeautifulSoup

# Stock Data

In [259]:
company_List = ['msft','aapl','nvda','goog','avgo','asml','orcl','amd','crm','qcom','sap','adbe','csco','amat','acn','txn','intu','ibm','now','mu']
date = '2020-01-01'
#the reason for this date2 is because share data doesnt update daily and for when i merge backwards later need that date before my stock data
date2 = '2019-10-01'

In [260]:
def pull_Ticker_Data(company):
    ticker = yf.Ticker(f'{company}')
    hist = ticker.history(start=date)
    hist = hist.reset_index()
    hist['symbol'] = company
    return ticker,hist

In [261]:
def total_Shares(ticker):
    shares = ticker.get_shares_full(start=date2)
    shares = shares[~shares.index.duplicated(keep='first')]
    shares = shares.reset_index()
    shares = shares.rename(columns={0: 'shares','index': 'Date'})
    return shares

In [262]:
def combineHS(hist,shares):
    combined = pd.merge_asof(hist, shares, on='Date', direction='backward')
    return combined

In [263]:
tick,hist = pull_Ticker_Data('aapl')

In [264]:
#pull and merge data together
stock_data = pd.DataFrame()
for company in company_List:
    ticker,hist = pull_Ticker_Data(company)
    shares = total_Shares(ticker)
    newData = combineHS(hist,shares)
    stock_data = pd.concat([stock_data, newData], ignore_index=True)
    time.sleep(0.25)

In [265]:
stock_data.isna().sum()

Date            0
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
symbol          0
shares          0
dtype: int64

In [266]:
stock_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,symbol,shares
0,2020-01-02 00:00:00-05:00,152.449037,154.321282,152.016982,154.215668,22622100,0.0,0.0,msft,7628810240
1,2020-01-03 00:00:00-05:00,152.007399,153.572397,151.757756,152.295425,21116200,0.0,0.0,msft,7628810240
2,2020-01-06 00:00:00-05:00,150.816796,152.756257,150.269516,152.689041,20813700,0.0,0.0,msft,7628810240
3,2020-01-07 00:00:00-05:00,152.967486,153.303522,151.047232,151.296860,21634100,0.0,0.0,msft,7628810240
4,2020-01-08 00:00:00-05:00,152.593036,154.388484,151.652115,153.706787,27746500,0.0,0.0,msft,7725000192
...,...,...,...,...,...,...,...,...,...,...
22615,2024-06-25 00:00:00-04:00,140.070007,141.130005,137.750000,141.119995,32370000,0.0,0.0,mu,1107369984
22616,2024-06-26 00:00:00-04:00,143.039993,144.070007,139.539993,142.360001,55490300,0.0,0.0,mu,1107369984
22617,2024-06-27 00:00:00-04:00,135.789993,137.389999,131.080002,132.229996,68172700,0.0,0.0,mu,1107369984
22618,2024-06-28 00:00:00-04:00,132.820007,135.419998,130.750000,131.529999,38140600,0.0,0.0,mu,1108839936


In [267]:
stock_data.isna().sum(),stock_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22620 entries, 0 to 22619
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype                           
---  ------        --------------  -----                           
 0   Date          22620 non-null  datetime64[ns, America/New_York]
 1   Open          22620 non-null  float64                         
 2   High          22620 non-null  float64                         
 3   Low           22620 non-null  float64                         
 4   Close         22620 non-null  float64                         
 5   Volume        22620 non-null  int64                           
 6   Dividends     22620 non-null  float64                         
 7   Stock Splits  22620 non-null  float64                         
 8   symbol        22620 non-null  object                          
 9   shares        22620 non-null  int64                           
dtypes: datetime64[ns, America/New_York](1), float64(6), int64(2), object(1

(Date            0
 Open            0
 High            0
 Low             0
 Close           0
 Volume          0
 Dividends       0
 Stock Splits    0
 symbol          0
 shares          0
 dtype: int64,
 None)

# Employee Count Data

In [268]:
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
headers = {"User-Agent": user_agent}
def create_R(url):
    try:
        r = requests.get(url=url, headers=headers)
        r.raise_for_status()
        return r
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")


In [269]:
def pull_Employee_Count(r,symbol):
    data = []
    soup = BeautifulSoup(r.text, 'html.parser')
    soup = soup.find('tbody')
    rows = soup.find_all('tr')
    for row in rows:
        cells = row.find_all('td')
        year = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        data.append([symbol,year,value])
    return data


In [270]:
#get company full names for employee count url link 
full_Names = []
for company in company_List:
    tick,_ = pull_Ticker_Data(company)
    name = tick.info['shortName'].split()[0].replace(',','').strip().lower()
    full_Names.append(name)

In [271]:
#add fix to get website long names easier for url macrotrends.net charts. quick name fixes for now
full_Names[3] = 'google'
full_Names[5] = 'asml-holding'
full_Names[7] = 'amd'
full_Names[10] = 'sap-se'
full_Names[13] = 'applied-materials'
full_Names[15] = 'texas-instruments'
full_Names[-3] = 'ibm'
full_Names[-1] = 'micron-technology'

In [272]:
def create_df(data):
    df = pd.DataFrame(data,columns=['symbol','year','value'])
    return df

In [274]:
#pull and merge data
employee_Count = pd.DataFrame()
for i  in range(len(company_List)):
    url = f"https://www.macrotrends.net/stocks/charts/{company_List[i]}/{full_Names[i]}/number-of-employees"
    r = create_R(url)
    newData = pull_Employee_Count(r,company_List[i])
    newData = create_df(newData)
    employee_Count = pd.concat([employee_Count, newData], ignore_index=True)
    time.sleep(1)

An error occurred: 429 Client Error: Too Many Requests for url: https://www.macrotrends.net/stocks/charts/msft/microsoft/number-of-employees


AttributeError: 'NoneType' object has no attribute 'text'

In [None]:
employee_Count

Unnamed: 0,symbol,year,value
0,msft,2023,221000
1,msft,2022,221000
2,msft,2021,181000
3,msft,2020,163000
4,msft,2019,144000
5,msft,2018,131000
6,msft,2017,124000
7,msft,2016,114000
8,msft,2015,118000
9,msft,2014,128000


# Export Data