In [18]:
import pandas as pd
import concurrent.futures
import requests
import yfinance as yf
import numpy as np

In [38]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [7]:
def get_sp500_tickers():
    table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    sp500 = table[0]
    tickers = sp500['Symbol'].tolist()
    return tickers

sp500_tickers = get_sp500_tickers()

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADM',
 'ADBE',
 'ADP',
 'AES',
 'AFL',
 'A',
 'ABNB',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AMD',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BBWI',
 'BAX',
 'BDX',
 'WRB',
 'BRK.B',
 'BBY',
 'BIO',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF.B',
 'BG',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CTLT',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'COR',
 'CNC',
 'CNP',
 'CDAY',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',


In [30]:
all_data = pd.DataFrame()
ticker_symbols = sp500_tickers

def download_stock_data(symbol):
    try:
        data = yf.Ticker(symbol).history(period="5y")
        data['Ticker'] = symbol  # Добавьте столбец с символом акции
        return data
    except Exception as e:
        print(f"Failed download: [{symbol}]: {e}")
        return None

with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(download_stock_data, symbol) for symbol in ticker_symbols]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(ticker_symbols), desc="Downloading"):
        data = future.result()
        if data is not None:
            all_data = pd.concat([all_data, data], axis=0)

Downloading:  14%|█▍        | 70/503 [00:04<00:30, 14.02it/s]BRK.B: No data found, symbol may be delisted
  all_data = pd.concat([all_data, data], axis=0)
BF.B: No price data found, symbol may be delisted (period=5y)
  all_data = pd.concat([all_data, data], axis=0)
Downloading: 100%|██████████| 503/503 [00:29<00:00, 17.09it/s]


In [32]:
all_data.to_csv("yfinance_data.csv")

In [33]:
all_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-12-04 00:00:00-05:00,159.400442,161.229428,157.130944,157.245865,3108900.0,0.0,0.0,ACN,
2018-12-06 00:00:00-05:00,153.894273,154.880589,149.996875,152.496185,6012000.0,0.0,0.0,ACN,
2018-12-07 00:00:00-05:00,152.141883,153.961298,149.613846,150.772522,3090000.0,0.0,0.0,ACN,
2018-12-10 00:00:00-05:00,150.360787,151.921649,147.861477,151.031097,2280800.0,0.0,0.0,ACN,
2018-12-11 00:00:00-05:00,153.214387,154.009188,150.734232,151.586487,1786300.0,0.0,0.0,ACN,


In [36]:
all_data.columns = all_data.columns.str.lower()
all_data['diff'] = all_data['close'] - all_data['open']