Stocks Data Preprocessing

In [None]:
#yaml to csv stock-wise
import os
import yaml
import pandas as pd
from pathlib import Path

#set-input-and-output-directories
#raw string: "\S", "\p", "\s" and "\d" are not valid escape sequences in a normal literal
data_dir = Path(r"E:\Sakthi\prasanth\projects\stocks\Scripts\data")
output_dir = Path("allstocks_csv")
output_dir.mkdir(exist_ok=True)

#dictionary-to-store-data-per-stock-symbol: {ticker: [row, row, ...]}
stock_data = {}

#loop-to-take-all-months (sorted so the files are always read in the same order)
for month_folder in sorted(data_dir.iterdir()):
    if not month_folder.is_dir():
        continue

    for yaml_file in sorted(month_folder.glob("*.yaml")):
        #explicit encoding instead of the platform default
        #NOTE(review): assumes the YAML files are UTF-8 — confirm
        with open(yaml_file, 'r', encoding='utf-8') as f:
            try:
                content = yaml.safe_load(f)
            except Exception as e:
                #best-effort: report the unreadable file and move on to the next one
                print(f"Error loading {yaml_file.name}: {e}")
                continue

        #handle-both-list-and-single-dict
        if isinstance(content, list):
            entries = content
        elif isinstance(content, dict):
            entries = [content]
        else:
            print(f"Skipped {yaml_file.name} (unexpected format)")
            continue

        for row_data in entries:
            #a list may hold non-mapping items (e.g. None or a bare string); those have no .get()
            if not isinstance(row_data, dict):
                continue

            symbol = row_data.get("Ticker")
            if not symbol:
                continue

            #rows-for-stocks
            row = {
                "date": row_data.get("date"),
                "open": row_data.get("open"),
                "close": row_data.get("close"),
                "high": row_data.get("high"),
                "low": row_data.get("low"),
                "volume": row_data.get("volume"),
            }

            stock_data.setdefault(symbol, []).append(row)

#convert-to-dataframe-save-as-.csv (one file per ticker, rows ordered by date)
for symbol, rows in stock_data.items():
    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values("date")
    df.to_csv(output_dir / f"{symbol}.csv", index=False)

print(f"CSVs saved in '{output_dir.resolve()}'")


✅ Done! CSVs saved in 'E:\Sakthi\prasanth\projects\stocks\Scripts\allstocks_csv'


In [None]:
#combine all stocks as one csv file
import pandas as pd
import os

#path
#raw string keeps the Windows backslashes literal ("\S", "\p", "\s" are not valid escapes otherwise)
#NOTE(review): the per-stock export cell writes to "allstocks_csv" but this reads "stocks_csv" — confirm the folder name
csv_folder = r"E:\Sakthi\prasanth\projects\stocks\Scripts\stocks_csv"

dfs = []

#loop through each CSV file in the folder (sorted so the combined row order is reproducible)
for filename in sorted(os.listdir(csv_folder)):
    #splitext removes only the trailing extension; str.replace would strip ".csv" anywhere in the name
    ticker, extension = os.path.splitext(filename)
    if extension != ".csv":
        continue
    df = pd.read_csv(os.path.join(csv_folder, filename))
    df["Ticker"] = ticker  # Add a column for the ticker
    dfs.append(df)

#pd.concat raises an unhelpful "No objects to concatenate" on an empty list; fail with the folder name instead
if not dfs:
    raise ValueError(f"No .csv files found in '{csv_folder}'")

#concat all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

#save as single csv file
combined_df.to_csv("all_stocks.csv", index=False)

print("All stocks combined into 'all_stocks.csv'")


✅ All stocks combined into 'all_stocks_combined.csv'


In [1]:
#read
import pandas as pd
#load-the-combined-stocks-file-and-preview-the-first-five-rows
all_stocks_path = 'E:/Sakthi/prasanth/projects/stocks/Scripts/all_stocks.csv'
df = pd.read_csv(all_stocks_path)
print(df.head(5))

         date     open    close     high      low   volume    ticker
0  2023-10-03  2418.00  2387.25  2424.90  2372.00  2019899  ADANIENT
1  2023-10-04  2402.20  2464.95  2502.75  2392.25  2857377  ADANIENT
2  2023-10-05  2477.95  2466.35  2486.50  2446.40  1132455  ADANIENT
3  2023-10-06  2466.35  2478.10  2514.95  2466.05  1510035  ADANIENT
4  2023-10-09  2440.00  2442.60  2459.70  2411.30  1408224  ADANIENT


In [None]:
#normalise-every-column-header-to-lower-case
df.columns = df.columns.map(str.lower)
print(df.head(5))

         date     open    close     high      low   volume    ticker
0  2023-10-03  2418.00  2387.25  2424.90  2372.00  2019899  ADANIENT
1  2023-10-04  2402.20  2464.95  2502.75  2392.25  2857377  ADANIENT
2  2023-10-05  2477.95  2466.35  2486.50  2446.40  1132455  ADANIENT
3  2023-10-06  2466.35  2478.10  2514.95  2466.05  1510035  ADANIENT
4  2023-10-09  2440.00  2442.60  2459.70  2411.30  1408224  ADANIENT


In [None]:
#date-only: parse the column, then keep just the calendar date of each timestamp
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates.dt.date
print(df.head(5))

         date     open    close     high      low   volume    ticker
0  2023-10-03  2418.00  2387.25  2424.90  2372.00  2019899  ADANIENT
1  2023-10-04  2402.20  2464.95  2502.75  2392.25  2857377  ADANIENT
2  2023-10-05  2477.95  2466.35  2486.50  2446.40  1132455  ADANIENT
3  2023-10-06  2466.35  2478.10  2514.95  2466.05  1510035  ADANIENT
4  2023-10-09  2440.00  2442.60  2459.70  2411.30  1408224  ADANIENT


In [5]:
#write-the-cleaned-frame-to-all_stocks.csv (relative to the working directory, without the index column)
df.to_csv(path_or_buf='all_stocks.csv', index=False)

In [14]:
#re-load-all_stocks.csv-and-preview-the-first-five-rows
#NOTE(review): without parse_dates the 'date' column comes back as plain text — confirm that is what later steps expect
stocks_csv_path = 'E:/Sakthi/prasanth/projects/stocks/Scripts/all_stocks.csv'
df = pd.read_csv(stocks_csv_path)
print(df.head(5))

         date     open    close     high      low   volume    ticker
0  2023-10-03  2418.00  2387.25  2424.90  2372.00  2019899  ADANIENT
1  2023-10-04  2402.20  2464.95  2502.75  2392.25  2857377  ADANIENT
2  2023-10-05  2477.95  2466.35  2486.50  2446.40  1132455  ADANIENT
3  2023-10-06  2466.35  2478.10  2514.95  2466.05  1510035  ADANIENT
4  2023-10-09  2440.00  2442.60  2459.70  2411.30  1408224  ADANIENT


Sector Data Preprocessing

In [6]:
import pandas as pd
#load-the-sector-sheet
#raw string so the Windows backslashes ("\S", "\p", "\s") are not parsed as (invalid) escape sequences
dfs = pd.read_csv(r'E:\Sakthi\prasanth\projects\stocks\Scripts\Sector_data - Sheet1.csv')
print(dfs.head())

             COMPANY         sector                         Symbol
0  ADANI ENTERPRISES  MISCELLANEOUS  ADANI ENTERPRISES: ADANIGREEN
1  ADANI PORTS & SEZ  MISCELLANEOUS  ADANI PORTS & SEZ: ADANIPORTS
2   APOLLO HOSPITALS  MISCELLANEOUS   APOLLO HOSPITALS: APOLLOHOSP
3       ASIAN PAINTS         PAINTS       ASIAN PAINTS: ASIANPAINT
4          AXIS BANK        BANKING            AXIS BANK: AXISBANK


In [7]:
#normalise-every-sector-column-header-to-lower-case
dfs.columns = dfs.columns.map(str.lower)
print(dfs.head(5))

             company         sector                         symbol
0  ADANI ENTERPRISES  MISCELLANEOUS  ADANI ENTERPRISES: ADANIGREEN
1  ADANI PORTS & SEZ  MISCELLANEOUS  ADANI PORTS & SEZ: ADANIPORTS
2   APOLLO HOSPITALS  MISCELLANEOUS   APOLLO HOSPITALS: APOLLOHOSP
3       ASIAN PAINTS         PAINTS       ASIAN PAINTS: ASIANPAINT
4          AXIS BANK        BANKING            AXIS BANK: AXISBANK


In [None]:
#symbol-match-with-ticker
#values look like "COMPANY: SYMBOL"; keep the piece after the colon.
#taking the last piece (instead of index 1) keeps this cell safe to re-run: once the
#prefix is gone there is no ':' left, and .str[1] would turn every symbol into NaN.
dfs['symbol'] = dfs['symbol'].str.split(':').str[-1]
print(dfs.head())

             company         sector       symbol
0  ADANI ENTERPRISES  MISCELLANEOUS   ADANIGREEN
1  ADANI PORTS & SEZ  MISCELLANEOUS   ADANIPORTS
2   APOLLO HOSPITALS  MISCELLANEOUS   APOLLOHOSP
3       ASIAN PAINTS         PAINTS   ASIANPAINT
4          AXIS BANK        BANKING     AXISBANK


In [9]:
#list-the-distinct-symbols (in order of first appearance)
print(pd.unique(dfs['symbol']))

[' ADANIGREEN' ' ADANIPORTS' ' APOLLOHOSP' ' ASIANPAINT' ' AXISBANK'
 ' BAJAJ-AUTO' ' BAJFINANCE' ' BAJAJFINSV' ' BEL' ' AIRTEL' ' BPCL'
 ' CIPLA' ' COALINDIA' ' DRREDDY' ' EICHERMOT' ' GRASIM' ' HCLTECH'
 ' HDFCBANK' ' HDFCLIFE' ' HEROMOTOCO' ' HINDALCO' ' HINDUNILVR'
 ' ICICIBANK' ' INDUSINDBK' ' INFY' ' IOC' ' ITC' ' JSWSTEEL' ' KOTAKBANK'
 ' LT' ' M&M' ' MARUTI' ' NESTLEIND' ' NTPC' ' ONGC' ' POWERGRID'
 ' RELIANCE' ' SBIN' ' SBILIFE' ' SHRIRAMFIN' ' SUNPHARMA' ' TATACONSUMER'
 ' TATAMOTORS' ' TATASTEEL' ' TCS' ' TECHM' ' TITAN' ' TRENT'
 ' ULTRACEMCO' ' WIPRO']


In [None]:
#remove-space: drop every space character from the symbols (plain text match, not a regex)
dfs['symbol'] = dfs['symbol'].str.replace(' ', '', regex=False)
print(dfs['symbol'].unique())

['ADANIGREEN' 'ADANIPORTS' 'APOLLOHOSP' 'ASIANPAINT' 'AXISBANK'
 'BAJAJ-AUTO' 'BAJFINANCE' 'BAJAJFINSV' 'BEL' 'AIRTEL' 'BPCL' 'CIPLA'
 'COALINDIA' 'DRREDDY' 'EICHERMOT' 'GRASIM' 'HCLTECH' 'HDFCBANK'
 'HDFCLIFE' 'HEROMOTOCO' 'HINDALCO' 'HINDUNILVR' 'ICICIBANK' 'INDUSINDBK'
 'INFY' 'IOC' 'ITC' 'JSWSTEEL' 'KOTAKBANK' 'LT' 'M&M' 'MARUTI' 'NESTLEIND'
 'NTPC' 'ONGC' 'POWERGRID' 'RELIANCE' 'SBIN' 'SBILIFE' 'SHRIRAMFIN'
 'SUNPHARMA' 'TATACONSUMER' 'TATAMOTORS' 'TATASTEEL' 'TCS' 'TECHM' 'TITAN'
 'TRENT' 'ULTRACEMCO' 'WIPRO']


In [None]:
#mismatched-symbol-and-ticker
#flag which tickers appear among the sector symbols, and which symbols appear among the tickers
ticker_known = df['ticker'].isin(dfs['symbol'])
symbol_known = dfs['symbol'].isin(df['ticker'])

#keep the distinct values that have no counterpart on the other side
mismatched_ticker = df.loc[~ticker_known, 'ticker'].unique()
mismatched_symbol = dfs.loc[~symbol_known, 'symbol'].unique()

print(f"Tickers in all_stocks.csv not found in updated_sector.csv: {mismatched_ticker}")
print(f"Symbols in updated_sector.csv not found in all_stocks.csv : {mismatched_symbol}")

Tickers in all_stocks.csv not found in updated_sector.csv: ['ADANIENT' 'BHARTIARTL' 'BRITANNIA' 'TATACONSUM']
Tickers in updated_sector.csv not found in all_stocks.csv : ['ADANIGREEN' 'AIRTEL' 'IOC' 'TATACONSUMER']


In [None]:
#change-symbols-to-match-the-ticker
#sector-sheet symbol -> ticker used in the stock data
symbol_fixes = {
    'ADANIGREEN': 'ADANIENT',
    'AIRTEL': 'BHARTIARTL',
    'TATACONSUMER': 'TATACONSUM',
}
dfs['symbol'] = dfs['symbol'].replace(symbol_fixes)

In [18]:
#show-the-distinct-symbols-currently-in-the-sector-frame
current_symbols = dfs['symbol'].unique()
print(current_symbols)

['ADANIENT' 'ADANIPORTS' 'APOLLOHOSP' 'ASIANPAINT' 'AXISBANK' 'BAJAJ-AUTO'
 'BAJFINANCE' 'BAJAJFINSV' 'BEL' 'BHARTIARTL' 'BPCL' 'CIPLA' 'COALINDIA'
 'DRREDDY' 'EICHERMOT' 'GRASIM' 'HCLTECH' 'HDFCBANK' 'HDFCLIFE'
 'HEROMOTOCO' 'HINDALCO' 'HINDUNILVR' 'ICICIBANK' 'INDUSINDBK' 'INFY'
 'IOC' 'ITC' 'JSWSTEEL' 'KOTAKBANK' 'LT' 'M&M' 'MARUTI' 'NESTLEIND' 'NTPC'
 'ONGC' 'POWERGRID' 'RELIANCE' 'SBIN' 'SBILIFE' 'SHRIRAMFIN' 'SUNPHARMA'
 'TATACONSUM' 'TATAMOTORS' 'TATASTEEL' 'TCS' 'TECHM' 'TITAN' 'TRENT'
 'ULTRACEMCO' 'WIPRO']


In [None]:
#add-new-row-for-britannia
new_row = {'company': 'BRITANNIA', 'sector': 'FMCG', 'symbol': 'BRITANNIA'}

#single-row-dataframe-holding-the-new-entry
new_row_df = pd.DataFrame([new_row])

#a row counts as already present when both its company and its symbol match
same_company = dfs['company'].eq(new_row['company'])
same_symbol = dfs['symbol'].eq(new_row['symbol'])
already_present = (same_company & same_symbol).any()

if already_present:
    print("Row already exists.")
else:
    #append-the-new-row-with-pd.concat-and-renumber-the-index
    dfs = pd.concat([dfs, new_row_df], ignore_index=True)

print(dfs.tail())
print(dfs.loc[dfs['company'].eq('BRITANNIA')])

             company     sector      symbol
46             TITAN  RETAILING       TITAN
47             TRENT  RETAILING       TRENT
48  ULTRATECH CEMENT     CEMENT  ULTRACEMCO
49             WIPRO   SOFTWARE       WIPRO
50         BRITANNIA       FMCG   BRITANNIA
      company sector     symbol
50  BRITANNIA   FMCG  BRITANNIA


In [None]:
#to-delete-company: keep every row whose company is not 'IOC'
dfs = dfs.loc[dfs['company'].ne('IOC')]

In [22]:
#show-the-distinct-symbols-left-in-the-sector-frame
symbol_column = dfs['symbol']
print(symbol_column.unique())

['ADANIENT' 'ADANIPORTS' 'APOLLOHOSP' 'ASIANPAINT' 'AXISBANK' 'BAJAJ-AUTO'
 'BAJFINANCE' 'BAJAJFINSV' 'BEL' 'BHARTIARTL' 'BPCL' 'CIPLA' 'COALINDIA'
 'DRREDDY' 'EICHERMOT' 'GRASIM' 'HCLTECH' 'HDFCBANK' 'HDFCLIFE'
 'HEROMOTOCO' 'HINDALCO' 'HINDUNILVR' 'ICICIBANK' 'INDUSINDBK' 'INFY'
 'ITC' 'JSWSTEEL' 'KOTAKBANK' 'LT' 'M&M' 'MARUTI' 'NESTLEIND' 'NTPC'
 'ONGC' 'POWERGRID' 'RELIANCE' 'SBIN' 'SBILIFE' 'SHRIRAMFIN' 'SUNPHARMA'
 'TATACONSUM' 'TATAMOTORS' 'TATASTEEL' 'TCS' 'TECHM' 'TITAN' 'TRENT'
 'ULTRACEMCO' 'WIPRO' 'BRITANNIA']


In [None]:
#write-the-sector-frame-to-updated_sector.csv (relative to the working directory, without the index column)
dfs.to_csv(path_or_buf='updated_sector.csv', index=False)

Store in TiDB database for Streamlit analytics

In [None]:
#tidb-database-connection
import pymysql
from sqlalchemy import create_engine

#_tidb_database
import os
from getpass import getpass
from urllib.parse import quote

host = "gateway01.eu-central-1.prod.aws.tidbcloud.com"
port = 4000
user = "42aq8sKC2dkkKnC.root"
#never keep the real password in the notebook: read it from the TIDB_PASSWORD
#environment variable, or prompt for it (hidden input) when that is not set
password = os.environ.get("TIDB_PASSWORD") or getpass("TiDB password: ")
database = "stocks"
#NOTE(review): this CA bundle path is a Linux location, while the data paths in this notebook are Windows ones — confirm it exists where this runs
ssl_args = "?ssl_ca=/etc/ssl/certs/ca-certificates.crt"

#percent-encode the credentials so characters such as '@', ':' or '/' in them cannot break the URL
engine = create_engine(
    f"mysql+pymysql://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{database}{ssl_args}"
)

In [None]:
#to-upload
#already-all_stocks-table-created-in-tidb-database
#NOTE(review): if_exists='replace' drops the existing table before inserting, so a table prepared beforehand is discarded — confirm this is intended
target_table = 'all_stocks'
df.to_sql(name=target_table, con=engine, if_exists='replace', index=False)

print("all_stocks.csv uploaded successfully to the 'all_stocks' table in the 'stocks' database.")