In [16]:
import pandas as pd
import glob

# Build a single cleaned short-interest table from the 2023 Excel workbooks in
# "Resources", keep only heavily-shorted names above a market-cap floor, turn the
# 'Record Date' period codes into calendar dates, and export the result to CSV.

# Collect the 2023 workbooks. glob returns paths in arbitrary order, so sort them
# to keep the row order of the combined frame (and the exported CSV) reproducible.
file_paths = sorted(glob.glob('Resources/2023*.xlsx'))

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when the pattern matches nothing.
if not file_paths:
    raise FileNotFoundError("No files matching 'Resources/2023*.xlsx' were found")

# Read each workbook into its own DataFrame
dfs = []
for file_path in file_paths:
    df = pd.read_excel(file_path, engine='openpyxl')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the 'ShortSqueeze.com Short Interest Data' column
combined_df = combined_df.rename(columns={'ShortSqueeze.com Short Interest Data': 'Company Name'})

# Columns that are not needed downstream
columns_to_drop = ['Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo', 'Shares: Float',
                   'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High', '(abs)',
                   '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2']

# Only drop the columns that are actually present, so a missing column is not an error
columns_to_drop = [col for col in columns_to_drop if col in combined_df.columns]
combined_df = combined_df.drop(columns=columns_to_drop)

# Convert both filter columns to numeric up front; unparseable values become NaN
# (errors='coerce') and therefore fail the >= comparisons below.
combined_df['Short % of Float'] = pd.to_numeric(combined_df['Short % of Float'], errors='coerce')
combined_df['Market Cap'] = pd.to_numeric(combined_df['Market Cap'], errors='coerce')

# Keep rows with Short % of Float >= 20 and Market Cap >= 300,000,000.
# .copy() makes the result an independent frame so the column assignment below
# does not write into a slice (avoids SettingWithCopyWarning).
combined_df = combined_df[
    (combined_df['Short % of Float'] >= 20) & (combined_df['Market Cap'] >= 300000000)
].copy()

# Map the period code that follows the year in 'Record Date' to a month-day string
date_mapping = {
    'JanA': '01-11', 'JanB': '01-25',
    'FebA': '02-09', 'FebB': '02-27',
    'MarA': '03-09', 'MarB': '03-24',
    'AprA': '04-12', 'AprB': '04-25',
    'MayA': '05-09', 'MayB': '05-24',
    'JunA': '06-09', 'JunB': '06-27'
}

# regex=True is required: a callable replacement is only accepted in regex mode,
# and pandas >= 2.0 defaults to regex=False (which raises ValueError here).
# A code that is not in date_mapping raises KeyError.
combined_df['Record Date'] = combined_df['Record Date'].str.replace(
    r'(\d{4})-(\w+)',
    lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}',
    regex=True,
)

# Reset the index so it is contiguous after filtering
combined_df = combined_df.reset_index(drop=True)

# Define the output file path
output_file_path = 'Resources/2023_Short-Interest-Data.csv'

# Export the DataFrame to CSV
combined_df.to_csv(output_file_path, index=False)

# Display the combined dataframe
combined_df



Unnamed: 0,Company Name,Symbol,Short % of Float,% Insider Ownership,% Institutional Ownership,Price,Market Cap,Exchange,Sector,Industry,Record Date
0,Allogene Therapeutics Inc,ALLO,41.77,29.93,65.01,7.99,1.130905e+09,NAS,Healthcare,Biotechnology,2023-01-11
1,ALX Oncology Holdings Inc. - Common Stock,ALXO,23.69,,,9.55,3.841965e+08,NAS,Healthcare,Biotechnology,2023-01-11
2,Amyris Inc,AMRS,20.18,29.73,49.1,1.62,4.779000e+08,NAS,Basic Materials,Specialty Chemicals,2023-01-11
3,Anaptysbio Inc,ANAB,20.57,0.35,,24.9,6.815130e+08,NAS,Healthcare,Biotechnology,2023-01-11
4,Arcutis Biotherapeutics Inc. - Common stock,ARQT,24.70,4.32,89.57,16.13,8.098873e+08,NAS,Healthcare,Biotechnology,2023-01-11
...,...,...,...,...,...,...,...,...,...,...,...
584,UWM Holdings Corporation Class A Common Stock,UWMC,25.65,3.99,45.59,5.55,5.722605e+08,NY,Financial Services,Mortgage Finance,2023-04-12
585,Voya Financial Inc,VOYA,24.26,0.89,,75.16,9.111647e+09,NY,Financial Services,Financial Conglomerates,2023-04-12
586,Wayfair Inc Class A,W,37.87,5.91,,35.81,2.780288e+09,NY,Consumer Cyclical,Internet Retail,2023-04-12
587,World Wrestling Entertainment Inc Class A Common,WWE,24.20,0.85,,108.2,4.897132e+09,NY,Communication Services,Entertainment,2023-04-12
