## 1. Short Data

In [9]:
import pandas as pd
import glob

# Get a list of all xlsx files in the "Resources" folder
file_paths = glob.glob('Resources/Short_Data/2023*.xlsx')

# Initialize an empty list to store individual DataFrames
dfs = []

# Iterate over each file path and read the Excel file into a DataFrame
for file_path in file_paths:
    df = pd.read_excel(file_path, engine='openpyxl')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Rename the 'ShortSqueeze.com Short Interest Data' column
combined_df.rename(columns={'ShortSqueeze.com Short Interest Data': 'Company Name'}, inplace=True)

# Drop unnecessary columns
columns_to_drop = ['Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo', 'Shares: Float',
                   'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High', '(abs)',
                   '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2', '% Insider Ownership', '% Institutional Ownership']

# Check if the columns exist in the dataframe before dropping them
columns_to_drop = [col for col in columns_to_drop if col in combined_df.columns]

combined_df.drop(columns_to_drop, axis=1, inplace=True)

# Convert 'Short % of Float' column to numeric
combined_df['Short % of Float'] = pd.to_numeric(combined_df['Short % of Float'], errors='coerce')

# Filter by Short % of Float >= 17
combined_df = combined_df[combined_df['Short % of Float'] >= 17]

# Convert 'Market Cap' column to numeric
combined_df['Market Cap'] = pd.to_numeric(combined_df['Market Cap'], errors='coerce')

# Drop columns where Market Cap is less than 300,000,000
combined_df = combined_df[combined_df['Market Cap'] >= 300000000]

# Replace 'Record Date' values
date_mapping = {
    'JanA': '01-11', 'JanB': '01-25',
    'FebA': '02-09', 'FebB': '02-27',
    'MarA': '03-09', 'MarB': '03-24',
    'AprA': '04-12', 'AprB': '04-25',
    'MayA': '05-09', 'MayB': '05-24',
    'JunA': '06-09', 'JunB': '06-27'
}

combined_df['Record Date'] = combined_df['Record Date'].str.replace(r'(\d{4})-(\w+)', lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}')

# Reset the index
combined_df.reset_index(drop=True, inplace=True)

# Sort by 'Record Date' in ascending order
combined_df.sort_values('Record Date', inplace=True)

# Display the combined dataframe
combined_df



Unnamed: 0,Company Name,Symbol,Short % of Float,Price,Market Cap,Exchange,Sector,Industry,Record Date
0,Allogene Therapeutics Inc,ALLO,41.77,7.99,1.130905e+09,NAS,Healthcare,Biotechnology,2023-01-11
56,Sonic Automotive Inc,SAH,30.26,49.6,1.466672e+09,NY,Consumer Cyclical,Auto & Truck Dealerships,2023-01-11
55,Safehold Inc,SAFE,18.80,33.67,1.793264e+09,NY,Real Estate,REIT - Diversified,2023-01-11
54,Root Inc. - common stock,ROOT,21.17,5.26,4.570414e+08,NAS,Financial Services,Insurance - Property & Casualty,2023-01-11
53,Relay Therapeutics Inc. - Common Stock,RLAY,19.98,20.42,1.846581e+09,NAS,Healthcare,Biotechnology,2023-01-11
...,...,...,...,...,...,...,...,...,...
586,Ralph Lauren Corporation,RL,17.44,121.5,5.862375e+09,NY,Consumer Cyclical,Apparel Manufacturing,2023-06-09
587,Relay Therapeutics Inc. - Common Stock,RLAY,25.44,12.57,1.136705e+09,NAS,Healthcare,Biotechnology,2023-06-09
588,Root Inc. - common stock,ROOT,32.25,9.98,8.671622e+08,NAS,Financial Services,Insurance - Property & Casualty,2023-06-09
579,Childrens Place Inc (the,PLCE,30.44,22.11,3.243537e+08,NAS,Consumer Cyclical,Apparel Retail,2023-06-09


## 2. Insider Trading Data

In [11]:
from pathlib import Path
insider_df = pd.read_csv(Path("Resources/InsiderTrading.csv"))
insider_df.head()

Unnamed: 0,Insider,Symbol,Total Amount,Share Price,Num. of Shares,Relation,Date,Num. of Insiders,Transaction
0,Ra,IMRA,"$2,516,055.27",$4.00,629297,Large Shareholder,2023-01-03,1,Purchase
1,Mubadala,RXRX,"$1,063,010.74",$7.50,141646,Large Shareholder,2023-01-03,1,Purchase
2,Hightower,HPK,"$2,893,858.00",$22.00,131539,"Large Shareholder, Officer, Director",2023-01-04,1,Purchase
3,"Lord,",LFHAX,"$5,000,000.00",$10.00,500000,Other,2023-01-04,1,Purchase
4,Austin,CRWD,"$1,902,800.00",$95.14,20000,,2023-01-05,1,Purchase


In [12]:
common_symbols = pd.merge(combined_df, insider_df, on='Symbol')['Symbol'].drop_duplicates()
print(common_symbols)

0      RILY
11     SAVA
22     ZUMZ
30     LAZR
63     FATE
73      KSS
82     MYOV
83     ASAN
97     DISH
142     EVA
147    RCUS
150     HPK
Name: Symbol, dtype: object
