# Pedro - Short Squeeze Predictor
---

### 1. Libraries Import

In [31]:
import numpy as np
import pandas as pd
import yfinance as yf
from pathlib import Path
import pandas_market_calendars as mcal

### 2. Data Preparation Constants

In [32]:
# --- Input files -----------------------------------------------------------
# CSV inputs read by load_and_preprocess_data().
SHORT_FLOAT_FILEPATH = "Resources/ShortFloat.csv"
INSIDER_TRADING_FILEPATH = "Resources/InsiderTrading.csv"

# --- Row filters -----------------------------------------------------------
# Minimum value of the 'Short % of Float' column for a short-interest row to be kept.
SHORT_FLOAT_THRESHOLD = 17
# Minimum value of the 'Market Cap' column for a short-interest row to be kept.
MARKET_CAP_THRESHOLD = 300_000_000
# Minimum value of the 'Total Amount' column for an insider trade to be kept.
INSIDER_AMOUNT_THRESHOLD = 500_000

# --- Price sampling --------------------------------------------------------
# Trading-day positions (1 = first session in the window) at which
# feature_engineering() samples a close price.
DESIRED_DAYS = [1, 2, 5, 7, 15, 30]

### 3. Data Loading and Preprocessing Function

In [33]:
def load_and_preprocess_data(short_float_filepath, insider_trading_filepath):
    """Load the short-interest and insider-trading CSVs and apply the row filters.

    Parameters
    ----------
    short_float_filepath : str or path-like
        CSV with the short-interest data. Must contain the columns
        'Short % of Float' and 'Market Cap'.
    insider_trading_filepath : str or path-like
        CSV with the insider trades. Must contain the columns
        'Total Amount', 'Share Price' and 'Date'.

    Returns
    -------
    (short_df, insider_df) : tuple of pandas.DataFrame
        short_df   -- rows with 'Short % of Float' >= SHORT_FLOAT_THRESHOLD and
                      'Market Cap' >= MARKET_CAP_THRESHOLD.
        insider_df -- rows with 'Total Amount' >= INSIDER_AMOUNT_THRESHOLD, with
                      'Total Amount' / 'Share Price' as floats and 'Date' as datetime.
        Both frames keep their original row index and are independent copies.

    Raises
    ------
    ValueError
        If 'Total Amount' or 'Share Price' holds a value that is still not a
        number after '$' and ',' are stripped.
    """
    short_df = pd.read_csv(short_float_filepath).rename(
        columns={'ShortSqueeze.com Short Interest Data': 'Company Name'}
    )

    # Columns that are not used downstream; only those actually present are dropped.
    columns_to_drop = [
        'Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo',
        'Shares: Float', 'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High',
        '(abs)', '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2',
        '% Institutional Ownership'
    ]
    short_df = short_df.drop(columns=[col for col in columns_to_drop if col in short_df.columns])

    # Unparseable cells become NaN, and NaN fails both comparisons below,
    # so such rows are filtered out.
    short_df['Short % of Float'] = pd.to_numeric(short_df['Short % of Float'], errors='coerce')
    short_df['Market Cap'] = pd.to_numeric(short_df['Market Cap'], errors='coerce')
    keep_short = (
        (short_df['Short % of Float'] >= SHORT_FLOAT_THRESHOLD)
        & (short_df['Market Cap'] >= MARKET_CAP_THRESHOLD)
    )
    # .copy() hands back an independent frame, so later column assignments
    # do not raise SettingWithCopyWarning.
    short_df = short_df[keep_short].copy()

    insider_df = pd.read_csv(insider_trading_filepath)
    # Raw string for the regex: '\$' in a normal string literal is an invalid
    # escape sequence.
    currency_noise = {r'\$': '', ',': ''}
    for money_col in ('Total Amount', 'Share Price'):
        insider_df[money_col] = insider_df[money_col].replace(currency_noise, regex=True).astype(float)
    insider_df['Date'] = pd.to_datetime(insider_df['Date'])
    insider_df = insider_df[insider_df['Total Amount'] >= INSIDER_AMOUNT_THRESHOLD].copy()

    return short_df, insider_df

In [34]:
# Read both CSV inputs and apply the threshold filters defined in the constants cell.
short_df, insider_df = load_and_preprocess_data(
    short_float_filepath=SHORT_FLOAT_FILEPATH,
    insider_trading_filepath=INSIDER_TRADING_FILEPATH,
)

  


### 4. Feature Engineering Function

In [35]:
def feature_engineering(short_df, insider_df):
    """Build the labelled short-squeeze dataset from short-interest and insider data.

    Steps:
      1. Convert the 'Record Date' labels (e.g. '2022-JanA') to real dates using
         the fixed label -> 'MM-DD' table below.
      2. Inner-join short interest with insider trades on 'Symbol' and keep, for
         each (Symbol, trade Date), the short-interest record dated closest
         before the trade, provided it is 0-30 days older than the trade.
      3. For every remaining row download the close price (yfinance) on the N-th
         NYSE session of the window starting at the trade date, for each N in
         DESIRED_DAYS.
      4. Compute percentage returns relative to 'Close Price Day 1' and label a
         row as a short squeeze (1) when any of these holds:
         Return (5 Days) >= 10, Return (7 Days) >= 10,
         Return (15 Days) >= 15, Return (30 Days) >= 25.

    Parameters
    ----------
    short_df : pandas.DataFrame
        Short-interest rows; needs 'Symbol', 'Record Date' (strings) and the
        descriptive columns selected below. Not modified.
    insider_df : pandas.DataFrame
        Insider trades; needs 'Symbol', 'Date' (datetime), 'Total Amount',
        'Share Price' and 'Relation'. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (Symbol, Date) with the price, return and 'Short Squeeze'
        columns. Rows containing any missing value are dropped.

    Raises
    ------
    KeyError
        If a 'Record Date' label is not present in the mapping table, or a
        required column is missing.

    Notes
    -----
    Returns are measured from 'Close Price Day 1', so 'Return (1 Days)' is
    always 0 and 'Highest Day Return' is never negative. This requires 1 to be
    in DESIRED_DAYS.
    """
    date_mapping = {
        'JanA': '01-11', 'JanB': '01-25',
        'FebA': '02-09', 'FebB': '02-27',
        'MarA': '03-09', 'MarB': '03-24',
        'AprA': '04-12', 'AprB': '04-25',
        'MayA': '05-09', 'MayB': '05-24',
        'JunA': '06-09', 'JunB': '06-27',
        'JulA': '07-12', 'JulB': '07-25',
        'AugA': '08-09', 'AugB': '08-24',
        'SepA': '09-12', 'SepB': '09-26',
        'OctA': '10-10', 'OctB': '10-24',
        'NovA': '11-09', 'NovB': '11-27',
        'DecA': '12-11', 'DecB': '12-27',
    }

    # Work on a copy: converting 'Record Date' on the caller's frame would make
    # a second run of this function fail, because .str no longer works once the
    # column holds datetimes.
    short_df = short_df.copy()
    short_df['Record Date'] = pd.to_datetime(
        short_df['Record Date'].str.replace(
            r'(\d{4})-(\w+)',
            lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}',
            regex=True,  # required: pandas >= 2.0 defaults to regex=False and rejects a callable
        )
    )
    short_df = short_df.sort_values('Record Date').reset_index(drop=True)

    merged_df = pd.merge(short_df, insider_df, on='Symbol')
    # Raw string: '\$' in a normal literal is an invalid escape sequence.
    merged_df['Share Price'] = merged_df['Share Price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)
    merged_df = merged_df[['Symbol', 'Short % of Float', 'Total Amount', 'Market Cap', '% Insider Ownership', 'Record Date', 'Relation', 'Share Price', 'Company Name', 'Sector', 'Industry', 'Date']].copy()

    # Days elapsed from the short-interest record to the insider trade.
    merged_df['Date_diff'] = (merged_df['Date'] - merged_df['Record Date']).dt.days
    # Only records dated on/before the trade are eligible.
    merged_df = merged_df[merged_df['Date_diff'] >= 0]
    # Smallest Date_diff first, so keep='first' retains the closest record.
    # Rows sharing the same Symbol and Date are collapsed into one by this step.
    merged_df = merged_df.sort_values(['Symbol', 'Date_diff'])
    merged_df = merged_df.drop_duplicates(subset=['Symbol', 'Date'], keep='first')
    # Discard matches whose closest record is more than 30 days old.
    merged_df = merged_df[merged_df['Date_diff'] <= 30]

    # Final column selection/order ('Record Date', 'Date_diff', 'Share Price'
    # and 'Industry' are dropped here); copy so the assignments below write to
    # an independent frame.
    new_column_order = ['Date', 'Symbol', 'Short % of Float', 'Total Amount', 'Market Cap', '% Insider Ownership', 'Relation', 'Company Name', 'Sector']
    merged_df = merged_df[new_column_order].copy()

    # Close prices on the N-th NYSE session of a 45-calendar-day window that
    # starts at the trade date.
    nyse = mcal.get_calendar('NYSE')

    for day in DESIRED_DAYS:
        merged_df[f'Close Price Day {day}'] = np.nan

    for idx, row in merged_df.iterrows():
        trading_days = nyse.valid_days(start_date=row['Date'], end_date=row['Date'] + pd.DateOffset(days=45))

        for day in DESIRED_DAYS:
            if day > len(trading_days):
                continue  # window holds fewer than `day` sessions; leave NaN
            session = trading_days[day - 1]
            data = yf.download(
                row['Symbol'],
                start=session,
                end=session + pd.DateOffset(days=1),
                progress=False,  # keep the cell output free of one progress bar per request
            )
            if not data.empty:
                # Positional access that works whether data['Close'] is a
                # Series or a single-column DataFrame.
                merged_df.loc[idx, f'Close Price Day {day}'] = np.asarray(data['Close']).ravel()[0]

    # Percentage return of each sampled close relative to the Day 1 close.
    base_close = merged_df['Close Price Day 1']
    for day in DESIRED_DAYS:
        merged_df[f'Return ({day} Days)'] = ((merged_df[f'Close Price Day {day}'] - base_close) / base_close) * 100

    merged_df['Highest Day Return'] = merged_df[[f'Return ({day} Days)' for day in DESIRED_DAYS]].max(axis=1)
    merged_df['Highest Close Price'] = merged_df[[f'Close Price Day {day}' for day in DESIRED_DAYS]].max(axis=1)

    # Round before labelling: the thresholds below are applied to the rounded values.
    for col in merged_df.columns:
        if 'Close Price' in col or 'Return' in col:
            merged_df[col] = merged_df[col].round(2)

    # Drop rows with any missing value (e.g. a failed download) and renumber.
    merged_df = merged_df.dropna().reset_index(drop=True)

    # Label: 1 when any horizon meets its return threshold, else 0.
    is_squeeze = (
        (merged_df['Return (5 Days)'] >= 10)
        | (merged_df['Return (7 Days)'] >= 10)
        | (merged_df['Return (15 Days)'] >= 15)
        | (merged_df['Return (30 Days)'] >= 25)
    )
    merged_df['Short Squeeze'] = is_squeeze.astype('int64')

    return merged_df

In [36]:
# Merge both inputs, fetch the follow-up close prices and attach the 'Short Squeeze' label.
merged_df = feature_engineering(
    short_df=short_df,
    insider_df=insider_df,
)

  app.launch_new_instance()


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- AN: Data doesn't exist for startDate = 1689897600, endDate = 1689984000
[*********************100%***********************]  1 of 1 completed
[********

In [37]:
# Write the engineered dataset to disk without the row index, then confirm.
merged_df.to_csv(
    path_or_buf='Resources/ShortSqueezeData.csv',
    index=False,
)
print("Data saved to 'Resources/ShortSqueezeData.csv'")

Data saved to 'Resources/ShortSqueezeData.csv'


In [38]:
# Preview the first five rows of the engineered dataset.
merged_df.head(n=5)

Unnamed: 0,Date,Symbol,Short % of Float,Total Amount,Market Cap,% Insider Ownership,Relation,Company Name,Sector,Close Price Day 1,...,Close Price Day 30,Return (1 Days),Return (2 Days),Return (5 Days),Return (7 Days),Return (15 Days),Return (30 Days),Highest Day Return,Highest Close Price,Short Squeeze
0,2022-02-09,ASAN,26.61,3498910.93,4969085000.0,23.3,"Large Shareholder, Officer, Director",Asana Inc. Class A Common Stock,Technology,61.03,...,44.18,0.0,5.88,19.94,10.08,-13.03,-27.61,19.94,73.2,1
1,2022-02-10,ASAN,26.61,1400743.32,4969085000.0,23.3,"Large Shareholder, Officer, Director",Asana Inc. Class A Common Stock,Technology,64.62,...,42.7,0.0,-4.74,14.72,-2.29,-24.68,-33.92,14.72,74.13,1
2,2022-02-11,ASAN,26.61,1035540.04,4969085000.0,23.3,"Large Shareholder, Officer, Director",Asana Inc. Class A Common Stock,Technology,61.56,...,37.9,0.0,11.53,9.13,-1.27,-26.19,-38.43,11.53,68.66,0
3,2022-02-16,ASAN,26.61,12355033.74,4969085000.0,23.3,"Large Shareholder, Officer, Director",Asana Inc. Class A Common Stock,Technology,74.13,...,41.69,0.0,-9.38,-36.87,-27.02,-34.16,-43.76,0.0,74.13,0
4,2022-02-17,ASAN,26.61,3026193.7,4969085000.0,23.3,"Large Shareholder, Officer, Director",Asana Inc. Class A Common Stock,Technology,67.18,...,39.97,0.0,-6.01,-19.1,-18.44,-43.42,-40.5,0.0,67.18,0
