# Pedro - Short Squeeze Predictor

### Imports

In [None]:
# Here we are importing the necessary libraries for the project.
import os
import numpy as np
import pandas as pd
import yfinance as yf
from glob import glob
from pathlib import Path
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import pandas_market_calendars as mcal

### Environment Variables

In [None]:
# Pull the Alpaca credentials out of the local 'alpaca.env' file and build the REST client.
load_dotenv('alpaca.env')

# Either value is None when the corresponding variable is not set.
alpaca_api_key = os.environ.get("ALPACA_API_KEY")
alpaca_secret_key = os.environ.get("ALPACA_SECRET_KEY")

# REST client pinned to the v2 API
api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version="v2")

### Short Interest Data Collection

In [None]:
# Load the short-interest CSV, trim it to the columns used later, and keep only rows that pass
# the short-interest and market-cap thresholds.

short_df = pd.read_csv(Path("Resources/ShortFloat.csv"))
short_df.rename(columns={'ShortSqueeze.com Short Interest Data': 'Company Name'}, inplace=True)

# Dropping irrelevant columns (only those actually present, so a changed export layout does not raise)
columns_to_drop = [
    'Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo',
    'Shares: Float', 'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High',
    '(abs)', '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2', '% Insider Ownership',
    '% Institutional Ownership'
]
columns_to_drop = [col for col in columns_to_drop if col in short_df.columns]
short_df.drop(columns=columns_to_drop, inplace=True)

# Convert both filter columns while short_df is still the frame returned by read_csv.
# Assigning a column after a boolean filter writes into a slice and triggers
# pandas' SettingWithCopyWarning; unparseable values become NaN here.
for numeric_col in ('Short % of Float', 'Market Cap'):
    short_df[numeric_col] = pd.to_numeric(short_df[numeric_col], errors='coerce')

# Keep rows with 'Short % of Float' >= 17 and 'Market Cap' >= 300,000,000 (NaN fails both tests).
# .copy() gives later cells an independent frame they can safely add columns to.
passes_filters = (short_df['Short % of Float'] >= 17) & (short_df['Market Cap'] >= 300000000)
short_df = short_df[passes_filters].copy()

In [None]:
# Preview the first rows of the filtered short-interest frame.
short_df.head()

In [None]:
# ### Date Cleaning and Mapping
# Turn 'Record Date' labels of the form '<year>-<period code>' (e.g. '2023-JanA') into real dates,
# then sort the dataframe chronologically.
# Each period code maps to the 'MM-DD' string substituted for it
# (presumably the report date for that half-month — verify against the data source).
date_mapping = {
    'JanA': '01-11', 'JanB': '01-25',
    'FebA': '02-09', 'FebB': '02-27',
    'MarA': '03-09', 'MarB': '03-24',
    'AprA': '04-12', 'AprB': '04-25',
    'MayA': '05-09', 'MayB': '05-24',
    'JunA': '06-09', 'JunB': '06-27',
    'JulA': '07-12', 'JulB': '07-25',
    'AugA': '08-09', 'AugB': '08-24',
    'SepA': '09-12', 'SepB': '09-26',
    'OctA': '10-10', 'OctB': '10-24',
    'NovA': '11-09', 'NovB': '11-27',
    'DecA': '12-11', 'DecB': '12-27',
}

# regex=True must be explicit: str.replace defaults to literal matching on pandas >= 2.0 and
# raises ValueError when given a callable replacement without it.
# A period code missing from date_mapping raises KeyError rather than producing a wrong date.
record_date_text = short_df['Record Date'].str.replace(
    r'(\d{4})-(\w+)',
    lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}',
    regex=True,
)
short_df['Record Date'] = pd.to_datetime(record_date_text)
short_df.sort_values('Record Date', inplace=True)
short_df.reset_index(drop=True, inplace=True)

In [None]:
# Preview the first rows after 'Record Date' has been parsed and the frame sorted by it.
short_df.head()

### Insider Trading Data Collection

In [None]:
# Load insider trading data, strip '$' and thousands separators from the money columns,
# convert them to numbers, and keep only transactions of at least 1,000,000 in 'Total Amount'.

insider_df = pd.read_csv("Resources/InsiderTrading.csv")

# Raw-string key: '\$' in a plain literal is an invalid escape sequence (a warning on current
# Python versions); r'\$' is the regex for a literal dollar sign.
currency_symbols = {r'\$': '', ',': ''}
for money_col in ('Total Amount', 'Share Price'):
    insider_df[money_col] = insider_df[money_col].replace(currency_symbols, regex=True)

# Coerce 'Total Amount' directly so an unparseable value becomes NaN and is dropped by the
# threshold below. Casting with astype(float) first would raise before the coercion could apply.
insider_df['Total Amount'] = pd.to_numeric(insider_df['Total Amount'], errors='coerce')
insider_df['Share Price'] = insider_df['Share Price'].astype(float)
insider_df['Date'] = pd.to_datetime(insider_df['Date'])
insider_df = insider_df[insider_df['Total Amount'] >= 1000000]

In [None]:
# Preview the first rows of the filtered insider-trading frame.
insider_df.head()

### Data Merge

In [None]:
# Inner-join short_df and insider_df on 'Symbol' (pd.merge's default), rename 'Share Price' to
# 'Close Price', and keep only the columns used by the following cells.
# NOTE(review): 'Close Price' is therefore the price recorded in the insider-trading file, not a
# market close fetched from a price feed — confirm this is the intended baseline.

merged_df = pd.merge(short_df, insider_df, on='Symbol')

# Raw-string key: '\$' in a plain literal is an invalid escape sequence; r'\$' is the regex for
# a literal dollar sign.
merged_df['Share Price'] = merged_df['Share Price'].replace({r'\$': '', ',': ''}, regex=True).astype(float)
merged_df.rename(columns={'Share Price': 'Close Price'}, inplace=True)

# .copy() makes the column subset an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
merged_df = merged_df[['Symbol', 'Short % of Float', 'Total Amount', 'Record Date', 'Close Price', 'Company Name', 'Sector', 'Industry', 'Date']].copy()

### Data Cleaning and Filtering

In [None]:
# For each symbol keep the single row whose insider 'Date' is closest to (and not before) its
# 'Record Date', then discard it if that gap exceeds 30 days. The helper columns are removed afterwards.

days_after_record = (merged_df['Date'] - merged_df['Record Date']).dt.days
merged_df = (
    merged_df.assign(Date_diff=days_after_record)
    .loc[lambda frame: frame['Date_diff'] >= 0]
    .sort_values(['Symbol', 'Date_diff'])
    .drop_duplicates(subset='Symbol', keep='first')
    .loc[lambda frame: frame['Date_diff'] <= 30]
    .drop(columns=['Record Date', 'Date_diff'])
)

# Final column layout
new_column_order = ['Symbol', 'Short % of Float', 'Total Amount', 'Date', 'Close Price', 'Company Name', 'Sector', 'Industry']
merged_df = merged_df[new_column_order]

In [None]:
# Display the merged frame after the date-gap filtering and column reordering.
merged_df

### Data Augmentation

In [None]:
# Add a 'Close Price Day N' column for each N in desired_days, holding the close on the N-th NYSE
# session counted from the row's 'Date' (session 1 is 'Date' itself, or the first session after
# it when 'Date' is not a trading day). Cells with no data stay NaN.

nyse = mcal.get_calendar('NYSE')
desired_days = [2, 3, 4, 5, 7]

# Ten calendar days can contain fewer than seven sessions around market holidays, which left the
# Day 7 column empty for those rows. A wider window only guarantees enough sessions exist;
# which session is the N-th is unchanged.
lookahead_calendar_days = 2 * max(desired_days)

# Work on an independent copy so the new columns are never written into a slice of an earlier frame.
merged_df = merged_df.copy()
for day in desired_days:
    merged_df[f'Close Price Day {day}'] = np.nan

for idx, row in merged_df.iterrows():
    trading_days = nyse.valid_days(
        start_date=row['Date'],
        end_date=row['Date'] + pd.DateOffset(days=lookahead_calendar_days),
    )

    for day in desired_days:
        if day > len(trading_days):
            continue  # calendar returned too few sessions; leave NaN

        session = trading_days[day - 1]
        # One-day window [session, session + 1 day) so at most that session's bar is returned.
        data = yf.download(row['Symbol'], start=session, end=session + pd.DateOffset(days=1))
        if data.empty:
            continue  # no price data for this symbol/session; leave NaN

        # Take the first close by position. Flattening through NumPy avoids the deprecated
        # positional `series[0]` lookup on a date-indexed Series and also works if 'Close'
        # comes back as a one-column DataFrame (seen with some yfinance releases — TODO confirm).
        first_close = np.asarray(data['Close']).ravel()[0]
        merged_df.loc[idx, f'Close Price Day {day}'] = first_close

### Calculate Returns and Highest Day Return

In [None]:
# Percentage change from 'Close Price' to the Day 5 and Day 7 closes.
return_horizons = [5, 7]
for horizon in return_horizons:
    later_close = merged_df[f'Close Price Day {horizon}']
    entry_close = merged_df['Close Price']
    merged_df[f'Return ({horizon} Days)'] = later_close.sub(entry_close).div(entry_close).mul(100)

# Row-wise maxima: the larger of the two returns, and the highest close across all sampled days.
return_columns = [f'Return ({horizon} Days)' for horizon in return_horizons]
close_columns = [f'Close Price Day {day}' for day in desired_days]
merged_df['Highest Day Return'] = merged_df[return_columns].max(axis=1)
merged_df['Highest Close Price'] = merged_df[close_columns].max(axis=1)

# Round every column whose name mentions 'Close Price' or 'Return' to 2 decimal places
rounded_columns = [name for name in merged_df.columns if 'Close Price' in name or 'Return' in name]
merged_df[rounded_columns] = merged_df[rounded_columns].round(2)


# Discard rows with any missing value and renumber from zero
merged_df = merged_df.dropna().reset_index(drop=True)

In [None]:
# Display the final frame with the future closes, returns and row-wise maxima.
merged_df