# Pedro - Short Queeze Predictor
---

### Imports

In [1]:
# Import necessary libraries for the project
import os
import numpy as np
import pandas as pd
import yfinance as yf
from glob import glob
from pathlib import Path
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import pandas_market_calendars as mcal
from sklearn.preprocessing import StandardScaler,OneHotEncoder

### Environment Variables

In [2]:
# Load environment variables from a .env file.
load_dotenv('alpaca.env')

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Initialize Alpaca API
api = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2"
)

### Short Interest Data Collection

In [3]:
# Load and preprocess short interest data
short_df = pd.read_csv(Path("Resources/ShortFloat.csv"))
short_df.rename(columns={'ShortSqueeze.com Short Interest Data': 'Company Name'}, inplace=True)

# Dropping irrelevant columns
columns_to_drop = [
    'Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo',
    'Shares: Float', 'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High',
    '(abs)', '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2', '% Insider Ownership',
    '% Institutional Ownership'
]
columns_to_drop = [col for col in columns_to_drop if col in short_df.columns]
short_df.drop(columns_to_drop, axis=1, inplace=True)

# Convert 'Short % of Float' and 'Market Cap' to numeric and apply filters
short_df['Short % of Float'] = pd.to_numeric(short_df['Short % of Float'], errors='coerce')
short_df = short_df[short_df['Short % of Float'] >= 17]
short_df['Market Cap'] = pd.to_numeric(short_df['Market Cap'], errors='coerce')
short_df = short_df[short_df['Market Cap'] >= 300000000]

  exec(code_obj, self.user_global_ns, self.user_ns)


### Date Cleaning and Mapping

In [4]:
# Convert 'Record Date' column to datetime, sort the dataframe by 'Record Date'.
date_mapping = {
    'JanA': '01-11', 'JanB': '01-25',
    'FebA': '02-09', 'FebB': '02-27',
    'MarA': '03-09', 'MarB': '03-24',
    'AprA': '04-12', 'AprB': '04-25',
    'MayA': '05-09', 'MayB': '05-24',
    'JunA': '06-09', 'JunB': '06-27',
    'JulA': '07-12', 'JulB': '07-25',
    'AugA': '08-09', 'AugB': '08-24',
    'SepA': '09-12', 'SepB': '09-26',
    'OctA': '10-10', 'OctB': '10-24',
    'NovA': '11-09', 'NovB': '11-27',
    'DecA': '12-11', 'DecB': '12-27',
}
short_df['Record Date'] = pd.to_datetime(short_df['Record Date'].str.replace(r'(\d{4})-(\w+)', lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}'))
short_df.sort_values('Record Date', inplace=True)
short_df.reset_index(drop=True, inplace=True)

  


### Insider Trading Data Collection

In [5]:
# Load insider trading data, remove dollar symbols, and convert columns to numeric
insider_df = pd.read_csv("Resources/InsiderTrading.csv")
insider_df['Total Amount'] = insider_df['Total Amount'].replace({'\$': '', ',': ''}, regex=True).astype(float)
insider_df['Share Price'] = insider_df['Share Price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
insider_df['Date'] = pd.to_datetime(insider_df['Date'])
insider_df['Total Amount'] = pd.to_numeric(insider_df['Total Amount'], errors='coerce')
insider_df = insider_df[insider_df['Total Amount'] >= 1000000]

### Data Merge

In [6]:
# Merge short_df and insider_df based on the Symbol column, rename columns, and select necessary columns
merged_df = pd.merge(short_df, insider_df, on='Symbol')
merged_df['Share Price'] = merged_df['Share Price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
merged_df.rename(columns={'Share Price': 'Close Price'}, inplace=True)
merged_df = merged_df[['Symbol', 'Short % of Float', 'Total Amount', 'Record Date', 'Close Price', 'Company Name', 'Sector', 'Industry', 'Date']]

### Data Cleaning and Filtering

In [7]:
# Calculate the difference between 'Date' and 'Record Date' for each row
merged_df['Date_diff'] = (merged_df['Date'] - merged_df['Record Date']).dt.days

# Filter out rows where 'Date_diff' is more than 30 and drop unnecessary columns
merged_df = merged_df[merged_df['Date_diff'] >= 0]
merged_df.sort_values(['Symbol', 'Date_diff'], inplace=True)
merged_df.drop_duplicates(subset='Symbol', keep='first', inplace=True)
merged_df = merged_df[merged_df['Date_diff'] <= 30]
merged_df.drop(columns=['Record Date', 'Date_diff'], inplace=True)

# Reorder columns
new_column_order = ['Symbol', 'Short % of Float', 'Total Amount', 'Date', 'Close Price', 'Company Name', 'Sector', 'Industry']
merged_df = merged_df[new_column_order]

### Data Augmentation

In [8]:
# Create new columns for Close Prices at future dates and calculate Returns.
nyse = mcal.get_calendar('NYSE')
desired_days = [2, 3, 4, 5, 7]
for day in desired_days:
    merged_df[f'Close Price Day {day}'] = np.nan

for idx, row in merged_df.iterrows():
    trading_days = nyse.valid_days(start_date=row['Date'], end_date=row['Date'] + pd.DateOffset(days=10))

    for day in desired_days:
        if day <= len(trading_days):
            data = yf.download(row['Symbol'], start=trading_days[day - 1], end=trading_days[day - 1] + pd.DateOffset(days=1))
            if not data.empty:  
                merged_df.loc[idx, f'Close Price Day {day}'] = data['Close'][0] 

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

### Calculate Returns and Highest Day Return

In [9]:
for day in [5, 7]: 
    merged_df[f'Return ({day} Days)'] = ((merged_df[f'Close Price Day {day}'] - merged_df['Close Price']) / merged_df['Close Price']) * 100

merged_df['Highest Day Return'] = merged_df[[f'Return ({day} Days)' for day in [5, 7]]].max(axis=1)
merged_df['Highest Close Price'] = merged_df[[f'Close Price Day {day}' for day in desired_days]].max(axis=1)

# Format the 'Close Price' and 'Return' columns to have 2 decimal places
for col in merged_df.columns:
    if 'Close Price' in col or 'Return' in col:
        merged_df[col] = merged_df[col].round(2)

# Drop NaNs and reset the index
merged_df.dropna(inplace=True)
merged_df.reset_index(drop=True, inplace=True)

In [10]:
merged_df.head()

Unnamed: 0,Symbol,Short % of Float,Total Amount,Date,Close Price,Company Name,Sector,Industry,Close Price Day 2,Close Price Day 3,Close Price Day 4,Close Price Day 5,Close Price Day 7,Return (5 Days),Return (7 Days),Highest Day Return,Highest Close Price
0,AKRO,26.46,10400000.0,2022-09-19,26.0,Akero Therapeutics Inc,Healthcare,Biotechnology,26.27,26.44,26.18,25.91,26.65,-0.35,2.5,2.5,26.65
1,ASAN,26.61,3498910.93,2022-02-09,61.03,Asana Inc. Class A Common Stock,Technology,Software - Application,64.62,61.56,68.66,73.2,67.18,19.94,10.08,19.94,73.2
2,CHWY,22.87,4616640.16,2022-06-13,27.48,Chewy Inc Class A,Consumer Cyclical,Internet Retail,27.05,29.24,28.43,28.7,32.62,4.44,18.7,18.7,32.62
3,DCPH,17.99,14789780.0,2022-04-29,10.0,Deciphera Pharmaceuticals Inc,Healthcare,Biotechnology,10.91,10.55,10.77,10.63,9.11,6.3,-8.9,6.3,10.91
4,DISH,20.22,6230000.0,2023-05-11,6.23,Dish Network Corporation - Class A,Communication Services,Entertainment,6.16,6.57,6.48,6.86,6.7,10.11,7.54,10.11,6.86


In [18]:
merged_df = merged_df[merged_df['Highest Day Return'] >= 15]
merged_df

Unnamed: 0,Symbol,Short % of Float,Total Amount,Date,Close Price,Company Name,Sector,Industry,Close Price Day 2,Close Price Day 3,Close Price Day 4,Close Price Day 5,Close Price Day 7,Return (5 Days),Return (7 Days),Highest Day Return,Highest Close Price
1,ASAN,26.61,3498910.93,2022-02-09,61.03,Asana Inc. Class A Common Stock,Technology,Software - Application,64.62,61.56,68.66,73.2,67.18,19.94,10.08,19.94,73.2
2,CHWY,22.87,4616640.16,2022-06-13,27.48,Chewy Inc Class A,Consumer Cyclical,Internet Retail,27.05,29.24,28.43,28.7,32.62,4.44,18.7,18.7,32.62
5,ESTE,29.3,1705050.0,2022-09-23,11.37,Earthstone Energy Inc Class A,Energy,Oil & Gas E&P,10.77,10.89,11.9,12.21,13.57,7.39,19.35,19.35,13.57
10,IGMS,19.91,1028550.0,2022-11-08,17.14,Igm Biosciences Inc,Healthcare,Biotechnology,16.05,20.93,25.38,24.96,25.7,45.62,49.94,49.94,25.7
15,RCKT,19.23,19999997.0,2022-10-06,14.75,Rocket Pharmaceuticals Inc,Healthcare,Biotechnology,17.08,16.72,16.48,17.23,16.34,16.81,10.78,16.81,17.23


In [11]:
# Transpose the dataframe for further analysis
transposed_df = merged_df.T
transposed_df = transposed_df.drop(['Company Name', 'Date', 'Sector', 'Industry'])
transposed_df.set_axis(transposed_df.loc['Symbol'], axis=1, inplace=True)
transposed_df.drop('Symbol', inplace=True)

# Display the transposed dataframe
transposed_df.head()

Symbol,AKRO,ASAN,CHWY,DCPH,DISH,ESTE,EVA,FATE,GME,GRPN,...,PRCH,RCKT,RCUS,RILY,RVMD,SAH,SAVA,SBOW,UWMC,W
Short % of Float,26.46,26.61,22.87,17.99,20.22,29.3,17.13,29.95,18.59,20.28,...,21.49,19.23,21.54,18.46,19.38,19.33,27.62,25.85,24.79,21.38
Total Amount,10400000.0,3498910.93,4616640.16,14789780.0,6230000.0,1705050.0,1528166.9,1586029.02,2039367.7,1352916.24,...,1000002.39,19999997.0,19452600.0,1186397.5,1191626.94,5339927.75,1990629.76,3704113.25,1010056.36,1222400.0
Close Price,26.0,61.03,27.48,10.0,6.23,11.37,53.25,6.18,101.37,19.59,...,7.59,14.75,19.26,47.46,24.26,41.89,25.76,30.05,3.64,122.24
Close Price Day 2,26.27,64.62,27.05,10.91,6.16,10.77,57.25,6.34,35.25,19.37,...,8.11,17.08,19.71,47.42,24.33,39.6,24.97,29.53,3.79,124.81
Close Price Day 3,26.44,61.56,29.24,10.55,6.57,10.89,59.35,6.03,35.6,19.12,...,7.69,16.72,20.31,47.24,24.67,41.01,24.79,26.23,3.9,126.23


## Data Preparation

In [12]:
# Review the data types associated with the columns
merged_df.dtypes

Symbol                         object
Short % of Float              float64
Total Amount                  float64
Date                   datetime64[ns]
Close Price                   float64
Company Name                   object
Sector                         object
Industry                       object
Close Price Day 2             float64
Close Price Day 3             float64
Close Price Day 4             float64
Close Price Day 5             float64
Close Price Day 7             float64
Return (5 Days)               float64
Return (7 Days)               float64
Highest Day Return            float64
Highest Close Price           float64
dtype: object

In [13]:
# Create a list of categorical variables 
categorical_variables = list(merged_df.select_dtypes("object").columns)

# Display the categorical variables list
display(categorical_variables)

['Symbol', 'Company Name', 'Sector', 'Industry']

In [14]:
enc = OneHotEncoder(sparse=False)

In [15]:
# Encode the categorical variables using OneHotEncoder
encoded_data = enc.fit_transform(merged_df[categorical_variables])

In [16]:
# Create a DataFrame with the encoded variables
encoded_df = pd.DataFrame(encoded_data, columns = enc.get_feature_names(categorical_variables))

# Review the DataFrame
encoded_df.head()



Unnamed: 0,Symbol_AKRO,Symbol_ASAN,Symbol_CHWY,Symbol_DCPH,Symbol_DISH,Symbol_ESTE,Symbol_EVA,Symbol_FATE,Symbol_GME,Symbol_GRPN,...,Industry_Department Stores,Industry_Entertainment,Industry_Financial Conglomerates,Industry_Internet Content & Information,Industry_Internet Retail,Industry_Lumber & Wood Production,Industry_Mortgage Finance,Industry_Oil & Gas E&P,Industry_Software - Application,Industry_Specialty Retail
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Add the numerical variables from the original DataFrame to the one-hot encoding DataFrame
encoded_df = pd.concat([merged_df.select_dtypes(["int64", "float64"]), encoded_df], axis=1)

# Review the DataFrame
encoded_df.head()

Unnamed: 0,Short % of Float,Total Amount,Close Price,Close Price Day 2,Close Price Day 3,Close Price Day 4,Close Price Day 5,Close Price Day 7,Return (5 Days),Return (7 Days),...,Industry_Department Stores,Industry_Entertainment,Industry_Financial Conglomerates,Industry_Internet Content & Information,Industry_Internet Retail,Industry_Lumber & Wood Production,Industry_Mortgage Finance,Industry_Oil & Gas E&P,Industry_Software - Application,Industry_Specialty Retail
0,26.46,10400000.0,26.0,26.27,26.44,26.18,25.91,26.65,-0.35,2.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,26.61,3498910.93,61.03,64.62,61.56,68.66,73.2,67.18,19.94,10.08,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,22.87,4616640.16,27.48,27.05,29.24,28.43,28.7,32.62,4.44,18.7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,17.99,14789780.0,10.0,10.91,10.55,10.77,10.63,9.11,6.3,-8.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20.22,6230000.0,6.23,6.16,6.57,6.48,6.86,6.7,10.11,7.54,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
