# Pedro - Short Queeze Predictor
---

### 1. Libraries Import

In [46]:
import numpy as np
import pandas as pd
import yfinance as yf
from pathlib import Path
import pandas_market_calendars as mcal
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced

### 2. Data Preparation Constants

In [4]:
SHORT_INTEREST_FLOAT_FILTER = 17
MARKET_CAP_FILTER = 300000000
TOTAL_AMOUNT_FILTER = 1000000
DESIRED_DAYS = [1, 2, 5, 7]
short_float_filepath = "Resources/ShortFloat.csv"
insider_trading_filepath = "Resources/InsiderTrading.csv"

### 3. Data Loading and Preprocessing Function

In [5]:
def load_and_preprocess_data(short_float_filepath, insider_trading_filepath):
    
    short_df = pd.read_csv(short_float_filepath)
    short_df.rename(columns={'ShortSqueeze.com Short Interest Data': 'Company Name'}, inplace=True)
    # Dropping irrelevant columns
    columns_to_drop = [
    'Total Short Interest', 'Days to Cover', 'Performance (52-wk)', 'Short: Prior Mo', '% Change Mo/Mo',
    'Shares: Float', 'Avg. Daily Vol.', 'Shares: Outstanding', 'Short Squeeze Ranking™', '% from 52-wk High',
    '(abs)', '% from 200 day MA', '(abs).1', '% from 50 day MA', '(abs).2', '% Insider Ownership',
    '% Institutional Ownership'
    ]
    columns_to_drop = [col for col in columns_to_drop if col in short_df.columns]
    short_df.drop(columns_to_drop, axis=1, inplace=True)
    # Convert 'Short % of Float' and 'Market Cap' to numeric and apply filters
    short_df['Short % of Float'] = pd.to_numeric(short_df['Short % of Float'], errors='coerce')
    short_df = short_df[short_df['Short % of Float'] >= SHORT_INTEREST_FLOAT_FILTER]
    short_df['Market Cap'] = pd.to_numeric(short_df['Market Cap'], errors='coerce')
    short_df = short_df[short_df['Market Cap'] >= MARKET_CAP_FILTER]
    
    insider_df = pd.read_csv(insider_trading_filepath)
    insider_df['Total Amount'] = insider_df['Total Amount'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    insider_df['Share Price'] = insider_df['Share Price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    insider_df['Date'] = pd.to_datetime(insider_df['Date'])
    insider_df['Total Amount'] = pd.to_numeric(insider_df['Total Amount'], errors='coerce')
    insider_df = insider_df[insider_df['Total Amount'] >= TOTAL_AMOUNT_FILTER]
    
    return short_df, insider_df

In [7]:
# Load and preprocess data
short_df, insider_df = load_and_preprocess_data(short_float_filepath, insider_trading_filepath)

  short_df = pd.read_csv(short_float_filepath)


### 4. Feature Engineering Function

In [8]:
def feature_engineering(short_df, insider_df):
    date_mapping = {
    'JanA': '01-11', 'JanB': '01-25',
    'FebA': '02-09', 'FebB': '02-27',
    'MarA': '03-09', 'MarB': '03-24',
    'AprA': '04-12', 'AprB': '04-25',
    'MayA': '05-09', 'MayB': '05-24',
    'JunA': '06-09', 'JunB': '06-27',
    'JulA': '07-12', 'JulB': '07-25',
    'AugA': '08-09', 'AugB': '08-24',
    'SepA': '09-12', 'SepB': '09-26',
    'OctA': '10-10', 'OctB': '10-24',
    'NovA': '11-09', 'NovB': '11-27',
    'DecA': '12-11', 'DecB': '12-27',
    }
    
    short_df['Record Date'] = pd.to_datetime(short_df['Record Date'].str.replace(r'(\d{4})-(\w+)', lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}'))
    short_df.sort_values('Record Date', inplace=True)
    short_df.reset_index(drop=True, inplace=True)
    
    merged_df = pd.merge(short_df, insider_df, on='Symbol')
    merged_df['Share Price'] = merged_df['Share Price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
    merged_df = merged_df[['Symbol', 'Short % of Float', 'Total Amount', 'Record Date', 'Share Price', 'Company Name', 'Sector', 'Industry', 'Date']]
    
    # Calculate the difference between 'Date' and 'Record Date' for each row
    merged_df['Date_diff'] = (merged_df['Date'] - merged_df['Record Date']).dt.days
    # Filter out rows where 'Date_diff' is more than 30 and drop unnecessary columns
    merged_df = merged_df[merged_df['Date_diff'] >= 0]
    merged_df.sort_values(['Symbol', 'Date_diff'], inplace=True)
    merged_df.drop_duplicates(subset=['Symbol', 'Date'], keep='first', inplace=True)
    merged_df = merged_df[merged_df['Date_diff'] <= 30]
    merged_df.drop(columns=['Record Date', 'Date_diff'], inplace=True)

    # Reorder columns
    new_column_order = ['Symbol', 'Short % of Float', 'Total Amount', 'Date', 'Company Name', 'Sector', 'Industry']
    merged_df = merged_df[new_column_order]
    
    # Create new columns for Close Prices at future dates and calculate Returns.
    nyse = mcal.get_calendar('NYSE')
    desired_days = DESIRED_DAYS
    for day in desired_days:
        merged_df[f'Close Price Day {day}'] = np.nan

    for idx, row in merged_df.iterrows():
        trading_days = nyse.valid_days(start_date=row['Date'], end_date=row['Date'] + pd.DateOffset(days=10))

        for day in desired_days:
            if day <= len(trading_days):
                data = yf.download(row['Symbol'], start=trading_days[day - 1], end=trading_days[day - 1] + pd.DateOffset(days=1))
                if not data.empty:  
                    merged_df.loc[idx, f'Close Price Day {day}'] = data['Close'][0] 
    
    # Calculate Returns and Highest Day Return
    for day in DESIRED_DAYS: 
        merged_df[f'Return ({day} Days)'] = ((merged_df[f'Close Price Day {day}'] - merged_df['Close Price Day 1']) / merged_df['Close Price Day 1']) * 100
        
    merged_df['Highest Day Return'] = merged_df[[f'Return ({day} Days)' for day in [5, 7]]].max(axis=1)
    merged_df['Highest Close Price'] = merged_df[[f'Close Price Day {day}' for day in desired_days]].max(axis=1)

    for col in merged_df.columns:
        if 'Close Price' in col or 'Return' in col:
            merged_df[col] = merged_df[col].round(2)

    merged_df.dropna(inplace=True)
    merged_df.reset_index(drop=True, inplace=True)
    
    merged_df['Short Squeeze'] = 0
    merged_df.loc[merged_df['Highest Day Return'] >= 20, 'Short Squeeze'] = 1
    merged_df.loc[merged_df['Highest Day Return'] < 20, 'Short Squeeze'] = 0

    return merged_df

In [9]:
# Feature engineering
merged_df = feature_engineering(short_df, insider_df)

  short_df['Record Date'] = pd.to_datetime(short_df['Record Date'].str.replace(r'(\d{4})-(\w+)', lambda m: f'{m.group(1)}-{date_mapping[m.group(2)]}'))


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********


1 Failed download:
['MYOV']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['MYOV']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['MYOV']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['MYOV']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%*******


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:





['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')


[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed


1 Failed download:
['TCDA']: Exception('%ticker%: No timezone found, symbol may be delisted')



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


### 5. One Hot Encoding

In [12]:
def one_hot_encode(df):
    encoder = OneHotEncoder(sparse=False)
    categorical_variables = ['Company Name', 'Sector', 'Industry']
    encoded_df = pd.DataFrame(encoder.fit_transform(df[categorical_variables]))
    encoded_df.columns = encoder.get_feature_names_out(categorical_variables)
    numerical_df = df.drop(columns=categorical_variables, axis=1)
    return pd.concat([numerical_df, encoded_df], axis=1)

In [13]:
# One hot encode the data
merged_df = one_hot_encode(merged_df)



### 6. Train-Test Split

In [38]:
X = merged_df.drop(columns=['Short Squeeze'])
y = merged_df['Short Squeeze']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 7. Data Normalization

In [19]:
# Drop the 'Symbol' column from X_train and X_test
X_train_scaled = X_train.drop('Symbol', axis=1)
X_test_scaled = X_test.drop('Symbol', axis=1)

# Drop the 'Date' column from X_train and X_test
X_train_scaled = X_train_scaled.drop('Date', axis=1)
X_test_scaled = X_test_scaled.drop('Date', axis=1)

# Scale the numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_scaled)

### 8. Model Training
#### Feedforward Neural Network (FNN)

In [20]:
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=2)

Epoch 1/50
2/2 - 0s - loss: 0.7687 - accuracy: 0.3774 - 315ms/epoch - 157ms/step
Epoch 2/50
2/2 - 0s - loss: 0.6607 - accuracy: 0.4906 - 4ms/epoch - 2ms/step
Epoch 3/50
2/2 - 0s - loss: 0.5748 - accuracy: 0.7358 - 4ms/epoch - 2ms/step
Epoch 4/50
2/2 - 0s - loss: 0.5041 - accuracy: 0.7925 - 4ms/epoch - 2ms/step
Epoch 5/50
2/2 - 0s - loss: 0.4430 - accuracy: 0.8491 - 4ms/epoch - 2ms/step
Epoch 6/50
2/2 - 0s - loss: 0.3992 - accuracy: 0.8868 - 4ms/epoch - 2ms/step
Epoch 7/50
2/2 - 0s - loss: 0.3594 - accuracy: 0.9057 - 4ms/epoch - 2ms/step
Epoch 8/50
2/2 - 0s - loss: 0.3255 - accuracy: 0.9057 - 5ms/epoch - 2ms/step
Epoch 9/50
2/2 - 0s - loss: 0.2997 - accuracy: 0.8868 - 4ms/epoch - 2ms/step
Epoch 10/50
2/2 - 0s - loss: 0.2784 - accuracy: 0.8868 - 3ms/epoch - 2ms/step
Epoch 11/50
2/2 - 0s - loss: 0.2581 - accuracy: 0.8868 - 4ms/epoch - 2ms/step
Epoch 12/50
2/2 - 0s - loss: 0.2414 - accuracy: 0.8868 - 4ms/epoch - 2ms/step
Epoch 13/50
2/2 - 0s - loss: 0.2251 - accuracy: 0.9057 - 3ms/epoch - 

<keras.callbacks.History at 0x7f87643d9ab0>

### 9. Model Evaluation

In [21]:
loss, accuracy = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 85.71%


### 10. Model Predictions

In [22]:
predictions = model.predict(X_test_scaled)
# Since this is a binary classification problem, we'll set a threshold at 0.5
predictions = [1 if pred > 0.5 else 0 for pred in predictions]



In [23]:
# You might want to compare these predictions with the actual values
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})

# Print out a sample of the comparison DataFrame
print(comparison_df.sample(10))

    Actual  Predicted
28       0          0
61       0          0
64       0          0
4        1          0
12       1          0
59       0          0
9        0          0
45       0          0
40       0          0
0        0          0


### 11. Performance Metrics

In [24]:
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        12
           1       0.00      0.00      0.00         2

    accuracy                           0.86        14
   macro avg       0.43      0.50      0.46        14
weighted avg       0.73      0.86      0.79        14

[[12  0]
 [ 2  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the model
model.save('my_model')

# Load the model later
from tensorflow.keras.models import load_model
loaded_model = load_model('my_model')

## Alternative Models
### Random Forest vs SMOTE

In [26]:
# Count the distinct values in the original labels data
y_train.value_counts()

0    45
1     8
Name: Short Squeeze, dtype: int64

In [27]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [28]:
# Instantiate SMOTE
smote_sampler = SMOTE(random_state=1, sampling_strategy='minority')

# Fit the SMOTE model to the training data
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_scaled, y_train)

# Fit the RandomForestClassifier on the resampled data
model_resampled_rf = RandomForestClassifier()
model_resampled_rf.fit(X_resampled, y_resampled)

# Generate predictions based on the resampled data model
rf_resampled_predictions = model_resampled_rf.predict(X_test_scaled)

In [33]:
print(f'Original Data:\n{confusion_matrix(y_test, rf_predictions)}')
print(f'Oversampled Data:\n{confusion_matrix(y_test, rf_resampled_predictions)}')

Original Data:
[[12  0]
 [ 0  2]]
Oversampled Data:
[[12  0]
 [ 1  1]]


In [35]:
baso = balanced_accuracy_score(y_test, rf_predictions)
print(f'Original Data: {baso}')
basrs = balanced_accuracy_score(y_test, rf_resampled_predictions)
print(f'Oversampled Data: {basrs}')

Original Data: 1.0
Oversampled Data: 0.75


In [37]:
print(f'Original Data:\n{classification_report_imbalanced(y_test, rf_predictions)}')
print(f'Oversampled Data:\n{classification_report_imbalanced(y_test, rf_resampled_predictions)}')

Original Data:
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00        12
          1       1.00      1.00      1.00      1.00      1.00      1.00         2

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        14

Oversampled Data:
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      1.00      0.50      0.96      0.71      0.53        12
          1       1.00      0.50      1.00      0.67      0.71      0.48         2

avg / total       0.93      0.93      0.57      0.92      0.71      0.52        14

