In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf

In [6]:
# Load the company-overview CSV into IPO_df and show it as the cell output.
company_overview_csv = "raw_dataset/company_overview.csv"
IPO_df = pd.read_csv(company_overview_csv)
IPO_df

Unnamed: 0.1,Unnamed: 0,SYMBOL,ASSET_TYPE,company_name,EXCHANGE,CURRENCY,COUNTRY,SECTOR,INDUSTRY,ADDRESS
0,0,MRM,Common Stock,MEDIROM Healthcare Technologies Inc,NASDAQ,USD,USA,Consumer Cyclical,Personal Services,"Tradepia Odaiba, Tokyo, Japan, 135-0091"
1,1,VTAQ,Common Stock,Ventoux CCM Acquisition Corp,NASDAQ,USD,USA,Financial Services,Shell Companies,"1 East Putnam Avenue, Greenwich, CT, United St..."
2,2,IKT,Common Stock,"Inhibikase Therapeutics, Inc",NASDAQ,USD,USA,Healthcare,Biotechnology,"3350 Riverwood Parkway, Atlanta, GA, United St..."
3,3,GBS,Common Stock,GBS Inc,NASDAQ,USD,USA,Healthcare,Medical Devices,"708 3rd Avenue, New York, NY, United States, 1..."
4,4,HCAR,Common Stock,Healthcare Services Acquisition Corporation,NASDAQ,USD,USA,Financial Services,Shell Companies,"7809 Woodmont Avenue, Bethesda, MD, United Sta..."
...,...,...,...,...,...,...,...,...,...,...
656,656,RBKB,Common Stock,"Rhinebeck Bancorp, Inc",NASDAQ,USD,USA,Financial Services,Banks-Regional,"2 Jefferson Plaza, Poughkeepsie, NY, United St..."
657,657,BCOW,Common Stock,"1895 Bancorp of Wisconsin, Inc",NASDAQ,USD,USA,Financial Services,Banks-Regional,"7001 West Edgerton Avenue, Greenfield, WI, Uni..."
658,658,MDJH,Common Stock,MDJM Ltd,NASDAQ,USD,USA,Real Estate,Real Estate Services,"Saidun Center, Tianjin, China"
659,659,MTC,Common Stock,"Mmtec, Inc",NASDAQ,USD,USA,Technology,Software-Application,"Air China Century Building, Beijing, China, 10..."


In [14]:
# Load the three-month-return CSV into return_df and show it as the cell output.
three_month_return_csv = "raw_dataset/three_month_return.csv"
return_df = pd.read_csv(three_month_return_csv)
return_df

Unnamed: 0,SYMBOL,IPO_DATE,NAME,IPO_PRICE,CURRENT,RETURN,THREE_MTH_DATE,THREE_MTH_RETURN
0,MRM,2020-12-29,Medirom Healthcare,15.0,10.30,-0.3133,2021-03-29,0.0000
1,VTAQ,2020-12-24,Ventoux CCM Acquisition,10.0,9.89,-0.0110,2021-03-24,0.0000
2,IKT,2020-12-23,Inhibikase Therapeutics,10.0,6.21,-0.3790,2021-03-23,0.0000
3,GBS,2020-12-23,"GBS, Inc.",17.0,6.81,-0.5994,2021-03-23,0.0000
4,HCAR,2020-12-23,Healthcare Services Acquisition,10.0,9.85,-0.0155,2021-03-23,0.0000
...,...,...,...,...,...,...,...,...
705,RBKB,2019-01-17,Rhinebeck Bancorp,10.0,10.49,0.0490,2019-04-17,11.6600
706,BCOW,2019-01-09,1895 Bancorp of Wisconsin,10.0,14.94,0.4940,2019-04-09,9.6700
707,MDJH,2019-01-08,MDJM Ltd.,5.0,4.25,-0.1500,2019-04-08,3.4730
708,MTC,2019-01-08,"MMTec, Inc.",4.0,2.83,-0.2925,2019-04-08,4.3301


# Preprocess

In [12]:
# Collect the names of every column whose dtype is "object"; these are the
# candidate categorical variables.
IPO_cat = [
    column_name
    for column_name, column_dtype in IPO_df.dtypes.items()
    if column_dtype == "object"
]

# Cardinality of each candidate column (displayed as the cell output).
IPO_df[IPO_cat].nunique()

SYMBOL          661
ASSET_TYPE        1
company_name    661
EXCHANGE          7
CURRENCY          1
COUNTRY           1
SECTOR           12
INDUSTRY         73
ADDRESS         598
dtype: int64

# Encoding

In [13]:
# Expand the selected categorical columns into one indicator column per
# category; every other column of IPO_df is carried over unchanged.
columns_to_encode = ["EXCHANGE", "SECTOR", "INDUSTRY"]
IPO_df_encoded = pd.get_dummies(IPO_df, columns=columns_to_encode)
IPO_df_encoded.head()

Unnamed: 0.1,Unnamed: 0,SYMBOL,ASSET_TYPE,company_name,CURRENCY,COUNTRY,ADDRESS,EXCHANGE_CSE,EXCHANGE_NASDAQ,EXCHANGE_NYSE,...,INDUSTRY_Solar,INDUSTRY_Specialty Chemicals,INDUSTRY_Specialty Industrial Machinery,INDUSTRY_Specialty Retail,INDUSTRY_Staffing & Employment Services,INDUSTRY_Telecom Services,INDUSTRY_Thermal Coal,INDUSTRY_Trucking,INDUSTRY_Utilities-Regulated Gas,INDUSTRY_Waste Management
0,0,MRM,Common Stock,MEDIROM Healthcare Technologies Inc,USD,USA,"Tradepia Odaiba, Tokyo, Japan, 135-0091",0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,VTAQ,Common Stock,Ventoux CCM Acquisition Corp,USD,USA,"1 East Putnam Avenue, Greenwich, CT, United St...",0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,IKT,Common Stock,"Inhibikase Therapeutics, Inc",USD,USA,"3350 Riverwood Parkway, Atlanta, GA, United St...",0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,GBS,Common Stock,GBS Inc,USD,USA,"708 3rd Avenue, New York, NY, United States, 1...",0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,HCAR,Common Stock,Healthcare Services Acquisition Corporation,USD,USA,"7809 Woodmont Avenue, Bethesda, MD, United Sta...",0,1,0,...,0,0,0,0,0,0,0,0,0,0


# Target Feature

In [None]:
# Build the model inputs (X) and target (y).
#
# THREE_MTH_RETURN is a column of return_df, not of IPO_df_encoded, so the two
# tables are joined on SYMBOL first. Only SYMBOL and the target are taken from
# return_df so the join does not collide with the "Unnamed: 0" column that
# both tables carry.
# NOTE(review): assumes SYMBOL is unique in return_df; duplicate symbols would
# repeat company rows after the join -- confirm.
ipo_with_target = IPO_df_encoded.merge(
    return_df[["SYMBOL", "THREE_MTH_RETURN"]],
    on="SYMBOL",
    how="inner",  # keep only companies that have a recorded target
)

# Columns that must not reach the scaler/network: leftover CSV row indices,
# identifiers and free text (strings), and the single-valued ASSET_TYPE /
# CURRENCY / COUNTRY columns.
non_feature_columns = [
    "Unnamed: 0.1",
    "Unnamed: 0",
    "SYMBOL",
    "ASSET_TYPE",
    "company_name",
    "CURRENCY",
    "COUNTRY",
    "ADDRESS",
]

# NOTE(review): the downstream cells (stratify=y, sigmoid output,
# binary_crossentropy, accuracy) expect y to be a binary class label, but
# THREE_MTH_RETURN looks continuous in the displayed data. The threshold that
# defines a "positive" outcome must be confirmed before binarizing here.
y = ipo_with_target["THREE_MTH_RETURN"].values

# Only the one-hot indicator columns remain; cast to float so the array is
# numeric regardless of whether get_dummies produced ints or booleans.
X = (
    ipo_with_target
    .drop(columns=non_feature_columns + ["THREE_MTH_RETURN"], errors="ignore")
    .astype("float32")
    .values
)

# Split Training and Testing sets

In [None]:
# Hold out a test set. The split is seeded for repeatability and stratified
# on y.
# NOTE(review): stratify=y requires discrete class labels -- confirm y is
# categorical/binary before running.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Fit the standardizer on the training rows only, then apply the same
# transform to both partitions.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

# Deep Learning

In [None]:
# Network dimensions: input width follows the scaled training matrix.
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 10
hidden_nodes_layer2 = 5

# Feed-forward network: two ReLU hidden layers followed by a single sigmoid
# output unit.
nn = tf.keras.models.Sequential(
    [
        # First hidden layer (also declares the input width)
        tf.keras.layers.Dense(
            units=hidden_nodes_layer1,
            input_dim=number_input_features,
            activation="relu",
        ),
        # Second hidden layer
        tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
        # Output layer
        tf.keras.layers.Dense(units=1, activation="sigmoid"),
    ]
)

# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 50 epochs.
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=50)

# Score the trained network on the held-out test data and report the result.
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")