In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import tensorflow as tf
import psycopg2

In [2]:
conn = psycopg2.connect("dbname=ipo-database user=postgres password=0nlineSQL")

In [4]:
master_df = pd.read_sql_query("SELECT * from master_data;", conn)
master_df.head()

Unnamed: 0,symbol,trade_date,issuer,lead_jointlead_managers,offer_price,opening_price,firstday_close,firstday_percent_pxchng,dollar_change_opening,dollar_change_close,...,div_payout,div_payout_commonstock,div_payout_preferredstock,proceeds_fromissuance_commonstock,proceeds_fromissuance_longterm_debtcapital_secnet,proceeds_fromissuance_preferredstock,proceeds_fromrepurchase_equity,proceeds_fromsale_treasurystock,changein_cash_cashequivalents,changein_exchangerate
0,BSN.U,2020-09-11,Broadstone Acquisition,Citigroup,$10.00,$9.84,$9.60,-0.04,-$0.16,-$0.40,...,,,,,,,,,,
1,LEAP.U,2020-09-11,Ribbit LEAP,JPMorgan,$10.00,$11.20,$11.55,0.16,$1.20,$1.55,...,,,,,,,,,,
2,SNPR.U,2020-09-11,Tortoise Acquisition Corp. II,Barclays/ Goldman Sachs,$10.00,$10.35,$10.51,0.05,$0.35,$0.51,...,,,,,,,,,,
3,TWCTU,2020-09-11,TWC Tech Holdings II,Citigroup/ Deutsche Bank Securities,$10.00,$10.08,$10.07,0.01,$0.08,$0.07,...,,,,,,,,,,
4,NSH.U,2020-09-10,NavSight Holdings,Credit Suisse,$10.00,$10.00,$10.00,0.0,$0.00,$0.00,...,,,,,,,,,,


In [5]:
list(master_df)

['symbol',
 'trade_date',
 'issuer',
 'lead_jointlead_managers',
 'offer_price',
 'opening_price',
 'firstday_close',
 'firstday_percent_pxchng',
 'dollar_change_opening',
 'dollar_change_close',
 'star_ratings',
 'performed',
 'asset_type',
 'company_name',
 'exchange',
 'currency',
 'country',
 'sector',
 'industry',
 'address',
 'three_mth_date',
 'three_mth_ipo',
 'price_change',
 'three_mth_return',
 'price_gain_loss',
 'fiscaldate_end',
 'reported_currency',
 'gross_profit',
 'total_revenue',
 'costof_revenue',
 'costof_goodservices_sold',
 'operating_income',
 'selling_gen_admin',
 'research_dev',
 'operating_expenses',
 'investment_income_net',
 'net_interest_income',
 'interest_income',
 'interest_expense',
 'noninterest_income',
 'othernon_operatingincome',
 'depreciation',
 'dpr_and_amort',
 'income_beforetax',
 'incometax_expense',
 'interest_debt_expense',
 'netincome_cont_operations',
 'comprehensive_income_netoftax',
 'ebit',
 'ebitda',
 'netincome',
 'total_assets',
 't

# Calculated Columns

In [11]:
# Net profit margin
master_df["net_profit_margin"] = master_df["netincome"]/master_df["total_revenue"]
master_df["net_profit_margin"]

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
3465   NaN
3466   NaN
3467   NaN
3468   NaN
3469   NaN
Name: net_profit_margin, Length: 3470, dtype: float64

In [10]:
# Calculate Gross Profit Margin = gross_profit/total_revenue
master_df["gross_profit_margin"] = master_df["gross_profit"]/master_df["total_revenue"]
master_df["gross_profit_margin"]

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
        ..
3465   NaN
3466   NaN
3467   NaN
3468   NaN
3469   NaN
Name: gross_profit_margin, Length: 3470, dtype: float64

### Calculate Debt Asset Ratio:

The formula for the debt to asset ratio is as follows: Debt/Asset = (Short-term Debt + Long-term Debt) / Total Assets. Where: Total Assets may include all current and non-current assets on the company's balance sheet, or may only include certain assets such as Property, Plant & Equipment (PP&E)

In [16]:
# Debt to Asset Ratio
master_df["debt_asset_ratio"] = master_df["short_longterm_debt_total"]/master_df["total_assets"]
master_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,symbol,trade_date,issuer,lead_jointlead_managers,offer_price,opening_price,firstday_close,firstday_percent_pxchng,dollar_change_opening,dollar_change_close,...,proceeds_fromissuance_commonstock,proceeds_fromissuance_longterm_debtcapital_secnet,proceeds_fromissuance_preferredstock,proceeds_fromrepurchase_equity,proceeds_fromsale_treasurystock,changein_cash_cashequivalents,changein_exchangerate,gross_profit_margin,net_profit_margin,debt_asset_ratio
15,AUVI,2020-08-31,Applied UV,"Network 1 Financial Securities,",$5.00,$5.75,$11.60,1.32,$0.75,$6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050094,-0.573314,0.053702
16,HCDI,2020-08-28,Harbor Custom Development,ThinkEquity (a division of Fordham Financial M...,$6.00,$5.50,$7.50,0.25,-$0.50,$1.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079754,-0.055992,0.658664
19,XPEV,2020-08-27,XPeng,Credit Suisse/ J.P. Morgan/ BofA Securities,$15.00,$23.10,$21.22,0.41,$8.10,$6.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045998,-0.577232,0.000000
39,BEKE,2020-08-13,KE Holdings,Goldman Sachs/ Morgan Stanley/ China Renaissance,$20.00,$35.06,$37.44,0.87,$15.06,$17.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213307,0.138040,0.000000
46,IBEX,2020-08-07,IBEX Ltd.,Citigroup/ RBC Capital Markets/ Baird,$19.00,$18.00,$15.40,-0.19,-$1.00,-$3.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315930,-0.037718,0.160104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,PCAP,2005-07-28,Patriot Capital Funding,AG Edwards,$14.00,$14.75,$14.05,0.00,$0.75,$0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.108949,0.000000
2722,EVVV,2005-06-15,ev3,Piper Jaffray/Banc of America,$14.00,$13.50,$14.15,0.01,-$0.50,$0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
2907,SRVY,2004-07-15,Greenfield Online,Lehman Brothers,$13.00,$16.00,$18.70,0.44,$3.00,$5.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.168834,0.000000
2932,LDIS,2004-06-15,Leadis Technology,Goldman Sachs/Merrill Lynch,$14.00,$14.27,$13.10,-0.06,$0.27,-$0.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.150321,0.000000


### Drop rows with N/As

In [13]:
master_df = master_df.dropna(subset=["gross_profit_margin"])
master_df

Unnamed: 0,symbol,trade_date,issuer,lead_jointlead_managers,offer_price,opening_price,firstday_close,firstday_percent_pxchng,dollar_change_opening,dollar_change_close,...,proceeds_fromissuance_commonstock,proceeds_fromissuance_longterm_debtcapital_secnet,proceeds_fromissuance_preferredstock,proceeds_fromrepurchase_equity,proceeds_fromsale_treasurystock,changein_cash_cashequivalents,changein_exchangerate,gross_profit_margin,net_profit_margin,debt_asset_ratio
15,AUVI,2020-08-31,Applied UV,"Network 1 Financial Securities,",$5.00,$5.75,$11.60,1.32,$0.75,$6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050094,-0.573314,0.053702
16,HCDI,2020-08-28,Harbor Custom Development,ThinkEquity (a division of Fordham Financial M...,$6.00,$5.50,$7.50,0.25,-$0.50,$1.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079754,-0.055992,0.658664
19,XPEV,2020-08-27,XPeng,Credit Suisse/ J.P. Morgan/ BofA Securities,$15.00,$23.10,$21.22,0.41,$8.10,$6.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045998,-0.577232,0.000000
39,BEKE,2020-08-13,KE Holdings,Goldman Sachs/ Morgan Stanley/ China Renaissance,$20.00,$35.06,$37.44,0.87,$15.06,$17.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213307,0.138040,0.000000
46,IBEX,2020-08-07,IBEX Ltd.,Citigroup/ RBC Capital Markets/ Baird,$19.00,$18.00,$15.40,-0.19,-$1.00,-$3.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315930,-0.037718,0.160104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2722,EVVV,2005-06-15,ev3,Piper Jaffray/Banc of America,$14.00,$13.50,$14.15,0.01,-$0.50,$0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
2823,ATB,2004-11-04,Arlington Tankers Ltd.,UBS Investment Bank/Jefferies,$20.00,$22.30,$22.04,0.10,$2.30,$2.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.383139,
2907,SRVY,2004-07-15,Greenfield Online,Lehman Brothers,$13.00,$16.00,$18.70,0.44,$3.00,$5.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.168834,0.000000
2932,LDIS,2004-06-15,Leadis Technology,Goldman Sachs/Merrill Lynch,$14.00,$14.27,$13.10,-0.06,$0.27,-$0.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.150321,0.000000


In [15]:
master_df = master_df.dropna(subset=["debt_asset_ratio"])
master_df

Unnamed: 0,symbol,trade_date,issuer,lead_jointlead_managers,offer_price,opening_price,firstday_close,firstday_percent_pxchng,dollar_change_opening,dollar_change_close,...,proceeds_fromissuance_commonstock,proceeds_fromissuance_longterm_debtcapital_secnet,proceeds_fromissuance_preferredstock,proceeds_fromrepurchase_equity,proceeds_fromsale_treasurystock,changein_cash_cashequivalents,changein_exchangerate,gross_profit_margin,net_profit_margin,debt_asset_ratio
15,AUVI,2020-08-31,Applied UV,"Network 1 Financial Securities,",$5.00,$5.75,$11.60,1.32,$0.75,$6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050094,-0.573314,0.053702
16,HCDI,2020-08-28,Harbor Custom Development,ThinkEquity (a division of Fordham Financial M...,$6.00,$5.50,$7.50,0.25,-$0.50,$1.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079754,-0.055992,0.658664
19,XPEV,2020-08-27,XPeng,Credit Suisse/ J.P. Morgan/ BofA Securities,$15.00,$23.10,$21.22,0.41,$8.10,$6.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045998,-0.577232,0.000000
39,BEKE,2020-08-13,KE Holdings,Goldman Sachs/ Morgan Stanley/ China Renaissance,$20.00,$35.06,$37.44,0.87,$15.06,$17.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213307,0.138040,0.000000
46,IBEX,2020-08-07,IBEX Ltd.,Citigroup/ RBC Capital Markets/ Baird,$19.00,$18.00,$15.40,-0.19,-$1.00,-$3.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315930,-0.037718,0.160104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,PCAP,2005-07-28,Patriot Capital Funding,AG Edwards,$14.00,$14.75,$14.05,0.00,$0.75,$0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.108949,0.000000
2722,EVVV,2005-06-15,ev3,Piper Jaffray/Banc of America,$14.00,$13.50,$14.15,0.01,-$0.50,$0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
2907,SRVY,2004-07-15,Greenfield Online,Lehman Brothers,$13.00,$16.00,$18.70,0.44,$3.00,$5.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.168834,0.000000
2932,LDIS,2004-06-15,Leadis Technology,Goldman Sachs/Merrill Lynch,$14.00,$14.27,$13.10,-0.06,$0.27,-$0.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.150321,0.000000


In [28]:
master_df.replace([np.inf, -np.inf], np.nan)

Unnamed: 0,symbol,trade_date,issuer,lead_jointlead_managers,offer_price,opening_price,firstday_close,firstday_percent_pxchng,dollar_change_opening,dollar_change_close,...,proceeds_fromissuance_commonstock,proceeds_fromissuance_longterm_debtcapital_secnet,proceeds_fromissuance_preferredstock,proceeds_fromrepurchase_equity,proceeds_fromsale_treasurystock,changein_cash_cashequivalents,changein_exchangerate,gross_profit_margin,net_profit_margin,debt_asset_ratio
15,AUVI,2020-08-31,Applied UV,"Network 1 Financial Securities,",$5.00,$5.75,$11.60,1.32,$0.75,$6.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050094,-0.573314,0.053702
16,HCDI,2020-08-28,Harbor Custom Development,ThinkEquity (a division of Fordham Financial M...,$6.00,$5.50,$7.50,0.25,-$0.50,$1.50,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079754,-0.055992,0.658664
19,XPEV,2020-08-27,XPeng,Credit Suisse/ J.P. Morgan/ BofA Securities,$15.00,$23.10,$21.22,0.41,$8.10,$6.22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045998,-0.577232,0.000000
39,BEKE,2020-08-13,KE Holdings,Goldman Sachs/ Morgan Stanley/ China Renaissance,$20.00,$35.06,$37.44,0.87,$15.06,$17.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213307,0.138040,0.000000
46,IBEX,2020-08-07,IBEX Ltd.,Citigroup/ RBC Capital Markets/ Baird,$19.00,$18.00,$15.40,-0.19,-$1.00,-$3.60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.315930,-0.037718,0.160104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,PCAP,2005-07-28,Patriot Capital Funding,AG Edwards,$14.00,$14.75,$14.05,0.00,$0.75,$0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.108949,0.000000
2722,EVVV,2005-06-15,ev3,Piper Jaffray/Banc of America,$14.00,$13.50,$14.15,0.01,-$0.50,$0.15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
2907,SRVY,2004-07-15,Greenfield Online,Lehman Brothers,$13.00,$16.00,$18.70,0.44,$3.00,$5.70,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.168834,0.000000
2932,LDIS,2004-06-15,Leadis Technology,Goldman Sachs/Merrill Lynch,$14.00,$14.27,$13.10,-0.06,$0.27,-$0.90,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.150321,0.000000


# Preprocess

In [29]:
IPO_df = master_df[["exchange","sector","industry","price_gain_loss","debt_asset_ratio", "gross_profit_margin","net_profit_margin"]]
IPO_df

Unnamed: 0,exchange,sector,industry,price_gain_loss,debt_asset_ratio,gross_profit_margin,net_profit_margin
15,NASDAQ,Consumer Cyclical,"Furnishings, Fixtures & Appliances",Loss,0.053702,0.050094,-0.573314
16,NASDAQ,Real Estate,Real Estate-Development,,0.658664,0.079754,-0.055992
19,NYSE,Consumer Cyclical,Auto Manufacturers,Gain,0.000000,0.045998,-0.577232
39,NYSE,Real Estate,Real Estate Services,Gain,0.000000,0.213307,0.138040
46,NASDAQ,Technology,Software-Application,,0.160104,0.315930,-0.037718
...,...,...,...,...,...,...,...
2680,NASDAQ,Other,Other,Loss,0.000000,0.000000,0.108949
2722,NASDAQ,Other,Other,Gain,0.000000,0.000000,0.000000
2907,NASDAQ,Other,Other,Gain,0.000000,0.000000,0.168834
2932,NASDAQ,Other,Other,Loss,0.000000,0.000000,0.150321


In [30]:
IPO_df.nunique()

exchange                 7
sector                  12
industry                81
price_gain_loss          2
debt_asset_ratio       143
gross_profit_margin    331
net_profit_margin      444
dtype: int64

# Encoding

In [31]:
# Binary encode
IPO_df_encoded = pd.get_dummies(IPO_df, columns=["exchange","sector","industry","price_gain_loss"])
IPO_df_encoded.head()

Unnamed: 0,debt_asset_ratio,gross_profit_margin,net_profit_margin,exchange_AMEX,exchange_BATS,exchange_NASDAQ,exchange_NYSE,exchange_NYSE MKT,exchange_None,exchange_PINK,...,industry_Staffing & Employment Services,industry_Telecom Services,industry_Travel Services,industry_Trucking,industry_Utilities-Regulated Gas,industry_Utilities-Regulated Water,industry_Utilities-Renewable,industry_Waste Management,price_gain_loss_Gain,price_gain_loss_Loss
15,0.053702,0.050094,-0.573314,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
16,0.658664,0.079754,-0.055992,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0.0,0.045998,-0.577232,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
39,0.0,0.213307,0.13804,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
46,0.160104,0.31593,-0.037718,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Target Feature

In [32]:
y = IPO_df_encoded.price_gain_loss_Gain
X = IPO_df_encoded.drop(columns=["price_gain_loss_Gain","price_gain_loss_Loss"])

In [33]:
y

15      0
16      0
19      1
39      1
46      0
       ..
2680    0
2722    1
2907    1
2932    0
3020    1
Name: price_gain_loss_Gain, Length: 517, dtype: uint8

In [34]:
X

Unnamed: 0,debt_asset_ratio,gross_profit_margin,net_profit_margin,exchange_AMEX,exchange_BATS,exchange_NASDAQ,exchange_NYSE,exchange_NYSE MKT,exchange_None,exchange_PINK,...,industry_Specialty Industrial Machinery,industry_Specialty Retail,industry_Staffing & Employment Services,industry_Telecom Services,industry_Travel Services,industry_Trucking,industry_Utilities-Regulated Gas,industry_Utilities-Regulated Water,industry_Utilities-Renewable,industry_Waste Management
15,0.053702,0.050094,-0.573314,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0.658664,0.079754,-0.055992,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0.000000,0.045998,-0.577232,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39,0.000000,0.213307,0.138040,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,0.160104,0.315930,-0.037718,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2680,0.000000,0.000000,0.108949,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2722,0.000000,0.000000,0.000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2907,0.000000,0.000000,0.168834,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2932,0.000000,0.000000,0.150321,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
import numpy as np

X.describe()

Unnamed: 0,debt_asset_ratio,gross_profit_margin,net_profit_margin,exchange_AMEX,exchange_BATS,exchange_NASDAQ,exchange_NYSE,exchange_NYSE MKT,exchange_None,exchange_PINK,...,industry_Specialty Industrial Machinery,industry_Specialty Retail,industry_Staffing & Employment Services,industry_Telecom Services,industry_Travel Services,industry_Trucking,industry_Utilities-Regulated Gas,industry_Utilities-Regulated Water,industry_Utilities-Renewable,industry_Waste Management
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,...,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,0.176888,,-inf,0.001934,0.001934,0.545455,0.44294,0.003868,0.001934,0.001934,...,0.003868,0.005803,0.001934,0.003868,0.001934,0.001934,0.001934,0.001934,0.001934,0.005803
std,1.229985,,,0.04398,0.04398,0.498412,0.497215,0.062137,0.04398,0.04398,...,0.062137,0.076028,0.04398,0.062137,0.04398,0.04398,0.04398,0.04398,0.04398,0.076028
min,-0.004233,-inf,-inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-0.265776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.313444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.016516,0.719412,0.073274,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,25.93292,inf,68.089193,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split Training and Testing sets

In [36]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: Input contains infinity or a value too large for dtype('float64').

# Logistic Regression
our basic logistic regression parameters, which include:

- The solver parameter is set to 'lbfgs', which is an algorithm for learning and optimization. The particular solver isn't very important in this example, but note that a number of optimizers exist.
- The max_iter parameter will be set to 200 iterations, which will give the model sufficient opportunity to converge on effective weights

In [92]:
from sklearn.linear_model import LogisticRegression

# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.554


# Random Forest

In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.570


# SVM

In [84]:
from sklearn.svm import SVC

# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.438


# Deep Learning

In [85]:
len(X_train_scaled[0])

85

In [86]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  number_input_features*2
hidden_nodes_layer2 = number_input_features

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [87]:
# Train the model 
fit_model = nn.fit(X_train_scaled, y_train, epochs=100) 
# Evaluate the model using the test data 
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78