<a href="https://colab.research.google.com/github/scoopcash/jan10-Deep-Neural-Networks/blob/main/Colab_multiclass_in_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiclass Prediction in Deep Learning Models

## Read in the Data

In [1]:
# Import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation


%matplotlib inline

In [2]:
# Upload the data file from the local machine into the Colab runtime.
# `uploaded` keeps whatever files.upload() returns.
# NOTE(review): the output below shows the file being saved as
# 'meet_or_beat (1).csv', so a 'meet_or_beat.csv' presumably already existed
# in the working directory — confirm the later read uses the intended copy.
from google.colab import files
uploaded = files.upload()

Saving meet_or_beat.csv to meet_or_beat (1).csv


In [3]:
# Read the data into a DataFrame.
# Prefer the bytes returned by the upload cell when they are available under
# this name: a repeated upload is saved to disk under a suffixed name
# (e.g. 'meet_or_beat (1).csv'), so reading 'meet_or_beat.csv' from disk
# could silently load an older copy. Otherwise fall back to the file on disk.
import io

DATA_FILE = 'meet_or_beat.csv'
if DATA_FILE in uploaded:
    df = pd.read_csv(io.BytesIO(uploaded[DATA_FILE]))
else:
    df = pd.read_csv(DATA_FILE)

In [4]:
# Check dataset size: df.info() reports the row count, each column's non-null
# count and dtype, and the memory footprint
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71968 entries, 0 to 71967
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   EPS                   71968 non-null  float64
 1   forecasted_eps        71968 non-null  float64
 2   noOfEsts              71968 non-null  float64
 3   after_total_returns   71968 non-null  float64
 4   before_total_returns  71968 non-null  float64
dtypes: float64(5)
memory usage: 2.7 MB


In [5]:
# Preview the first rows of the dataframe
df.head()

Unnamed: 0,EPS,forecasted_eps,noOfEsts,after_total_returns,before_total_returns
0,2.01,1.67,11.0,0.051444,0.018585
1,0.17,0.19,6.0,0.112955,-0.00051
2,-0.07,0.14,4.0,0.077167,-0.046104
3,0.48,0.51,8.0,-0.00613,-0.004899
4,-0.24,-0.27,9.0,0.089762,-0.025466


## Preparing the Data

In [6]:
# Generate the categorical outcome variable.
# The column starts out as object dtype so the string labels can be stored
# directly; assigning strings into a float64 (NaN-initialised) column relies
# on silent upcasting, which recent pandas versions deprecate.
# Rows where none of the three comparisons holds (e.g. a missing EPS) stay NaN.
df['earnings_outcome'] = pd.Series(np.nan, index=df.index, dtype='object')
df.loc[(df['EPS'] == df['forecasted_eps']), 'earnings_outcome'] = 'meet'
df.loc[(df['EPS'] > df['forecasted_eps']), 'earnings_outcome'] = 'beat'
df.loc[(df['EPS'] < df['forecasted_eps']), 'earnings_outcome'] = 'lose'

In [7]:
# Take the outcome labels ('meet' / 'beat' / 'lose') as the target series
# and display it
y = df['earnings_outcome']
y

0        beat
1        lose
2        lose
3        lose
4        beat
         ... 
71963    beat
71964    beat
71965    lose
71966    beat
71967    meet
Name: earnings_outcome, Length: 71968, dtype: object

In [8]:
# Map each earnings label to an integer code
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary and encode in one pass; the fitted encoder is
# kept so predicted class indices can later be mapped back to label strings
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)

encoded_y

array([0, 1, 1, ..., 1, 0, 2])

In [9]:
# Count the distinct label values (a NaN, if present, counts as one value)
# and keep the total for sizing the encoding and the output layer
number_classes = y.nunique(dropna=False)
number_classes

3

In [10]:
# Convert the integer labels to a one-hot ("categorical") matrix for Keras.
# Import from tensorflow.keras, like the model classes imported at the top of
# the notebook: the private keras.utils.np_utils module is not available in
# newer Keras releases, so that import path fails there.
from tensorflow.keras.utils import to_categorical

y_categorical = to_categorical(encoded_y, num_classes=number_classes)

In [11]:
# Specify X (predictor) variables.
# EPS is left out because the label is derived from it (EPS vs forecasted_eps);
# after_total_returns is also left out — presumably because it is only known
# after the announcement (TODO confirm).
X=df[['forecasted_eps', 'before_total_returns', 'noOfEsts']]
X

Unnamed: 0,forecasted_eps,before_total_returns,noOfEsts
0,1.67,0.018585,11.0
1,0.19,-0.000510,6.0
2,0.14,-0.046104,4.0
3,0.51,-0.004899,8.0
4,-0.27,-0.025466,9.0
...,...,...,...
71963,0.30,0.016854,4.0
71964,-0.66,-0.039052,3.0
71965,0.28,0.141599,3.0
71966,0.10,0.026346,4.0


In [12]:
from sklearn.model_selection import train_test_split

# Randomly hold out a quarter of the rows for testing (the train_test_split
# default, spelled out here); random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    test_size=0.25,
    random_state=1,
)

In [13]:
# Preview the encoded data we're trying to predict (first few one-hot rows)
y_train[:5]


In [14]:
# Check for class balance: column-wise totals of the one-hot matrix give the
# number of training rows per encoded class
pd.DataFrame(data=y_train).sum(axis=0)

0    32924.0
1    16878.0
2     4174.0
dtype: float32

In [15]:
# Number of input features (one per column of X); used to size the
# model's input
number_predictor_varibles = X.shape[1]
number_predictor_varibles

3

## Building, Fitting and Predicting with Multiple Classes

In [16]:
# Build the neural network layers.
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation (and later random steps) can be repeated on a fresh
# Restart & Run All instead of changing from run to run.
from tensorflow.keras.utils import set_random_seed

set_random_seed(1)

model = Sequential()
# First hidden layer: 9 ReLU units fed by the predictor columns
model.add(
    Dense(units=9, input_dim=number_predictor_varibles, activation='relu')
)
# Second hidden layer: 6 ReLU units
model.add(
    Dense(units=6, activation='relu')
)

In [17]:
# Output layer: one softmax unit per class, so every prediction row is a
# probability distribution over the classes
model.add(Dense(units=number_classes, activation='softmax'))

In [18]:
# Compile with the multi-class setup: one-hot targets are scored with
# categorical cross-entropy and tracked with categorical accuracy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

In [19]:
# Summarise the structure of the model (layers, output shapes, parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 9)                 36        
                                                                 
 dense_1 (Dense)             (None, 6)                 60        
                                                                 
 dense_2 (Dense)             (None, 3)                 21        
                                                                 
Total params: 117
Trainable params: 117
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Train for 20 epochs in mini-batches of 1000 rows, with shuffling enabled
model.fit(
    x=X_train,
    y=y_train,
    batch_size=1000,
    epochs=20,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe5931b0700>

In [21]:
# Evaluate model on the test data.
# The result lists the loss followed by the metric given to compile()
# (categorical_crossentropy, then categorical_accuracy).
# NOTE(review): the accuracy shown below (~0.61) is close to the share of the
# largest class in y_train (32924 / 53976 ≈ 0.61), so compare it against a
# majority-class baseline before reading much into it.
model.evaluate(X_test, y_test)



[0.8387475609779358, 0.6142730116844177]

In [22]:
# Save predictions on the test data: one row per test observation, one
# softmax probability per class
predictions = model.predict(X_test)
predictions



array([[0.57413596, 0.3374874 , 0.08837657],
       [0.7590251 , 0.17012441, 0.07085055],
       [0.6024984 , 0.309841  , 0.08766064],
       ...,
       [0.6555149 , 0.26915267, 0.07533241],
       [0.5701543 , 0.34655383, 0.08329184],
       [0.5125253 , 0.41276538, 0.07470932]], dtype=float32)

In [23]:
# Pick, for every observation, the class index with the highest probability
most_likely = predictions.argmax(axis=1)
most_likely

array([0, 0, 0, ..., 0, 0, 0])

In [24]:
# Convert most likely category back to original labels.
# The class indices are taken from `predictions` rather than from the current
# value of `most_likely`, so this cell can be re-run safely: feeding the
# already-decoded label strings back into inverse_transform would fail.
most_likely = encoder.inverse_transform(np.argmax(predictions, axis=1))

In [26]:
# Evaluate prediction balance: count how many test observations were
# assigned to each predicted label
pd.DataFrame(most_likely).value_counts()

beat    17733
lose      259
dtype: int64