## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn

## Extracting Data and Removing Correlated Variables

In [2]:
name_list = ['id','avg_len','tot_len','avg_rate','tot_rate','review','rating','completion','min_listened','support_req','lv','target']
data = pd.read_csv('Audiobooks_data.csv',header = None, names = name_list)
data.drop('id',axis=1,inplace = True)

In [3]:
data['units_purchased'] = (data['tot_len']/data['avg_len']).astype(int)

In [4]:
data.drop(['avg_len','avg_rate','min_listened'],axis=1,inplace =True)

In [5]:
from sklearn.utils import shuffle
data = shuffle(data).reset_index().drop('index',axis=1)
data

Unnamed: 0,tot_len,tot_rate,review,rating,completion,support_req,lv,target,units_purchased
0,2160,5.33,0,8.91,0.00,0,168,0,1
1,1188,5.33,0,8.91,0.00,0,0,0,1
2,2160,8.00,1,9.00,0.16,0,199,0,1
3,4428,16.00,0,8.91,0.00,0,0,1,3
4,1620,10.13,0,8.91,0.00,0,0,1,1
...,...,...,...,...,...,...,...,...,...
14079,2160,5.33,0,8.91,0.00,0,0,0,1
14080,2160,7.99,0,8.91,0.00,0,0,1,1
14081,2160,6.13,0,8.91,0.26,0,192,0,1
14082,1620,5.61,0,8.91,0.52,0,60,0,1


## Dataset Balancing

In [6]:
ones = np.sum(data['target'])
zeros = 0
removal_index = []
for i,r in data.iterrows():
    if r['target'] == 0:
        zeros += 1
        if zeros > ones:
            removal_index.append(i)

data.drop(data.index[removal_index], inplace = True)
data.describe()

Unnamed: 0,tot_len,tot_rate,review,rating,completion,support_req,lv,target,units_purchased
count,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0
mean,1833.127403,8.509611,0.164953,8.909725,0.075311,0.079124,70.434287,0.5,1.200268
std,874.874972,6.570098,0.37118,0.714516,0.197712,0.404532,92.28937,0.500056,0.561162
min,216.0,3.86,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,1188.0,5.33,0.0,8.91,0.0,0.0,0.0,0.0,1.0
50%,1620.0,6.735,0.0,8.91,0.0,0.0,21.0,0.5,1.0
75%,2160.0,8.61,0.0,8.91,0.0,0.0,127.0,1.0,1.0
max,7020.0,111.47,1.0,10.0,1.0,8.0,374.0,1.0,6.0


## Feature Scaling and splitting into Train-Test-Validate

In [7]:
data = shuffle(data).reset_index().drop('index',axis=1)

In [8]:
from mlxtend.preprocessing import standardize
feature_list = ['tot_len','tot_rate','review','rating','completion','support_req','lv','units_purchased']
targets = data['target']
features = standardize(data, columns = feature_list)
features.describe()

Unnamed: 0,tot_len,tot_rate,review,rating,completion,support_req,lv,units_purchased
count,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0,4474.0
mean,1.661985e-16,-5.021443e-14,2.787717e-16,-5.559769e-13,6.883719e-16,3.876227e-16,-1.040989e-16,2.792928e-16
std,1.000112,1.000112,1.000112,1.000112,1.000112,1.000112,1.000112,1.000112
min,-1.848616,-0.707772,-0.4444519,-11.07129,-0.380954,-0.1956152,-0.7632749,-0.3569212
25%,-0.7374762,-0.484006,-0.4444519,0.0003848095,-0.380954,-0.1956152,-0.7632749,-0.3569212
50%,-0.2436362,-0.2701344,-0.4444519,0.0003848095,-0.380954,-0.1956152,-0.5357043,-0.3569212
75%,0.3736638,0.01528138,-0.4444519,0.0003848095,-0.380954,-0.1956152,0.6129854,-0.3569212
max,5.929364,15.67281,2.249962,1.526064,4.677478,19.58252,3.289649,8.554159


In [9]:
test_val_size = int(0.1* features.shape[0])
train_size = features.shape[0] - 2 * test_val_size


test_x, test_y = features[0 : test_val_size], targets[0 : test_val_size]
val_x, val_y = features[test_val_size : 2*test_val_size], targets[test_val_size : 2*test_val_size]
train_x, train_y = features[2*test_val_size :],targets[2*test_val_size :]

# Building and Testing Model


In [16]:
input_size = 8
output_size = 2
hidden_layer_size = 50

batch_size = 100

max_epochs = 100

model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    
    tf.keras.layers.Dense(output_size, activation='softmax') 
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_x, 
          train_y, 
          batch_size=batch_size, 
          epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(val_x, val_y),
          verbose = 2
          )  


Epoch 1/100
36/36 - 0s - loss: 0.5910 - accuracy: 0.6899 - val_loss: 0.5011 - val_accuracy: 0.7740
Epoch 2/100
36/36 - 0s - loss: 0.4837 - accuracy: 0.7595 - val_loss: 0.4517 - val_accuracy: 0.7785
Epoch 3/100
36/36 - 0s - loss: 0.4341 - accuracy: 0.7818 - val_loss: 0.4159 - val_accuracy: 0.8031
Epoch 4/100
36/36 - 0s - loss: 0.4088 - accuracy: 0.7891 - val_loss: 0.3993 - val_accuracy: 0.8076
Epoch 5/100
36/36 - 0s - loss: 0.3963 - accuracy: 0.7908 - val_loss: 0.3904 - val_accuracy: 0.8121
Epoch 6/100
36/36 - 0s - loss: 0.3876 - accuracy: 0.7986 - val_loss: 0.3824 - val_accuracy: 0.8076
Epoch 7/100
36/36 - 0s - loss: 0.3797 - accuracy: 0.8014 - val_loss: 0.3833 - val_accuracy: 0.8143
Epoch 8/100
36/36 - 0s - loss: 0.3729 - accuracy: 0.8089 - val_loss: 0.3726 - val_accuracy: 0.8121
Epoch 9/100
36/36 - 0s - loss: 0.3685 - accuracy: 0.8025 - val_loss: 0.3680 - val_accuracy: 0.8121
Epoch 10/100
36/36 - 0s - loss: 0.3635 - accuracy: 0.8089 - val_loss: 0.3709 - val_accuracy: 0.8098
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x1be7726c0d0>

In [17]:
model.evaluate(test_x, test_y)



[0.3768601417541504, 0.8031319975852966]