In [1]:
import numpy as np
import pandas as pd
import os, re, time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
from google.cloud import storage
import os
import io
client = storage.Client()
bucket = client.get_bucket('nlp_final_data')

blob = bucket.blob('songs_nGrams.csv')
content = blob.download_as_string()

df = pd.read_csv(io.BytesIO(content))

In [3]:
df['1-grams'] = df['1-grams'].apply(lambda x: x.strip('[]').replace('\'', '').split(', '))

In [4]:
df['songs'] = df['1-grams'].apply(lambda x: ' '.join(x))

In [5]:
textVar = df['songs']
targetVar = df['tag']

In [6]:
# make our experiments repeatable
np.random.seed(0)

# Model Hyperparameters
embedding_dim = 100

# Training parameters. Here we specify the training of the net will use 64 examples for each backprop iteration
batch_size = 128
# We will go thru the entire data set 20 times
num_epochs = 20

# Prepossessing parameters
# We will only input the first "sequence_length" words of every Yelp review (and pad out with nulls if the text is < "sequence_length
# We will also build our model using the most frequent 20000 words in our tweet 'dictionary'
sequence_length = 200
max_features = 2000

#Specify the number of classes to predict (1 for binary classification or count unique values for multilabel classification)
# num_classes = 1
num_classes = targetVar.nunique()

In [7]:
X = textVar.values
Y = pd.get_dummies(targetVar).values

In [8]:
train_samples, val_samples, train_labels, val_labels = train_test_split(X,Y, stratify=targetVar, test_size = 0.33, random_state = 1010)

In [9]:
vectorizer = TextVectorization(max_tokens=max_features, output_sequence_length=sequence_length, ngrams=None, pad_to_max_tokens=True)

text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

2023-05-19 19:50:58.850525: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-05-19 19:50:58.850574: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-05-19 19:50:58.850611: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (nlpfinal): /proc/driver/nvidia/version does not exist
2023-05-19 19:50:58.852161: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
X_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
X_test = vectorizer(np.array([[s] for s in val_samples])).numpy()

Y_train = np.array(train_labels)
Y_test = np.array(val_labels)

In [12]:
embeddings_index = {}

glove_dir = 'Data/'
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf8")

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 41545 word vectors.


In [13]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [14]:
num_tokens = len(voc) + 2
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 1950 words (50 misses)


In [15]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [24]:
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)
preds = layers.Dense((num_classes), activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

model.compile(
    loss="categorical_crossentropy", optimizer="rmsprop", metrics=["acc"]
)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         200200    
                                                                 
 conv1d_3 (Conv1D)           (None, None, 128)         64128     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, None, 128)        0         
 1D)                                                             
                                                                 
 conv1d_4 (Conv1D)           (None, None, 128)         82048     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, None, 128)        0         
 1D)                                                       

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

In [25]:
#! pip install graphviz
SVG(model_to_dot(model, show_shapes=True, dpi=65).create(prog='dot', format='svg'))

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


AttributeError: 'NoneType' object has no attribute 'create'

In [28]:
history = model.fit(X_train, Y_train,
                    batch_size=batch_size, 
                    epochs=10,
                    validation_data=(X_test, Y_test),
                    verbose=2)

Epoch 1/10
2268/2268 - 98s - loss: 1.1859 - acc: 0.5146 - val_loss: 1.1368 - val_acc: 0.5361 - 98s/epoch - 43ms/step
Epoch 2/10
2268/2268 - 95s - loss: 1.1193 - acc: 0.5448 - val_loss: 1.1322 - val_acc: 0.5381 - 95s/epoch - 42ms/step
Epoch 3/10
2268/2268 - 95s - loss: 1.0944 - acc: 0.5554 - val_loss: 1.0993 - val_acc: 0.5493 - 95s/epoch - 42ms/step
Epoch 4/10
2268/2268 - 95s - loss: 1.0782 - acc: 0.5633 - val_loss: 1.1089 - val_acc: 0.5509 - 95s/epoch - 42ms/step
Epoch 5/10
2268/2268 - 94s - loss: 1.0668 - acc: 0.5671 - val_loss: 1.0983 - val_acc: 0.5564 - 94s/epoch - 42ms/step
Epoch 6/10
2268/2268 - 95s - loss: 1.0582 - acc: 0.5711 - val_loss: 1.1045 - val_acc: 0.5554 - 95s/epoch - 42ms/step
Epoch 7/10
2268/2268 - 95s - loss: 1.0499 - acc: 0.5735 - val_loss: 1.1243 - val_acc: 0.5464 - 95s/epoch - 42ms/step
Epoch 8/10
2268/2268 - 94s - loss: 1.0443 - acc: 0.5761 - val_loss: 1.2799 - val_acc: 0.5063 - 94s/epoch - 42ms/step
Epoch 9/10
2268/2268 - 97s - loss: 1.0374 - acc: 0.5795 - val_lo

In [29]:
model.save('model_CNN_Test.h5')

In [30]:
client = storage.Client()

blob = bucket.blob('model_CNN_Test.h5')
blob.upload_from_filename('model_CNN_Test.h5')