In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import LabelEncoder
from keras.layers import LSTM,Dense,Dropout, Reshape, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.models import Sequential
from keras.layers import Embedding,SimpleRNN
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils

from tensorflow.keras.optimizers import Adam

from gensim.models import Word2Vec
import gensim

import ast
import nltk

In [None]:
from google.cloud import storage
import os
import io
# Download the combined n-gram CSV from Google Cloud Storage and load it into `df`.
# No explicit credentials are passed to the client here.
client = storage.Client()
bucket = client.get_bucket('nlp_final_data')

blob = bucket.blob('songs_combined_ngrams.csv')
# download_as_bytes() replaces the deprecated download_as_string(); both return the raw bytes.
content = blob.download_as_bytes()

# Wrap the bytes in an in-memory buffer so pandas can parse them without a temporary file.
df = pd.read_csv(io.BytesIO(content))

In [None]:
# Display the column labels of the loaded frame (later cells read 'up_to_bigrams' and 'tag').
df.columns

In [None]:
def parse_token_list(raw):
    """Convert one cell of the 'up_to_bigrams' column into a list of string tokens.

    The CSV stores each token list as its Python repr (e.g. "['love', 'my heart']").
    Parsing it with ``ast.literal_eval`` keeps apostrophes and commas that are part
    of a token intact, unlike stripping brackets/quotes and splitting on ', '.

    Parameters
    ----------
    raw : str | list | Any
        The cell value. A list is returned unchanged so that re-running the cell
        is harmless; any other non-string value (e.g. NaN) yields an empty list.

    Returns
    -------
    list[str]
        The tokens in their original order.
    """
    if isinstance(raw, list):
        # Already parsed (the cell was run before) - keep it idempotent.
        return raw
    if not isinstance(raw, str):
        # Missing values are read by pandas as float NaN; treat them as "no tokens".
        return []

    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None

    if not isinstance(parsed, (list, tuple)):
        # Not a valid list literal: fall back to the original best-effort split.
        return raw.strip('[]').replace('\'', '').split(', ')

    tokens = []
    for tok in parsed:
        if isinstance(tok, str):
            tokens.append(tok)
        elif isinstance(tok, (list, tuple)):
            # NOTE(review): n-grams stored as tuples are joined with a space -
            # confirm this matches the intended token format.
            tokens.append(' '.join(map(str, tok)))
        else:
            tokens.append(str(tok))
    return tokens


df['up_to_bigrams'] = df['up_to_bigrams'].apply(parse_token_list)

In [None]:
# Features are the parsed n-gram token lists; the target is the 'tag' column.
X, y = df['up_to_bigrams'], df['tag']

In [None]:
# https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the class
# proportions equal in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Build the vocabulary from the TRAINING split only, so no information about the
# held-out test documents leaks into the model's input encoding.
# Tokens that never occur in training are mapped to the explicit '<OOV>' entry
# instead of being silently dropped from the test sequences.
tokenize = Tokenizer(char_level=False, oov_token='<OOV>')
tokenize.fit_on_texts(X_train)

# we will need this later (as the Embedding layer's input_dim);
# +1 because index 0 is never assigned to a word by the Tokenizer.
num_words = len(tokenize.word_index)+1

In [None]:
# Vocabulary sizes noted by the author on earlier runs, for comparison:
#num_words is 496351 with just tokens, and not cleaned
#num_words is 46207 when data is lemmatized and cleaned
num_words

In [None]:
# Turn every document's token list into its sequence of integer word ids.
x_train, x_test = (
    tokenize.texts_to_sequences(docs) for docs in (X_train, X_test)
)

# Learn the label -> integer mapping on the training labels, then reuse it for test.
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

# Integer labels run 0..k-1, so the largest one plus one is the class count.
num_classes = np.max(y_train) + 1

# One-hot encode both label vectors for the categorical cross-entropy loss.
y_train, y_test = (
    np_utils.to_categorical(labels, num_classes) for labels in (y_train, y_test)
)

In [None]:
# Fixed sequence length fed to the network; pad_sequences pads shorter sequences
# and truncates longer ones to exactly this many ids (using its default settings).
max_log_length = 1024
x_train, x_test = (
    pad_sequences(seqs, maxlen=max_log_length) for seqs in (x_train, x_test)
)

In [None]:
# Architecture: embedding -> stacked 1-D convolutions -> max-pool -> LSTM -> dense head.
num_filters = 128
kernel_sizes = [3,4]

model3 = Sequential([
    # Learn a 64-dimensional vector for each of the num_words vocabulary ids.
    Embedding(input_dim=num_words,output_dim=64,input_length=max_log_length),
    # One convolution per kernel size, applied one after another (not in parallel).
    *(Conv1D(num_filters, width, activation='relu') for width in kernel_sizes),
    # Halve the temporal resolution before the recurrent layer.
    MaxPooling1D(pool_size=2),
    LSTM(units=64,recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One output unit per class; softmax yields a probability distribution.
    Dense(num_classes,activation='softmax'),
])

# compile the model
model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 3 epochs, holding back 10% of the training data for validation;
# the returned History object is kept in `history`.
history = model3.fit(
    x=x_train,
    y=y_train,
    validation_split=0.1,
    epochs=3,
    batch_size=128,
    verbose=1,
)

In [None]:
# Write the trained model to a local file named 'model_bigrams.h5'.
model3.save('model_bigrams.h5')

In [None]:
# Upload the saved model file to the bucket, using the file name as the object name.
blob_name = 'model_bigrams.h5'
# NOTE(review): this new client is not used below - `bucket` comes from the
# data-loading cell and is what the upload goes through. Likely redundant; confirm.
client = storage.Client()

blob = bucket.blob(blob_name)
blob.upload_from_filename(blob_name)

Using convolutional layers in a neural network for text data can be beneficial for several reasons:

Local Pattern Extraction: Convolutional layers can effectively capture local patterns and features in the text data. By applying filters of different sizes, the convolutional operation can detect patterns at various levels of granularity. This allows the model to learn relevant features such as n-grams, word combinations, or other local patterns that are indicative of the text's meaning or sentiment.

Translation Invariance: A convolutional filter is applied identically at every position in the input (translation equivariance), and the pooling that follows makes the resulting features largely insensitive to where a pattern occurs. In the context of text data, this property is useful because the exact position of a particular word or phrase in a document is often not critical for understanding its meaning. Word order inside each filter window still matters, so the model stays sensitive to local phrasing while being robust to where that phrase appears.

Reduced Parameter Count: Convolutional layers can help reduce the number of parameters in the model compared to fully connected layers. This reduction is achieved by weight sharing through the use of filters. By sharing weights, the model can capture the same pattern or feature across different positions in the input, resulting in fewer trainable parameters. This parameter efficiency can make the model easier to train and less prone to overfitting, especially when dealing with limited amounts of text data.

Hierarchical Feature Learning: Deep architectures with multiple convolutional layers can learn hierarchical representations of the input text. Lower-level convolutional layers can capture basic local features, while higher-level convolutional layers can learn more complex combinations of these features. This hierarchical learning enables the model to capture both low-level and high-level semantic information from the text.

It's worth noting that while convolutional neural networks (CNNs) have primarily been associated with image processing tasks, they have been successfully adapted for natural language processing (NLP) tasks, including text classification. The convolutional operations in text CNNs are typically performed along the time dimension (i.e., word or character sequences) rather than across spatial dimensions (as in image CNNs).

That said, the effectiveness of convolutional layers in text classification varies with the specific dataset and problem. It is worth experimenting with different architectures and comparing their performance against other approaches, such as recurrent neural networks (RNNs) or transformers, to determine the best choice for a particular task.