In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load and Preprocess Data
# Assume you have a CSV file with columns: 'Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic', 'Interested'
# Load dataset
df = pd.read_csv("lyrics_data_music.csv")

# Drop rows with any missing values in the important columns
df = df.dropna(subset=['Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic'])

# Remove stopwords from lyrics
df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['Interested'] = df['Genre'].apply(lambda x: 'Yes' if x in ['pop', 'country', 'blues', 'jazz'] else 'No')
# 2. Split Data
X = df['Lyrics']
y = df['Interested']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.0, max_df=1.0, max_features=1000, lowercase=True)
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)


X_train = x_train.astype('float32')
y_train = y_train.replace({'No': 0, 'Yes': 1}).astype('float32')
X_test = x_test.astype('float32')
y_test = y_test.replace({'No': 0, 'Yes': 1}).astype('float32')

from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Latent Semantic Analysis (LSA) for Topic Reduction with Logistic regression

In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

# Dimensionality reduction with LSA
lsa = TruncatedSVD(n_components=50, random_state=42)
X_train_reduced = lsa.fit_transform(X_train)
X_test_reduced = lsa.transform(X_test)

# Train a Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_reduced, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test_reduced)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7682819383259912
              precision    recall  f1-score   support

         0.0       0.67      0.24      0.36      1497
         1.0       0.78      0.96      0.86      4178

    accuracy                           0.77      5675
   macro avg       0.72      0.60      0.61      5675
weighted avg       0.75      0.77      0.73      5675



In [3]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


### Gradient Boosting for Classification

In [4]:
from xgboost import XGBClassifier

# Train XGBoost Classifier
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7707488986784141
              precision    recall  f1-score   support

         0.0       0.65      0.28      0.39      1497
         1.0       0.79      0.95      0.86      4178

    accuracy                           0.77      5675
   macro avg       0.72      0.61      0.63      5675
weighted avg       0.75      0.77      0.74      5675



### LSTM

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load and Preprocess Data
# Assume you have a CSV file with columns: 'Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic', 'Interested'
df = pd.read_csv("lyrics_data_music.csv")

# Drop rows with any missing values in the important columns
df = df.dropna(subset=['Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic'])

# Remove stopwords from lyrics
df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['Interested'] = df['Genre'].apply(lambda x: 'Yes' if x in ['pop', 'country','blues','jazz'] else 'No')

# Split Data
X = df['Lyrics']
y = df['Interested'].replace({'No': 0, 'Yes': 1})  # Convert labels to binary
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Tokenization and Padding for LSTM
tokenizer = Tokenizer(num_words=5000)  # Consider top 5000 words
tokenizer.fit_on_texts(x_train)

x_train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=100)
x_test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=100)

# 3. Build and Compile LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),  # Embedding layer
    LSTM(128, return_sequences=False),  # LSTM layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 4. Train the Model
model.fit(x_train_seq, y_train, epochs=5, batch_size=32, validation_data=(x_test_seq, y_test))

# 5. Evaluate the Model
loss, accuracy = model.evaluate(x_test_seq, y_test)
print("LSTM Accuracy:", accuracy)

# Optional: Classification Report
y_pred = (model.predict(x_test_seq) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 143ms/step - accuracy: 0.7564 - loss: 0.5415 - val_accuracy: 0.7667 - val_loss: 0.5123
Epoch 2/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 153ms/step - accuracy: 0.8030 - loss: 0.4516 - val_accuracy: 0.7669 - val_loss: 0.5158
Epoch 3/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 176ms/step - accuracy: 0.8254 - loss: 0.4141 - val_accuracy: 0.7568 - val_loss: 0.5446
Epoch 4/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 155ms/step - accuracy: 0.8564 - loss: 0.3492 - val_accuracy: 0.7413 - val_loss: 0.5902
Epoch 5/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 161ms/step - accuracy: 0.8844 - loss: 0.2911 - val_accuracy: 0.7464 - val_loss: 0.6768
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.7524 - loss: 0.6743
LSTM Accuracy: 0.7464317083358765
[1m178/178[0m [32m━━━━━━━━━

### Autoencoders for Feature Extraction

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the Autoencoder
autoencoder = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),  # Bottleneck layer
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(X_train.shape[1], activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

# Train the Autoencoder
autoencoder.fit(X_train.toarray(), X_train.toarray(), epochs=5, batch_size=32, validation_data=(X_test.toarray(), X_test.toarray()))

# Extract features from the bottleneck layer
encoder = Sequential(autoencoder.layers[:3])
X_train_encoded = encoder.predict(X_train.toarray())
X_test_encoded = encoder.predict(X_test.toarray())

# Train a classifier on the encoded features
logreg = LogisticRegression()
logreg.fit(X_train_encoded, y_train)
y_pred = logreg.predict(X_test_encoded)
print("Accuracy:", accuracy_score(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 31ms/step - loss: 1.0332 - val_loss: 1.0792
Epoch 2/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 19ms/step - loss: 0.9702 - val_loss: 1.0578
Epoch 3/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - loss: 0.9553 - val_loss: 1.0517
Epoch 4/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - loss: 0.9327 - val_loss: 1.0490
Epoch 5/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 19ms/step - loss: 0.9491 - val_loss: 1.0472
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 0.7429074889867842


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
