In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load and Preprocess Data
# Assume you have a CSV file with columns: 'Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic', 'Interested'
# Load dataset
df = pd.read_csv("lyrics_data_music.csv")

# Drop rows with any missing values in the important columns
df = df.dropna(subset=['Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic'])

# Remove stopwords from lyrics
df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['Interested'] = df['Genre'].apply(lambda x: 'Yes' if x in ['Rock', 'country'] else 'No')
# 2. Split Data
X = df['Lyrics']
y = df['Interested']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.0, max_df=1.0, max_features=1000, lowercase=True)
vectorizer.fit(x_train)
x_train = vectorizer.transform(x_train)
x_test = vectorizer.transform(x_test)


X_train = x_train.astype('float32')
y_train = y_train.replace({'No': 0, 'Yes': 1}).astype('float32')
X_test = x_test.astype('float32')
y_test = y_test.replace({'No': 0, 'Yes': 1}).astype('float32')

from sklearn.preprocessing import StandardScaler
sc = StandardScaler(with_mean=False)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Latent Semantic Analysis (LSA) for Topic Reduction with Logistic regression

In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression

# Dimensionality reduction with LSA
lsa = TruncatedSVD(n_components=50, random_state=42)
X_train_reduced = lsa.fit_transform(X_train)
X_test_reduced = lsa.transform(X_test)

# Train a Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train_reduced, y_train)

# Evaluate the model
y_pred = logreg.predict(X_test_reduced)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8
              precision    recall  f1-score   support

         0.0       0.81      0.99      0.89      4552
         1.0       0.44      0.04      0.07      1123

    accuracy                           0.80      5675
   macro avg       0.62      0.51      0.48      5675
weighted avg       0.73      0.80      0.73      5675



In [3]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


### Gradient Boosting for Classification

In [4]:
from xgboost import XGBClassifier

# Train XGBoost Classifier
xgb = XGBClassifier(random_state=42)
xgb.fit(X_train, y_train)

# Predict and evaluate
y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8207929515418502
              precision    recall  f1-score   support

         0.0       0.83      0.98      0.90      4552
         1.0       0.68      0.18      0.28      1123

    accuracy                           0.82      5675
   macro avg       0.75      0.58      0.59      5675
weighted avg       0.80      0.82      0.78      5675



### LSTM

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Download NLTK stopwords if not already downloaded
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load and Preprocess Data
# Assume you have a CSV file with columns: 'Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic', 'Interested'
df = pd.read_csv("lyrics_data_music.csv")

# Drop rows with any missing values in the important columns
df = df.dropna(subset=['Artist', 'Song Title', 'Release Year', 'Genre', 'Lyrics', 'Topic'])

# Remove stopwords from lyrics
df['Lyrics'] = df['Lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
df['Interested'] = df['Genre'].apply(lambda x: 'Yes' if x in ['Rock', 'country'] else 'No')

# Split Data
X = df['Lyrics']
y = df['Interested'].replace({'No': 0, 'Yes': 1})  # Convert labels to binary
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Tokenization and Padding for LSTM
tokenizer = Tokenizer(num_words=5000)  # Consider top 5000 words
tokenizer.fit_on_texts(x_train)

x_train_seq = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=100)
x_test_seq = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=100)

# 3. Build and Compile LSTM Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),  # Embedding layer
    LSTM(128, return_sequences=False),  # LSTM layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 4. Train the Model
model.fit(x_train_seq, y_train, epochs=5, batch_size=32, validation_data=(x_test_seq, y_test))

# 5. Evaluate the Model
loss, accuracy = model.evaluate(x_test_seq, y_test)
print("LSTM Accuracy:", accuracy)

# Optional: Classification Report
y_pred = (model.predict(x_test_seq) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5




[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 176ms/step - accuracy: 0.8090 - loss: 0.4706 - val_accuracy: 0.8021 - val_loss: 0.4410
Epoch 2/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 168ms/step - accuracy: 0.8210 - loss: 0.3813 - val_accuracy: 0.8062 - val_loss: 0.4415
Epoch 3/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 152ms/step - accuracy: 0.8466 - loss: 0.3370 - val_accuracy: 0.7930 - val_loss: 0.4576
Epoch 4/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 145ms/step - accuracy: 0.8750 - loss: 0.2856 - val_accuracy: 0.8028 - val_loss: 0.4760
Epoch 5/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 143ms/step - accuracy: 0.9043 - loss: 0.2384 - val_accuracy: 0.8016 - val_loss: 0.5337
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 58ms/step - accuracy: 0.8010 - loss: 0.5309
LSTM Accuracy: 0.8015859127044678
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━

### Autoencoders for Feature Extraction

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define the Autoencoder
autoencoder = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),  # Bottleneck layer
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    Dense(X_train.shape[1], activation='sigmoid')
])

autoencoder.compile(optimizer='adam', loss='mse')

# Train the Autoencoder
autoencoder.fit(X_train.toarray(), X_train.toarray(), epochs=5, batch_size=32, validation_data=(X_test.toarray(), X_test.toarray()))

# Extract features from the bottleneck layer
encoder = Sequential(autoencoder.layers[:3])
X_train_encoded = encoder.predict(X_train.toarray())
X_test_encoded = encoder.predict(X_test.toarray())

# Train a classifier on the encoded features
logreg = LogisticRegression()
logreg.fit(X_train_encoded, y_train)
y_pred = logreg.predict(X_test_encoded)
print("Accuracy:", accuracy_score(y_test, y_pred))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 24ms/step - loss: 1.0231 - val_loss: 1.1115
Epoch 2/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 1.0079 - val_loss: 1.1115
Epoch 3/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 1.0147 - val_loss: 1.1117
Epoch 4/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 1.0029 - val_loss: 1.0800
Epoch 5/5
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - loss: 0.9855 - val_loss: 1.0690
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step
[1m178/178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Accuracy: 0.8001762114537445


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
