In [2]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# Load the raw lyric dataset from CSV
file_path = '../data/lyric.csv'
df = pd.read_csv(file_path)

# Name of the column that holds the lyric text
lyrics_column = 'lyric'

# Build the stopword set: one entry per line of the stopword file
with open('vietnamese-stopwords.txt', mode='r', encoding='utf-8') as file:
    stop_words = {entry for entry in file.read().splitlines()}

def clean_text(text, stopwords=None):
    """Normalize a lyric string and remove stopword tokens.

    The text is NFC-normalized, punctuation is replaced by spaces, digit
    runs are dropped, the result is lowercased, and every
    whitespace-separated token found in the stopword collection is removed.

    Args:
        text: Raw lyric text. Any non-string value (e.g. None or NaN)
            produces an empty string instead of raising inside ``re.sub``.
        stopwords: Optional collection of tokens to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, with the surviving tokens joined by single spaces.
    """
    import unicodedata

    # Guard first: every step below requires a real string.
    if not isinstance(text, str):
        return ''
    if stopwords is None:
        stopwords = stop_words

    # Compose decomposed characters so that combining tone marks are not
    # stripped by the punctuation regex (they are not matched by \w).
    text = unicodedata.normalize('NFC', text)

    # 1. Replace punctuation/symbols with spaces and drop digit runs
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # 2. Lowercase
    text = text.lower()

    # 3. Drop stopwords; split() also collapses any run of whitespace
    return ' '.join(word for word in text.split() if word not in stopwords)

# Clean every lyric; casting to str first turns missing values into text
df[lyrics_column] = df[lyrics_column].astype(str).map(clean_text)

# Give the four columns fixed names, then discard the original index column
df.columns = ['index', 'name_song', 'lyric', 'label']
df = df.drop(columns=['index'])

# Persist the cleaned dataset
processed_path = '../data/processed_data.csv'
df.to_csv(processed_path, index=False)

# NOTE(review): the vectorizer and the SVD below are fit on the full dataset,
# while the train/test split is done later from the exported CSV; the test
# rows therefore influence the features. Confirm this is intended.

# TF-IDF features, limited to the 5000 highest-ranked terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(df['lyric'])

# Persist the fitted vectorizer so inference can reuse the same vocabulary
tfidf_vectorizer_path = '../model/tfidf_vectorizer.pkl'
joblib.dump(tfidf_vectorizer, tfidf_vectorizer_path)
print(f"TF-IDF Vectorizer đã được lưu tại: {tfidf_vectorizer_path}")

print("Shape of TF-IDF matrix:", X_tfidf.shape)

# LSA: project the TF-IDF matrix onto `num_topics` SVD components
num_topics = 25
lsa = TruncatedSVD(n_components=num_topics, random_state=42)
X_lsa = lsa.fit_transform(X_tfidf)

# Persist the fitted LSA model
lsa_model_path = '../model/lsa_model.pkl'
joblib.dump(lsa, lsa_model_path)
print(f"LSA Model đã được lưu tại: {lsa_model_path}")

print("Shape of LSA-transformed matrix:", X_lsa.shape)
print("Explained variance ratio:", lsa.explained_variance_ratio_.sum())

# One column per component (Topic1..TopicN) plus the label column
topic_columns = [f"Topic{k}" for k in range(1, num_topics + 1)]
df_tfidf_lsa = pd.DataFrame(X_lsa, columns=topic_columns)
df_tfidf_lsa['label'] = df['label']

# Export the reduced feature table
output_path = '../data/tfidf_lsa_lyric.csv'
df_tfidf_lsa.to_csv(output_path, index=False)
print(f"Dữ liệu TF-IDF và LSA đã được lưu tại: {output_path}")

TF-IDF Vectorizer đã được lưu tại: tfidf_vectorizer.pkl
Shape of TF-IDF matrix: (1101, 5000)
LSA Model đã được lưu tại: lsa_model.pkl
Shape of LSA-transformed matrix: (1101, 25)
Explained variance ratio: 0.1834282961418222
Dữ liệu TF-IDF và LSA đã được lưu tại: tfidf_lsa_lyric.csv


In [32]:
import numpy as np
import pandas as pd

def stratified_train_test_split(data, label_col, test_size=0.2, random_state=None,
                                train_path='../data/train_data.csv',
                                test_path='../data/test_data.csv'):
    """Split a DataFrame into train/test parts, stratified by a label column.

    For each distinct label, the rows carrying it are shuffled and the first
    ``int(n_label * test_size)`` of them go to the test set; the remainder
    goes to the train set. Rows whose label is NaN end up in neither set.

    Args:
        data: DataFrame to split. Rows are selected by index label, so the
            index is expected to be unique.
        label_col: Name of the column holding the class labels.
        test_size: Fraction (0..1) of each label's rows to put in the test set.
        random_state: Seed for the shuffles. A private ``RandomState`` is used,
            which yields the same permutation as ``np.random.seed(seed)``
            followed by ``np.random.shuffle`` but leaves the global RNG
            untouched. When None, the global NumPy RNG is used.
        train_path: CSV path for the train split; None skips writing.
        test_path: CSV path for the test split; None skips writing.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames.

    Raises:
        ValueError: If ``label_col`` is missing or ``test_size`` is outside
            the range [0, 1].
    """
    if label_col not in data.columns:
        raise ValueError(f"Column '{label_col}' does not exist in the data.")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}.")

    # Local generator: seeding must not reset the process-wide NumPy RNG.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_indices = []
    test_indices = []

    # value_counts() orders labels by frequency; keeping that iteration order
    # keeps the shuffles (and therefore the split) stable for a given seed.
    for label in data[label_col].value_counts().index:
        label_indices = data.index[data[label_col] == label].tolist()
        test_count = int(len(label_indices) * test_size)

        rng.shuffle(label_indices)

        test_indices += label_indices[:test_count]
        train_indices += label_indices[test_count:]

    train_data = data.loc[train_indices]
    test_data = data.loc[test_indices]

    # Persist the splits unless the caller opted out with None
    if train_path is not None:
        train_data.to_csv(train_path, index=False)
    if test_path is not None:
        test_data.to_csv(test_path, index=False)

    return train_data, test_data

# Read the TF-IDF + LSA feature table exported earlier
file_path = '../data/tfidf_lsa_lyric.csv'  # File containing the input data

try:
    df_tfidf_lsa = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Fail loudly with the offending path: calling exit() inside a notebook
    # hides the traceback and is not meant for program control flow.
    raise FileNotFoundError(
        f"File does not exist. Please check the path: {file_path}"
    ) from err

# Stratified 80/20 split on the 'label' column with a fixed seed
train_df, test_df = stratified_train_test_split(
    data=df_tfidf_lsa,
    label_col='label',  # Ensure the 'label' column exists
    test_size=0.2,
    random_state=48
)


In [34]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

import joblib

# Load the persisted train/test splits and separate features from labels
data_train = pd.read_csv('../data/train_data.csv')
data_test = pd.read_csv('../data/test_data.csv')
X_train, y_train = data_train.drop(columns=['label']), data_train['label']
X_test, y_test = data_test.drop(columns=['label']), data_test['label']

# Hyper-parameter grid explored by the search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
    max_features=['sqrt', 'log2', None],
)

# Base estimator with a fixed seed
rf = RandomForestClassifier(random_state=42)

# Exhaustive search: 3-fold CV scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print("Best parameters found:", grid_search.best_params_)
print("Best accuracy score:", grid_search.best_score_)

# Score the best estimator on the held-out test split
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Persist the best estimator
model_path = '../model/best_randomforest_model.pkl'
joblib.dump(best_rf, model_path)
print(f"The best model has been saved at: {model_path}")


Fitting 3 folds for each of 648 candidates, totalling 1944 fits
Best parameters found: {'bootstrap': True, 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best accuracy score: 0.6326530612244898
Test Accuracy: 0.6986301369863014
Classification Report:
              precision    recall  f1-score   support

        buồn       0.73      0.80      0.76        83
         vui       0.69      0.74      0.71        61
  yêu thương       0.67      0.56      0.61        75

    accuracy                           0.70       219
   macro avg       0.69      0.70      0.69       219
weighted avg       0.70      0.70      0.69       219

Mô hình tốt nhất đã được lưu tại: best_randomforest_model.pkl


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import joblib

# Load the persisted train/test splits and separate features from labels
data_train = pd.read_csv('../data/train_data.csv')
data_test = pd.read_csv('../data/test_data.csv')
X_train, y_train = data_train.drop(columns=['label']), data_train['label']
X_test, y_test = data_test.drop(columns=['label']), data_test['label']

# Fit a random forest with hand-picked hyper-parameters
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    bootstrap=True,
    random_state=44,
)
rf_model.fit(X_train, y_train)

# Predict on the held-out split and report the metrics
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Persist the fitted model
model_path = '../model/random_forest_model.pkl'
joblib.dump(rf_model, model_path)
print(f"The model has been saved at: {model_path}")


Accuracy: 0.730593607305936
Classification Report:
              precision    recall  f1-score   support

        buồn       0.73      0.86      0.79        83
         vui       0.72      0.75      0.74        61
  yêu thương       0.74      0.57      0.65        75

    accuracy                           0.73       219
   macro avg       0.73      0.73      0.72       219
weighted avg       0.73      0.73      0.73       219

Mô hình đã được lưu tại: random_forest_model.pkl


In [67]:
import joblib
import pandas as pd
import re


# Build the stopword set: one entry per line of the stopword file
with open('vietnamese-stopwords.txt', mode='r', encoding='utf-8') as file:
    stop_words = {entry for entry in file.read().splitlines()}
def clean_text(text, stopwords=None):
    """Normalize a lyric string and remove stopword tokens.

    The text is NFC-normalized, punctuation is replaced by spaces, digit
    runs are dropped, the result is lowercased, and every
    whitespace-separated token found in the stopword collection is removed.

    Args:
        text: Raw lyric text. Any non-string value (e.g. None or NaN)
            produces an empty string instead of raising inside ``re.sub``.
        stopwords: Optional collection of tokens to drop. When omitted, the
            module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, with the surviving tokens joined by single spaces.
    """
    import unicodedata

    # Guard first: every step below requires a real string.
    if not isinstance(text, str):
        return ''
    if stopwords is None:
        stopwords = stop_words

    # Compose decomposed characters so that combining tone marks are not
    # stripped by the punctuation regex (they are not matched by \w).
    text = unicodedata.normalize('NFC', text)

    # 1. Replace special characters with spaces and drop digit runs
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)

    # 2. Lowercase
    text = text.lower()

    # 3. Drop stopwords; split() also collapses any run of whitespace
    return ' '.join(word for word in text.split() if word not in stopwords)
# Load the trained random forest classifier
model_path = '../model/random_forest_model.pkl'
best_rf = joblib.load(model_path)
print("Mô hình đã được tải thành công.")

# One-row frame holding the lyric to classify
df_test = pd.DataFrame([[1001,'Buồn','''anh hẹn em pickleball''']], columns=['index','name_song', 'lyric'])

# Apply the same cleaning that was used on the training data
df_test['lyric'] = df_test['lyric'].astype(str).apply(clean_text)

# Turn the text into a TF-IDF vector with the vectorizer saved at training
# time (it is written to '../model/tfidf_vectorizer.pkl')
tfidf_vectorizer = joblib.load('../model/tfidf_vectorizer.pkl')

new_tfidf = tfidf_vectorizer.transform(df_test['lyric'])

# Print the TF-IDF shape
print("Shape of TF-IDF matrix:", new_tfidf.shape)

# Reduce dimensionality with the saved LSA (TruncatedSVD) model; the
# variable names say "lda" but the loaded object is the LSA model
lda_model = joblib.load('../model/lsa_model.pkl')

new_lda = lda_model.transform(new_tfidf)
print(new_lda.shape)

# Predict the emotion label from the reduced features
predicted_label = best_rf.predict(new_lda)

print("Cảm xúc dự đoán của bài hát :", predicted_label[0])


Mô hình đã được tải thành công.
Shape of TF-IDF matrix: (1, 5000)
(1, 25)
Cảm xúc dự đoán của bài hát : vui


