# Data Loading

In [41]:
import os
import codecs

train_happy = []
folder_path = 'Data/Training_Data/Happy_Songs/'
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            train_happy.append(f.read())
    except UnicodeDecodeError:
        print(f"Error decoding file: {file_path}")

train_sad = []
folder_path = 'Data/Training_Data/Sad_Songs/'
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            train_sad.append(f.read())
    except UnicodeDecodeError:
        print(f"Error decoding file: {file_path}")

test_happy = []
folder_path = 'Data/Test_Data/Happy_Songs/'
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            test_happy.append(f.read())
    except UnicodeDecodeError:
        print(f"Error decoding file: {file_path}")

test_sad = []
folder_path = 'Data/Test_Data/Sad_Songs/'
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            test_sad.append(f.read())
    except UnicodeDecodeError:
        print(f"Error decoding file: {file_path}")


# Data Preprocessing

In [17]:
import string
telugu_characters  = [chr(i) for i in range(ord(b'\\u0c00'.decode('unicode_escape')), ord(b'\\u0c7f'.decode('unicode_escape'))+1)]

def preprocess(songs_list):
    updated_songs_list = []
    for song in songs_list:
        processed_song = ''
        for ch in song:
            if(ch in telugu_characters):
                processed_song += ch
            elif(ch == ' ' or ch == '.' or ch==','):
                if(processed_song[-1]!=' '):
                    processed_song += ' '
        updated_songs_list.append(processed_song)
    return updated_songs_list
        
train_happy = [list(song.split(' ')) for song in preprocess(train_happy)]
train_sad = [list(song.split(' ')) for song in preprocess(train_sad)]

test_happy = [list(song.split(' ')) for song in preprocess(test_happy)]
test_sad = [list(song.split(' ')) for song in preprocess(test_sad)]

# Converting Songs to Vectors

In [43]:
import fasttext
import fasttext.util
ft = fasttext.load_model('indicnlp.ft.te.300.bin')
word = "మనసు"
print("Embedding Shape is {}".format(ft.get_word_vector(word).shape))
print("Nearest Neighbors to {} are:".format(word))
ft.get_nearest_neighbors(word)

Embedding Shape is (300,)
Nearest Neighbors to మనసు are:




[(0.7650183439254761, 'మనసును'),
 (0.7404142618179321, 'మనసుని'),
 (0.7011533379554749, 'మనసూ'),
 (0.6911419034004211, 'మనస్సు'),
 (0.6807750463485718, 'మనుసు'),
 (0.6777817010879517, 'మనసునే'),
 (0.6626439690589905, 'మనసులు'),
 (0.648022472858429, 'మనస్సును'),
 (0.6384133696556091, 'మనసుకి'),
 (0.6321133971214294, 'మనసు.')]

In [21]:
from gensim.models import FastText

# Load pre-trained model
pretrained_model_path = "indicnlp.ft.te.300.bin"
model = FastText.load_fasttext_format(pretrained_model_path)

training_data = train_happy + train_sad
# Fine-tune the model
model.build_vocab(corpus_iterable=training_data , update=True)
model.train(corpus_iterable=training_data , total_examples=len(training_data), epochs=10)

# Save the fine-tuned model
fine_tuned_model_path = "fine_tuned_indicnlp.ft.te.300.bin"
model.save(fine_tuned_model_path)

  model = FastText.load_fasttext_format(pretrained_model_path)


In [22]:
from gensim.models import FastText

# Loading the fine-tuned FastText model
fine_tuned_model_path = "fine_tuned_indicnlp.ft.te.300.bin"
model = FastText.load(fine_tuned_model_path)

import numpy as np

# Function to convert a song into a vector representation
def song_to_vector(song, model):
    word_vectors = []
    for word in song:
        embeddings = model.wv[word]
        word_vectors.append(embeddings)

    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        # If no words in the song are in the vocabulary, return zeros
        return np.zeros(model.vector_size)
    
# Example usage:
train_happy_vectors = [song_to_vector(song, model) for song in train_happy]
train_sad_vectors = [song_to_vector(song, model) for song in train_sad]
test_happy_vectors = [song_to_vector(song, model) for song in test_happy]
test_sad_vectors = [song_to_vector(song, model) for song in test_sad]

# Training & Predicting Using ML Models

In [23]:
training_Labels = [1 for i in train_happy] + [0 for i in train_sad]
testing_Labels = [1 for i in test_happy] + [0 for i in test_sad]

X_train = train_happy_vectors + train_sad_vectors
X_test = test_happy_vectors + test_sad_vectors
y_train = training_Labels
y_test = testing_Labels

## Naive Bayes

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

clf = GaussianNB()
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
accuracy = accuracy_score (y_test, predictions)
print(f"Accuracy: {accuracy: .2f}")
print("\nClassification Report:")
print(classification_report(testing_Labels, predictions))

Accuracy:  0.66

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.80      0.70        50
           1       0.72      0.52      0.60        50

    accuracy                           0.66       100
   macro avg       0.67      0.66      0.65       100
weighted avg       0.67      0.66      0.65       100



## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

classifier = LogisticRegression(max_iter=10000)
classifier.fit(X_train, y_train)

predictions = classifier.predict(X_test)

accuracy = accuracy_score (y_test, predictions)
print(f"Accuracy: {accuracy: .2f}")
print("\nClassification Report:")
print(classification_report(testing_Labels, predictions))

Accuracy:  0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        50
           1       0.75      0.66      0.70        50

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100



## Support Vector Classifier

In [37]:
from sklearn.svm import SVC

svm_classifier = SVC(kernel='linear', random_state=33)

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = svm_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(testing_Labels, y_pred))

Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.78      0.74        50
           1       0.75      0.66      0.70        50

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100



## Decision Tree Classifier

In [38]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier()

# Setup Parameter Grid
param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'], 
    'max_depth': [2,4, 5, 6, 7,10 , 15 , 20, None],
    'min_samples_split': [2, 3, 5, 7, 9 , 12 , 15],
    'min_samples_leaf': [1, 2, 4 , 6 , 8 , 10]
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best Parameters and Best Estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)

# Evaluate on Test Set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = np.mean(y_test == y_pred) 
print("Accuracy on test set:", accuracy)
print("\nClassification Report:")
print(classification_report(testing_Labels, y_pred))

Fitting 5 folds for each of 1134 candidates, totalling 5670 fits
Best parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best estimator: DecisionTreeClassifier(max_depth=2)
Accuracy on test set: 0.64

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.54      0.60        50
           1       0.62      0.74      0.67        50

    accuracy                           0.64       100
   macro avg       0.65      0.64      0.64       100
weighted avg       0.65      0.64      0.64       100



## Random Forest Classifier

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

# Setup Parameter Grid
param_grid = {
    'n_estimators': [20 , 30 , 40 ,50, 100, 200], # Number of trees in the forest
    'max_depth': [3,4, 5,6, 7, 8 , 9,  None], 
    'min_samples_split': [2, 3, 5 , 7 , 9 , 10 , 15],
    'min_samples_leaf': [1, 2, 3, 4, 5 , 6 ],
    'bootstrap': [True, False] # Whether to use bootstrap sampling
}

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best Parameters and Best Estimator
print("Best parameters:", grid_search.best_params_)
print("Best estimator:", grid_search.best_estimator_)

# Evaluate on Test Set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = np.mean(y_test == y_pred) 
print("Accuracy on test set:", accuracy)
print("\nClassification Report:")
print(classification_report(testing_Labels, y_pred))

Fitting 2 folds for each of 4032 candidates, totalling 8064 fits
Best parameters: {'bootstrap': False, 'max_depth': 8, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 20}
Best estimator: RandomForestClassifier(bootstrap=False, max_depth=8, min_samples_leaf=5,
                       min_samples_split=15, n_estimators=20)
Accuracy on test set: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.74      0.73        50
           1       0.73      0.70      0.71        50

    accuracy                           0.72       100
   macro avg       0.72      0.72      0.72       100
weighted avg       0.72      0.72      0.72       100

