## Initialisation

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Set Working Directory
%cd /content/drive/MyDrive/Projects/sharedtask-dravidianlangtech

/content/drive/MyDrive/Projects/sharedtask-dravidianlangtech


## Class Labels

In [10]:
def read_text(lang):

  import pandas as pd
  import os
  import numpy as np
  from numpy import random

  path = f"!data/train/{lang}/text"
  file = f"{path}/{os.listdir(path)[0]}"

  data = pd.read_excel(open(file,'rb'))

  if lang == "telugu":
    data[['label', 'details']] = data['File_Name'].str.split('_', n=1, expand=True)
  else:
    data[['label', 'details']] = data['File Name'].str.split('_', n=1, expand=True)

  data = data.sort_values('label')

  print(lang)
  print(data.groupby(['Class Label Short']).size())
  print(data.groupby(['label']).size())

In [11]:
read_text("malayalam")

malayalam
Class Label Short
C    186
G     82
N    406
P    118
R     91
dtype: int64
label
H     477
NH    406
dtype: int64


In [12]:
read_text("tamil")

tamil
Class Label Short
C     65
G     68
N    287
P     33
R     61
dtype: int64
label
H     227
NH    287
dtype: int64


In [13]:
read_text("telugu")

telugu
Class Label Short
C    122
G    106
N    198
P     58
R     72
dtype: int64
label
H     358
NH    198
dtype: int64


## Text

In [None]:
def read_text(lang):

  import pandas as pd
  import os
  import numpy as np
  from numpy import random
  import gensim
  from sklearn.model_selection import train_test_split
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
  from sklearn.metrics import accuracy_score, confusion_matrix
  from sklearn.metrics import classification_report

  path = f"!data/train/{lang}/text"
  file = f"{path}/{os.listdir(path)[0]}"

  data = pd.read_excel(open(file,'rb'))

  if lang == "telugu":
    data[['label', 'details']] = data['File_Name'].str.split('_', n=1, expand=True)
  else:
    data[['label', 'details']] = data['File Name'].str.split('_', n=1, expand=True)

  data = data.sort_values('label')

  # Create train and test sets
  X = data.Transcript
  y = data['label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

  my_tags = list(set(y))

  # Naive Bayes Classifier for Multinomial Models
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.pipeline import Pipeline
  from sklearn.feature_extraction.text import TfidfTransformer

  nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)
  nb_pred = nb.predict(X_test)
  print("Naive Bayes Classifier for Multinomial Models")
  print("\n")
  print(classification_report(y_test, nb_pred,target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Linear Support Vector Machine
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                ])
  sgd.fit(X_train, y_train)
  sgd_pred = sgd.predict(X_test)
  print("Linear Support Vector Machine")
  print("\n")
  print(classification_report(y_test, sgd_pred, target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Logistic Regression
  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=2, C=1e5)),
                ])
  logreg.fit(X_train, y_train)
  logreg_pred = logreg.predict(X_test)
  print("Logistic Regression")
  print("\n")
  print(classification_report(y_test, logreg_pred,target_names=my_tags,zero_division=np.nan))

  # Random Forest Classifier
  from sklearn.ensemble import RandomForestClassifier

  forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(max_depth=24, random_state=42)),
                ])
  forest.fit(X_train, y_train)
  forest_pred = forest.predict(X_test)
  print("Random Forest Classifier")
  print("\n")
  print(classification_report(y_test, forest_pred,target_names=my_tags,zero_division=np.nan))

In [None]:
read_text("malayalam")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.90      0.87      0.89       125
           H       0.84      0.88      0.86        96

    accuracy                           0.87       221
   macro avg       0.87      0.87      0.87       221
weighted avg       0.87      0.87      0.87       221



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.90      0.84      0.87       125
           H       0.81      0.88      0.84        96

    accuracy                           0.86       221
   macro avg       0.85      0.86      0.85       221
weighted avg       0.86      0.86      0.86       221



Logistic Regression


              precision    recall  f1-score   support

          NH       0.86      0.86      0.86       125
           H       0.81      0.82      0.82        96

    accuracy                           0.84       221
   macro avg       0.8

In [None]:
read_text("tamil")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.94      0.48      0.64        60
           H       0.68      0.97      0.80        69

    accuracy                           0.74       129
   macro avg       0.81      0.73      0.72       129
weighted avg       0.80      0.74      0.73       129



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.78      0.77      0.77        60
           H       0.80      0.81      0.81        69

    accuracy                           0.79       129
   macro avg       0.79      0.79      0.79       129
weighted avg       0.79      0.79      0.79       129



Logistic Regression


              precision    recall  f1-score   support

          NH       0.76      0.70      0.73        60
           H       0.76      0.81      0.78        69

    accuracy                           0.76       129
   macro avg       0.7

In [None]:
read_text("telugu")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.74      0.96      0.84        94
           H       0.78      0.31      0.44        45

    accuracy                           0.75       139
   macro avg       0.76      0.63      0.64       139
weighted avg       0.75      0.75      0.71       139



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.78      0.79      0.78        94
           H       0.55      0.53      0.54        45

    accuracy                           0.71       139
   macro avg       0.66      0.66      0.66       139
weighted avg       0.70      0.71      0.70       139



Logistic Regression


              precision    recall  f1-score   support

          NH       0.81      0.79      0.80        94
           H       0.58      0.62      0.60        45

    accuracy                           0.73       139
   macro avg       0.7

In [None]:
def read_text(lang):

  import pandas as pd
  import os
  import numpy as np
  from numpy import random
  import gensim
  from sklearn.model_selection import train_test_split
  from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
  from sklearn.metrics import accuracy_score, confusion_matrix
  from sklearn.metrics import classification_report

  path = f"!data/train/{lang}/text"
  file = f"{path}/{os.listdir(path)[0]}"

  data = pd.read_excel(open(file,'rb'))

  if lang == "telugu":
    data[['label', 'language', 'number', 'class_label', 'details']] = data['File_Name'].str.split('_', n=4, expand=True)
  else:
    data[['label', 'language', 'number', 'class_label', 'details']] = data['File Name'].str.split('_', n=4, expand=True)

  data = data.sort_values('class_label')

  # Create train and test sets
  X = data.Transcript
  y = data['class_label']
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

  my_tags = list(set(y))

  # Naive Bayes Classifier for Multinomial Models
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.pipeline import Pipeline
  from sklearn.feature_extraction.text import TfidfTransformer

  nb = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)
  nb_pred = nb.predict(X_test)
  print("Naive Bayes Classifier for Multinomial Models")
  print("\n")
  print(classification_report(y_test, nb_pred,target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Linear Support Vector Machine
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                ])
  sgd.fit(X_train, y_train)
  sgd_pred = sgd.predict(X_test)
  print("Linear Support Vector Machine")
  print("\n")
  print(classification_report(y_test, sgd_pred, target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Logistic Regression
  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=2, C=1e5)),
                ])
  logreg.fit(X_train, y_train)
  logreg_pred = logreg.predict(X_test)
  print("Logistic Regression")
  print("\n")
  print(classification_report(y_test, logreg_pred,target_names=my_tags,zero_division=np.nan))

  # Random Forest Classifier
  from sklearn.ensemble import RandomForestClassifier

  forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(max_depth=24, random_state=42)),
                ])
  forest.fit(X_train, y_train)
  forest_pred = forest.predict(X_test)
  print("Random Forest Classifier")
  print("\n")
  print(classification_report(y_test, forest_pred,target_names=my_tags,zero_division=np.nan))

In [None]:
read_text("malayalam")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C       0.82      0.64      0.72        42
           N        nan      0.00      0.00        23
           P       0.58      1.00      0.73       107
           R       1.00      0.07      0.14        27
           G        nan      0.00      0.00        22

    accuracy                           0.62       221
   macro avg       0.80      0.34      0.32       221
weighted avg       0.70      0.62      0.51       221



Linear Support Vector Machine


              precision    recall  f1-score   support

           C       0.72      0.90      0.80        42
           N       0.50      0.13      0.21        23
           P       0.78      0.93      0.85       107
           R       0.45      0.33      0.38        27
           G       0.57      0.36      0.44        22

    accuracy                           0.71       221
   macro avg       0.60      0.53      0.54      

In [None]:
read_text("tamil")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C        nan      0.00      0.00        16
           N        nan      0.00      0.00        21
           P       0.53      1.00      0.70        69
           R        nan      0.00      0.00         7
           G        nan      0.00      0.00        16

    accuracy                           0.53       129
   macro avg       0.53      0.20      0.14       129
weighted avg       0.53      0.53      0.37       129



Linear Support Vector Machine


              precision    recall  f1-score   support

           C       0.22      0.12      0.16        16
           N       0.45      0.48      0.47        21
           P       0.80      0.83      0.81        69
           R       0.40      0.29      0.33         7
           G       0.45      0.62      0.53        16

    accuracy                           0.63       129
   macro avg       0.47      0.47      0.46      

In [None]:
read_text("telugu")

Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C       0.89      0.21      0.34        38
           N       0.67      0.22      0.33        27
           P       0.35      0.98      0.51        43
           R        nan      0.00      0.00        15
           G        nan      0.00      0.00        16

    accuracy                           0.40       139
   macro avg       0.63      0.28      0.24       139
weighted avg       0.62      0.40      0.32       139



Linear Support Vector Machine


              precision    recall  f1-score   support

           C       0.83      0.53      0.65        38
           N       0.55      0.63      0.59        27
           P       0.55      0.79      0.65        43
           R       0.57      0.27      0.36        15
           G       0.53      0.50      0.52        16

    accuracy                           0.60       139
   macro avg       0.61      0.54      0.55      

## Speech

In [None]:
def read_speech(lang):

  import os
  import librosa
  import numpy as np
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import LabelEncoder
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.metrics import classification_report
  from tqdm import tqdm

  path = f"!data/train/{lang}/audio"
  dirs = os.listdir(path)
  dirs = sorted(dirs)

  audio_data = []
  target_labels = []

  for i in tqdm(dirs, total=len(dirs)):

    file = os.path.join(path, i)
    y, sr = librosa.load(file)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    spectrogram = spectrogram.T

    label, details = i.split('_', 1)

    audio_data.append(spectrogram)
    target_labels.append(label)

  # Encode target labels
  label_encoder = LabelEncoder()
  encoded_labels = label_encoder.fit_transform(target_labels)

  # Split data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(audio_data, encoded_labels, test_size=0.25, random_state=42)

  # Ensure all spectrograms have the same shape
  max_length = max([spec.shape[0] for spec in audio_data])
  X_train = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_train]
  X_test = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_test]

  # Convert to NumPy arrays
  X_train = np.array(X_train)
  X_test = np.array(X_test)

  # Convert the data to a flat 2D shape
  X_train_flat = X_train.reshape(X_train.shape[0], -1)
  X_test_flat = X_test.reshape(X_test.shape[0], -1)

  my_tags = list(set(target_labels))

  # Naive Bayes Classifier for Multinomial Models
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.pipeline import Pipeline
  from sklearn.preprocessing import MinMaxScaler

  nb = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
  nb.fit(X_train_flat, y_train)
  nb_pred = nb.predict(X_test_flat)
  print("Naive Bayes Classifier for Multinomial Models")
  print("\n")
  print(classification_report(y_test, nb_pred,target_names=my_tags,zero_division=0))
  print("\n")

  # Linear Support Vector Machine
  from sklearn.linear_model import SGDClassifier

  sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
  sgd.fit(X_train_flat, y_train)
  sgd_pred = sgd.predict(X_test_flat)
  print("Linear Support Vector Machine")
  print("\n")
  print(classification_report(y_test, sgd_pred, target_names=my_tags,zero_division=0))
  print("\n")

  # Logistic Regression
  from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression(n_jobs=2, C=1e5)
  logreg.fit(X_train_flat, y_train)
  logreg_pred = logreg.predict(X_test_flat)
  print("Logistic Regression")
  print("\n")
  print(classification_report(y_test, logreg_pred,target_names=my_tags))
  print("\n")

  # Random Forest Classifier
  from sklearn.ensemble import RandomForestClassifier

  forest = RandomForestClassifier(max_depth=24, random_state=42)
  forest.fit(X_train_flat, y_train)
  forest_pred = forest.predict(X_test_flat)
  print("Random Forest Classifier")
  print("\n")
  print(classification_report(y_test, forest_pred,target_names=my_tags))

In [None]:
read_speech("malayalam")

100%|██████████| 883/883 [00:46<00:00, 19.13it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.78      0.65      0.71       125
           H       0.62      0.76      0.69        96

    accuracy                           0.70       221
   macro avg       0.70      0.70      0.70       221
weighted avg       0.71      0.70      0.70       221



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.66      0.96      0.78       125
           H       0.87      0.34      0.49        96

    accuracy                           0.69       221
   macro avg       0.76      0.65      0.64       221
weighted avg       0.75      0.69      0.65       221



Logistic Regression


              precision    recall  f1-score   support

          NH       0.92      0.91      0.92       125
           H       0.89      0.90      0.89        96

    accuracy                           0.90       221
   macro avg       0.9

In [None]:
read_speech("tamil")

100%|██████████| 509/509 [00:50<00:00, 10.06it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.50      0.48      0.49        54
           H       0.63      0.65      0.64        74

    accuracy                           0.58       128
   macro avg       0.57      0.57      0.57       128
weighted avg       0.58      0.58      0.58       128



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.56      0.72      0.63        54
           H       0.74      0.58      0.65        74

    accuracy                           0.64       128
   macro avg       0.65      0.65      0.64       128
weighted avg       0.66      0.64      0.64       128



Logistic Regression


              precision    recall  f1-score   support

          NH       0.63      0.59      0.61        54
           H       0.71      0.74      0.73        74

    accuracy                           0.68       128
   macro avg       0.6

In [None]:
read_speech("telugu")

100%|██████████| 551/551 [00:39<00:00, 14.08it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

          NH       0.75      0.72      0.74        94
           H       0.45      0.48      0.46        44

    accuracy                           0.64       138
   macro avg       0.60      0.60      0.60       138
weighted avg       0.65      0.64      0.65       138



Linear Support Vector Machine


              precision    recall  f1-score   support

          NH       0.82      0.66      0.73        94
           H       0.48      0.68      0.57        44

    accuracy                           0.67       138
   macro avg       0.65      0.67      0.65       138
weighted avg       0.71      0.67      0.68       138



Logistic Regression


              precision    recall  f1-score   support

          NH       0.77      0.86      0.81        94
           H       0.61      0.45      0.52        44

    accuracy                           0.73       138
   macro avg       0.6

In [None]:
def read_speech(lang):

  import os
  import librosa
  import numpy as np
  from sklearn.model_selection import train_test_split
  from sklearn.preprocessing import LabelEncoder
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.metrics import classification_report
  from tqdm import tqdm

  path = f"!data/train/{lang}/audio"
  dirs = os.listdir(path)
  dirs = sorted(dirs)

  audio_data = []
  target_labels = []

  for i in tqdm(dirs, total=len(dirs)):

    file = os.path.join(path, i)
    y, sr = librosa.load(file)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    spectrogram = spectrogram.T

    label, language, number, class_label, details = i.split('_', 4)

    audio_data.append(spectrogram)
    target_labels.append(class_label)

  # Encode target labels
  label_encoder = LabelEncoder()
  encoded_labels = label_encoder.fit_transform(target_labels)

  # Split data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(audio_data, encoded_labels, test_size=0.25, random_state=42)

  # Ensure all spectrograms have the same shape
  max_length = max([spec.shape[0] for spec in audio_data])
  X_train = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_train]
  X_test = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_test]

  # Convert to NumPy arrays
  X_train = np.array(X_train)
  X_test = np.array(X_test)

  # Convert the data to a flat 2D shape
  X_train_flat = X_train.reshape(X_train.shape[0], -1)
  X_test_flat = X_test.reshape(X_test.shape[0], -1)

  my_tags = list(set(target_labels))

  # Naive Bayes Classifier for Multinomial Models
  from sklearn.naive_bayes import MultinomialNB
  from sklearn.pipeline import Pipeline
  from sklearn.preprocessing import MinMaxScaler

  nb = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
  nb.fit(X_train_flat, y_train)
  nb_pred = nb.predict(X_test_flat)
  print("Naive Bayes Classifier for Multinomial Models")
  print("\n")
  print(classification_report(y_test, nb_pred,target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Linear Support Vector Machine
  from sklearn.linear_model import SGDClassifier

  sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)
  sgd.fit(X_train_flat, y_train)
  sgd_pred = sgd.predict(X_test_flat)
  print("Linear Support Vector Machine")
  print("\n")
  print(classification_report(y_test, sgd_pred, target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Logistic Regression
  from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression(n_jobs=2, C=1e5)
  logreg.fit(X_train_flat, y_train)
  logreg_pred = logreg.predict(X_test_flat)
  print("Logistic Regression")
  print("\n")
  print(classification_report(y_test, logreg_pred,target_names=my_tags,zero_division=np.nan))
  print("\n")

  # Random Forest Classifier
  from sklearn.ensemble import RandomForestClassifier

  forest = RandomForestClassifier(max_depth=24, random_state=42)
  forest.fit(X_train_flat, y_train)
  forest_pred = forest.predict(X_test_flat)
  print("Random Forest Classifier")
  print("\n")
  print(classification_report(y_test, forest_pred,target_names=my_tags,zero_division=np.nan))

In [None]:
read_speech("malayalam")

100%|██████████| 883/883 [00:54<00:00, 16.26it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C       0.41      0.48      0.44        42
           N       0.17      0.26      0.20        23
           P       0.68      0.74      0.71        96
           R       0.80      0.11      0.19        37
           G       0.15      0.17      0.16        23

    accuracy                           0.48       221
   macro avg       0.44      0.35      0.34       221
weighted avg       0.54      0.48      0.46       221



Linear Support Vector Machine


              precision    recall  f1-score   support

           C       0.69      0.79      0.73        42
           N       1.00      0.04      0.08        23
           P       0.84      0.94      0.89        96
           R       0.43      0.76      0.55        37
           G        nan      0.00      0.00        23

    accuracy                           0.69       221
   macro avg       0.74      0.50      0.45      

In [None]:
read_speech("tamil")

100%|██████████| 509/509 [00:48<00:00, 10.60it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C        nan      0.00      0.00        12
           N       0.12      0.19      0.15        21
           P       0.62      0.78      0.69        74
           R       0.50      0.12      0.20         8
           G       0.00      0.00      0.00        13

    accuracy                           0.49       128
   macro avg       0.31      0.22      0.21       128
weighted avg       0.45      0.49      0.44       128



Linear Support Vector Machine


              precision    recall  f1-score   support

           C        nan      0.00      0.00        12
           N        nan      0.00      0.00        21
           P       0.59      0.99      0.74        74
           R        nan      0.00      0.00         8
           G       0.00      0.00      0.00        13

    accuracy                           0.57       128
   macro avg       0.29      0.20      0.15      

In [None]:
read_speech("telugu")

100%|██████████| 551/551 [00:33<00:00, 16.38it/s]


Naive Bayes Classifier for Multinomial Models


              precision    recall  f1-score   support

           C       0.35      0.61      0.44        28
           N       0.30      0.09      0.14        33
           P       0.58      0.34      0.43        44
           R       0.21      0.64      0.32        14
           G       0.09      0.05      0.07        19

    accuracy                           0.33       138
   macro avg       0.31      0.35      0.28       138
weighted avg       0.36      0.33      0.30       138



Linear Support Vector Machine


              precision    recall  f1-score   support

           C       0.43      0.82      0.56        28
           N       0.28      0.27      0.28        33
           P       0.52      0.55      0.53        44
           R       1.00      0.07      0.13        14
           G       0.40      0.11      0.17        19

    accuracy                           0.43       138
   macro avg       0.53      0.36      0.33      

## Test

In [None]:
def read_test(lang, code):

  import os
  import librosa
  import numpy as np
  import pandas as pd
  from sklearn.model_selection import train_test_split
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.metrics import classification_report
  from tqdm import tqdm

  path = f"!data/train/{lang}/audio"
  dirs = os.listdir(path)

  audio_data = []
  target_labels = []

  # Convert .wav files to spectrogram
  for i in tqdm(dirs, total=len(dirs)):

    file = os.path.join(path, i)
    y, sr = librosa.load(file)

    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    spectrogram = spectrogram.T

    label, language, number, class_label, details = i.split('_', 4)

    audio_data.append(spectrogram)
    target_labels.append(class_label)

  mapping = {'N': 0, 'C': 1, 'P': 2, 'R': 3, 'G': 4}
  numeric_list = [mapping[label] for label in target_labels]

  # Split data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(audio_data, numeric_list, test_size=0.25, random_state=42)

  # Ensure all spectrograms have the same shape
  max_length = max([spec.shape[0] for spec in audio_data])
  X_train = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_train]
  X_test = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in X_test]

  # Convert to NumPy arrays
  X_train = np.array(X_train)
  X_test = np.array(X_test)

  # Convert the data to a flat 2D shape
  X_train_flat = X_train.reshape(X_train.shape[0], -1)
  X_test_flat = X_test.reshape(X_test.shape[0], -1)

  my_tags = ['N', 'C', 'P', 'R', 'G']

  # Logistic Regression
  from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression(n_jobs=2, C=1e5)
  logreg.fit(X_train_flat, y_train)
  logreg_pred = logreg.predict(X_test_flat)
  print("Logistic Regression")
  print("\n")
  print(classification_report(y_test, logreg_pred, target_names=my_tags, zero_division=np.nan))
  print("\n")

  test_path = f"!data/test/{lang}/audio"
  dirs = os.listdir(test_path)

  test_audio = []

  for i in tqdm(dirs, total=len(dirs)):

    file = os.path.join(test_path, i)
    y, sr = librosa.load(file)
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)
    spectrogram = spectrogram.T

    test_audio.append(spectrogram)

  # Ensure all spectrograms have the same shape
  max_length = max([spec.shape[0] for spec in audio_data])
  test_data = [np.pad(spec, ((0, max_length - spec.shape[0]), (0, 0)), mode='constant') for spec in test_audio]

  # Convert to NumPy arrays
  test_data = np.array(test_data)

  # Convert the data to a flat 2D shape
  test_data_flat = test_data.reshape(test_data.shape[0], -1)

  # Run model on unseen test data
  logreg_test = logreg.predict(test_data_flat)

  # Map Short Class Labels
  mapping = {0: 'N', 1: 'C', 2: 'P', 3: 'R', 4: 'G'}
  test_pred = [mapping[label] for label in logreg_test]
  test_pred = list(test_pred)

  # Create Dataframe
  output = pd.DataFrame(list(zip(test_pred, dirs)), columns=['Class Label Short', 'File Name'])
  output[['File Name', 'Drop']] = output['File Name'].str.split('.', expand=True)
  output.drop('Drop', axis=1, inplace=True)

  # Read output file
  file_text = f"!data/test/{lang}/text/{code}-AT-test.xlsx"
  text = pd.read_excel(open(file_text,'rb'))

  # Rename file name if Telugu
  if lang == "telugu":
    output.rename(columns={'File Name': 'File_Name'}, inplace=True)
    df = output.merge(text, on='File_Name')
  else:
    df = output.merge(text, on='File Name')

  # Write .tsv file
  file_name = f'cantnlp/cantnlp_multimodal_{lang}.tsv'
  df.to_csv(file_name, sep="\t", encoding='utf-8', index=False)

In [None]:
read_test('malayalam', 'ML')

100%|██████████| 883/883 [00:51<00:00, 16.99it/s]


Logistic Regression


              precision    recall  f1-score   support

           N       0.89      0.96      0.92        98
           C       0.67      0.60      0.63        53
           P       0.53      0.61      0.57        28
           R       0.38      0.25      0.30        20
           G       0.27      0.27      0.27        22

    accuracy                           0.70       221
   macro avg       0.55      0.54      0.54       221
weighted avg       0.68      0.70      0.69       221





100%|██████████| 50/50 [00:04<00:00, 11.70it/s]


In [None]:
read_test('tamil', 'TA')

100%|██████████| 509/509 [01:09<00:00,  7.35it/s]


Logistic Regression


              precision    recall  f1-score   support

           N       0.77      0.79      0.78        72
           C       0.18      0.11      0.13        19
           P       0.33      0.29      0.31         7
           R       0.41      0.47      0.44        15
           G       0.30      0.40      0.34        15

    accuracy                           0.58       128
   macro avg       0.40      0.41      0.40       128
weighted avg       0.56      0.58      0.57       128





100%|██████████| 50/50 [00:04<00:00, 12.13it/s]


In [None]:
read_test('telugu', 'TE')

100%|██████████| 551/551 [00:38<00:00, 14.15it/s]


Logistic Regression


              precision    recall  f1-score   support

           N       0.64      0.76      0.69        46
           C       0.69      0.69      0.69        35
           P       0.36      0.42      0.38        12
           R       0.21      0.15      0.18        20
           G       0.45      0.36      0.40        25

    accuracy                           0.55       138
   macro avg       0.47      0.47      0.47       138
weighted avg       0.53      0.55      0.54       138





100%|██████████| 50/50 [00:02<00:00, 19.66it/s]
