In [1]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [2]:
!pip3 install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [11]:
import torch
from transformers import AutoProcessor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict, load_metric
import librosa
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the CSV file and create a mapping dictionary
csv_path = '/content/chunk_labels.csv'
df = pd.read_csv(csv_path)
file_label_map = dict(zip(df['Filename'], df['Language']))

# Function to map label text to integer
label_to_id = {"English": 0, "Korean": 1}

# Load model and processor
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-base", num_labels=2)

# Function to load and preprocess the audio file using librosa
def preprocess_data(batch):
    batch_input_values = []
    batch_labels = []

    for file_path in batch["file"]:
        # Read audio file with librosa
        audio_input, sr = librosa.load(file_path, sr=16_000)

        # Process audio file
        inputs = processor(audio_input, sampling_rate=sr, return_tensors="pt", padding=True)
        batch_input_values.append(inputs.input_values.squeeze().numpy())
        batch_labels.append(label_to_id[file_label_map[file_path.split("/")[-1]]])

    batch["input_values"] = batch_input_values
    batch["labels"] = batch_labels
    return batch

# Prepare dataset paths
audio_files = df['Filename'].tolist()
audio_files = ['segments/' + file for file in audio_files]  # Ensure correct path formation

# Split dataset into training and validation
train_files, val_files = train_test_split(audio_files, test_size=0.2, random_state=42)
train_df = pd.DataFrame(train_files, columns=['file'])
val_df = pd.DataFrame(val_files, columns=['file'])

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)

# Create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset.map(preprocess_data, batched=True),
    'validation': val_dataset.map(preprocess_data, batched=True)
})

# Define training arguments
training_args = TrainingArguments(
    output_dir="./wav2vec2-language-classification",
    per_device_train_batch_size=2,  # Keeping the small batch size
    gradient_accumulation_steps=2,  # Adjusted accumulation steps
    evaluation_strategy="epoch",
    num_train_epochs=30,  # Keeping the increased epochs
    save_steps=500,
    eval_steps=250,  # Keeping more frequent evaluation
    learning_rate=3e-5,  # Further reduced learning rate
    weight_decay=0.02,  # Adjusted weight decay
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=processor.feature_extractor,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'projector.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'classifier.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/598 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/598 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
0,No log,0.612607
2,No log,0.673904
4,0.657700,0.607944
6,0.531500,0.66626
8,0.531500,0.63088
10,0.443400,0.416847
12,0.443400,0.355202
14,0.385200,0.521346
16,0.336200,0.400956
18,0.336200,0.594171




TrainOutput(global_step=4470, training_loss=0.3677459955748829, metrics={'train_runtime': 1133.9521, 'train_samples_per_second': 15.821, 'train_steps_per_second': 3.942, 'total_flos': 1.2680533838893536e+17, 'train_loss': 0.3677459955748829, 'epoch': 29.9})

In [12]:
# Save the model
model.save_pretrained('./saved_wav2vec2_model')

# Save the processor
processor.save_pretrained('./saved_wav2vec2_model')


In [14]:
import torch
from transformers import AutoProcessor, Wav2Vec2ForSequenceClassification, Trainer
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from datasets import load_dataset

# 모델 및 프로세서 로드
model_path = './saved_wav2vec2_model'
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

# # 검증 데이터셋 로드 (여기서는 'validation'을 사용합니다)
# val_dataset = load_dataset("segments", split="validation")

# Trainer 초기화
trainer = Trainer(
    model=model,
    tokenizer=processor.feature_extractor
)

# 검증 데이터셋에 대한 예측 수행
predictions = trainer.predict(val_dataset)

# 예측 결과 추출 및 레이블 변환
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# 분류 리포트 생성
report = classification_report(true_labels, pred_labels, target_names=["English", "Korean"])
print("Classification Report:\n", report)

# 혼동 행렬 생성
conf_matrix = confusion_matrix(true_labels, pred_labels)
print("Confusion Matrix:\n", conf_matrix)


Classification Report:
               precision    recall  f1-score   support

     English       0.90      0.93      0.92        82
      Korean       0.91      0.88      0.90        68

    accuracy                           0.91       150
   macro avg       0.91      0.90      0.91       150
weighted avg       0.91      0.91      0.91       150

Confusion Matrix:
 [[76  6]
 [ 8 60]]


In [16]:
!pip3 install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [18]:
import librosa
import numpy as np
from pydub import AudioSegment, silence
from transformers import Wav2Vec2ForSequenceClassification, AutoProcessor

# Load the saved model and processor
model_path = './saved_wav2vec2_model'
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

# Function to split audio on silence
def split_on_silence(audio_path, min_silence_len=500, silence_thresh=-40):
    audio = AudioSegment.from_file(audio_path, format="wav")
    chunks = silence.split_on_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
    return chunks

# Function to predict language for each chunk
def predict_language(chunks):
    predictions = []

    for chunk in chunks:
        # Convert pydub AudioSegment to numpy array
        chunk_samples = np.array(chunk.get_array_of_samples())

        # Preprocess the chunk
        inputs = processor(chunk_samples, sampling_rate=16_000, return_tensors="pt", padding=True)

        # Predict
        with torch.no_grad():
            logits = model(inputs.input_values).logits
        pred_label_id = torch.argmax(logits, dim=1).numpy()[0]

        # Convert id to label name
        predictions.append("English" if pred_label_id == 0 else "Korean")

    return predictions

# Main function to process an audio file
def process_audio_file(file_path):
    chunks = split_on_silence(file_path)
    language_predictions = predict_language(chunks)
    return language_predictions

# Example usage
file_path = '/content/I_like_to_eat_korean_food_11.wav'
chunk_languages = process_audio_file(file_path)
print(chunk_languages)


RuntimeError: ignored