## **Installs + Drive Mount**

In [2]:
!pip install librosa numpy
!pip install pytube
!pip install matplotlib --upgrade



In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Function Definitions**

In [3]:
# importing packages
import librosa
import matplotlib.pyplot as plt
import librosa.display
import numpy as np
from pytube import YouTube
import os
from PIL import Image
import torch
import torch.nn as nn
from torchvision import transforms, models
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import time
import glob

class AI_Detection_Model(nn.Module):
    def __init__(self, num_classes, num_layers, pretrained=True):
        super(AI_Detection_Model, self).__init__()
        if num_layers == 18:
            self.resnet = models.resnet18(weights='ResNet18_Weights.IMAGENET1K_V1')
        elif num_layers == 34:
            self.resnet = models.resnet34(weights='ResNet34_Weights.IMAGENET1K_V1')
        elif num_layers == 50:
            self.resnet = models.resnet50(weights='ResNet50_Weights.IMAGENET1K_V1')
            
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        x = self.resnet(x)
        return x

def crop_image(file):
    # Opens a image in RGB mode
    im = Image.open(file)

    # Size of the image in pixels (size of original image)
    # (This is not mandatory)
    width, height = im.size

    # Setting the points for cropped image
    left = 0
    top = height // 9
    right = width
    bottom = height

    # Cropped image of above dimension
    # (It will not change original image)
    im1 = im.crop((left, top, right, bottom))

    # Shows the image in image viewer
    im1 = im1.save(file)

# Function to convert audio file to spectrogram image and save as JPEG
def audio_to_spectrogram(audio_file, output_file):
    # Load the audio file
    audio, sr = librosa.load(audio_file, duration=30.0)  # Specify duration of 30 seconds

    # Create a spectrogram image
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    plt.figure(figsize=(10, 4))
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')

    # Save the spectrogram as a JPEG image
    plt.savefig(output_file, format='jpg')

    # Close the plot
    plt.close()
    
    crop_image(output_file)


def vid_to_mp3(link, destination) :
    # url input from user
    yt = YouTube(link)

    # extract only audio
    video = yt.streams.filter(only_audio=True).first()

    # download the file
    name = yt.author + ' - ' + yt.title
    out_file = video.download(output_path=destination, filename=name)

    # save the file
    base, ext = os.path.splitext(out_file)
    new_file = base + '.mp3'
    os.rename(out_file, new_file)

    # result of success

    return new_file, name

# Define the function for testing a single image
def test_single_image(image_path, transform):
    # Load and preprocess the image
    image = Image.open(image_path)

    input_image = transform(image)

    # Prepare the input tensor
    input_tensor = input_image.unsqueeze(0)  # Add an extra dimension to represent the batch (batch_size=1)

    # Move the image to the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    input_tensor = input_tensor.to(device)

    model.eval()

    # Forward pass the input through the model
    output = model(input_tensor)

    # Interpret the output
    probabilities = torch.softmax(output, dim=1)

    # Extract the predicted class
    _, predicted_class = torch.max(probabilities, 1)

    return "AI Music" if predicted_class.item() == 0 else 'Real Music'

# Define transformations for the input images
transform = transforms.Compose([
#     transforms.Resize((1000, 400)),
#     transforms.ToTensor()
    transforms.Resize((560, 224)),
    transforms.ToTensor(),
#     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])


  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <9A4710B9-0DA3-36BB-9129-645F282E64B2> /Users/shriyanssairy/anaconda3/lib/python3.10/site-packages/torchvision/image.so
  warn(


# **Download MP3**

In [40]:
output = vid_to_mp3('https://youtu.be/q6ISROYswBQ', '/content/drive/MyDrive/Separate Images/Test MP3s')

print(output[0],"has successfully downloaded")


/content/drive/MyDrive/Separate Images/Test MP3s/ARTZI - See You Again - Juice WRLD Ai  & xxxtentacion Ai.mp3 has successfully downloaded


In [34]:

link = 'https://youtu.be/q6ISROYswBQ'
mp3_output_folder = '/content/drive/MyDrive/Separate Images/MP3s'

mp3_path, song_name = vid_to_mp3(link, mp3_output_folder)
print("Saved",mp3_path)

label = 'AI'
image_path1 = f'/content/drive/MyDrive/mihirs music images/{label} Images/spectrogram_{song_name}.jpg'
image_path2 = f'/content/drive/MyDrive/Separate Images/Images/{song_name}.jpg'

audio_to_spectrogram('/content/drive/MyDrive/Separate Images/MP3s/The Weeknd - Out of Time (Audio).mp3', image_path1)

print('\n' + image_path1)




Saved /content/drive/MyDrive/Separate Images/MP3s/ARTZI - See You Again - Juice WRLD Ai  & xxxtentacion Ai.mp3

/content/drive/MyDrive/mihirs music images/AI Images/spectrogram_ARTZI - See You Again - Juice WRLD Ai  & xxxtentacion Ai.jpg


In [26]:
import os

os.remove(image_path1)

# **Convert All MP3s to Spectrograms**

In [4]:
# Folder path containing subfolders with audio files
audio_folder_path = '/Users/shriyanssairy/Desktop/Test MP3s'

# New folder path for storing the spectrogram images and features
image_folder_path = '/Users/shriyanssairy/Desktop/Test Images'

# Create subfolders for storing AI and Real spectrogram images and features
ai_folder = os.path.join(image_folder_path, 'AI Images')
real_folder = os.path.join(image_folder_path, 'Real Images')


os.makedirs(ai_folder, exist_ok=True)
os.makedirs(real_folder, exist_ok=True)


# Parameters
image_size = (224, 224)  # Size of the resized spectrogram image

# Helper function to resize the image
def resize_image(image, size):
    pil_image = Image.fromarray(image)
    resized_image = pil_image.resize(size)
    return np.array(resized_image)

# Iterate through subfolders in the audio folder
for subfolder in os.listdir(audio_folder_path):
    subfolder_path = os.path.join(audio_folder_path, subfolder)

    if os.path.isdir(subfolder_path):
        # Determine the target folder for storing spectrogram images and features
        if subfolder == 'AI Music':
            target_folder = ai_folder
        elif subfolder == 'Real Music':
            target_folder = real_folder
        else:
            continue  # Skip subfolders not labeled as AI Music or Real Music

        # Process audio files in the subfolder
        audio_files = glob.glob(os.path.join(subfolder_path, '*.mp3'))

        for audio_file in audio_files:
            # Extract the audio file name without the extension
            audio_filename = os.path.splitext(os.path.basename(audio_file))[0]

            spectrogram_save_path = os.path.join(target_folder, 'spectrogram_{}.jpg'.format(audio_filename))
            audio_to_spectrogram(audio_file, spectrogram_save_path)

# **Train New Model**

In [8]:
start = time.time()

num_layers = 18
image_folder_path = "/Users/shriyanssairy/Desktop/mihirs music images copy"
test_folder_path = "/Users/shriyanssairy/Desktop/Test Images"

device = torch.device('mps')

# Load a pre-trained model (e.g., ResNet)
model = AI_Detection_Model(num_classes=2, num_layers=num_layers)

# Define the loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

#Load previous model (Comment out if you want to train a new model)
prev_model_path = "/Users/shriyanssairy/Desktop/Saved Models/saved_resnet18_1691869340.2124429(78%).pt"
model.load_state_dict(torch.load(prev_model_path, map_location=device))

model = model.to(device)
model.train()

# Load the dataset
train_dataset = ImageFolder(root=image_folder_path, transform=transform)
test_dataset = ImageFolder(root=test_folder_path, transform=transform)
class_names = train_dataset.classes
num_classes = len(class_names)

print(len(train_dataset))

# Split the dataset into training and testing sets
# train_size = int(0.9 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

# Create data loaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training loop
num_epochs = 200
for epoch in range(num_epochs):
    epoch_start_time = time.time()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Compute training accuracy
        _, predicted = torch.max(outputs.data, 1)

        total_predictions += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

        running_loss += loss.item()

    # Compute average training loss and accuracy for the epoch
    train_loss = running_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions
    epoch_end_time = time.time()
    total_epoch_time = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {train_loss:.4f} - Accuracy: {train_accuracy:.4f} - ET: {total_epoch_time//3600:.0f}h {total_epoch_time//60:.0f}m {round(total_epoch_time%60)}s")

# Evaluation loop
model.eval()
correct = 0.0
total = 0.0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
            
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        print(f"total = {total}, correct = {correct}")

# Print accuracy on the test set
print(f"Test Accuracy: {(correct/total) * 100}%")

model_path = f'/Users/shriyanssairy/Desktop/Saved Models/saved_resnet{num_layers}_{start}({(correct/total) * 100:.0f}%).pt'
torch.save(model.state_dict(), model_path)


end = time.time()
total_time = end-start
print(f'Elapsed Time: {total_time//3600:.0f}h {total_time//60:.0f}m {round(total_time%60)}s')


221
Epoch 1/200 - Loss: 0.0324 - Accuracy: 0.9910 - ET: 0h 0m 8s
Epoch 2/200 - Loss: 0.0146 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 3/200 - Loss: 0.0122 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 4/200 - Loss: 0.0095 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 5/200 - Loss: 0.0126 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 6/200 - Loss: 0.0101 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 7/200 - Loss: 0.0993 - Accuracy: 0.9819 - ET: 0h 0m 7s
Epoch 8/200 - Loss: 0.0568 - Accuracy: 0.9774 - ET: 0h 0m 8s
Epoch 9/200 - Loss: 0.0360 - Accuracy: 0.9774 - ET: 0h 0m 8s
Epoch 10/200 - Loss: 0.0164 - Accuracy: 0.9910 - ET: 0h 0m 7s
Epoch 11/200 - Loss: 0.0123 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 12/200 - Loss: 0.0117 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 13/200 - Loss: 0.0102 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 14/200 - Loss: 0.0105 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 15/200 - Loss: 0.0106 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 16/200 - Loss: 0.1620 - Accuracy: 0.9683 - ET: 0h 0m 7s
Epoch 17/200 

Epoch 133/200 - Loss: 0.0080 - Accuracy: 0.9910 - ET: 0h 0m 7s
Epoch 134/200 - Loss: 0.0081 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 135/200 - Loss: 0.0085 - Accuracy: 0.9910 - ET: 0h 0m 7s
Epoch 136/200 - Loss: 0.0085 - Accuracy: 0.9910 - ET: 0h 0m 7s
Epoch 137/200 - Loss: 0.0071 - Accuracy: 0.9910 - ET: 0h 0m 7s
Epoch 138/200 - Loss: 0.0066 - Accuracy: 0.9955 - ET: 0h 0m 7s
Epoch 139/200 - Loss: 0.0075 - Accuracy: 0.9955 - ET: 0h 0m 8s
Epoch 140/200 - Loss: 0.0069 - Accuracy: 0.9955 - ET: 0h 0m 8s
Epoch 141/200 - Loss: 0.0073 - Accuracy: 0.9955 - ET: 0h 0m 9s
Epoch 142/200 - Loss: 0.0075 - Accuracy: 0.9955 - ET: 0h 0m 9s
Epoch 143/200 - Loss: 0.0074 - Accuracy: 0.9955 - ET: 0h 0m 9s
Epoch 144/200 - Loss: 0.0076 - Accuracy: 0.9910 - ET: 0h 0m 9s
Epoch 145/200 - Loss: 0.0080 - Accuracy: 0.9910 - ET: 0h 0m 9s
Epoch 146/200 - Loss: 0.0076 - Accuracy: 0.9955 - ET: 0h 0m 9s
Epoch 147/200 - Loss: 0.0095 - Accuracy: 0.9910 - ET: 0h 0m 9s
Epoch 148/200 - Loss: 0.0070 - Accuracy: 0.9955 - ET: 0

# **Test Model**

In [5]:
device = torch.device('mps')
model_path = '/Users/shriyanssairy/Desktop/Saved Models/saved_resnet18_1691795939.62897(87%).pt'
num_layers = 18
model = AI_Detection_Model(num_classes=2, num_layers=num_layers)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval();

# Provide the input MP3 file path and output spectrogram image path
input_audio_path = "/Users/shriyanssairy/Desktop/Test MP3s/Drake AI - Wide Open (Official Audio).mp3"
output_image_path = '/Users/shriyanssairy/Desktop/spectrogram.jpg'

# Convert the audio file to spectrogram image and save as JPEG
audio_to_spectrogram(input_audio_path, output_image_path)

print("Spectrogram image saved successfully.")

predicted_class = test_single_image(output_image_path, transform)
print(f"Predicted class: {predicted_class}")

  audio, sr = librosa.load(audio_file, duration=30.0)  # Specify duration of 30 seconds
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/shriyanssairy/Desktop/Test MP3s/Drake AI - Wide Open (Official Audio).mp3'

In [9]:
#Test model with all files in Test MP3s folder

device = torch.device('mps')
model_path = '/Users/shriyanssairy/Desktop/Saved Models/saved_resnet18_1691910656.348921(78%).pt'
num_layers = 18
model = AI_Detection_Model(num_classes=2, num_layers=num_layers)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval();

spectrogram_path = '/Users/shriyanssairy/Desktop/spectrogram.jpg'
print("hello")

for file in glob.glob(os.path.join('/Users/shriyanssairy/Desktop/Test MP3s', '*.mp3')):
    audio_to_spectrogram(file, spectrogram_path)
    predicted_class = test_single_image(spectrogram_path, transform)
    print(f"{predicted_class} --- {os.path.splitext(file)[0]}")

hello


In [30]:

spectrogram = '/content/drive/MyDrive/Separate Images/Images/spectrogram.jpg'

# Test a single image
test_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
predicted_class = test_single_image(spectrogram, transform)
print(f"Predicted class: {predicted_class}")

Predicted class: Real Music
