In [1]:
# Logs endpoint plus its query string, one parameter per line: the query asks
# for the log `id` and `createdAt`, filters on type == audio, and populates the
# related user (`id`) and audio (`id`, `title`).
uri = (
    "https://api.nasheedio.com/api/logs"
    "?fields[0]=id"
    "&fields[1]=createdAt"
    "&filters[type][$eq]=audio"
    "&populate[user][fields][0]=id"
    "&populate[audio][fields][0]=id"
    "&populate[audio][fields][1]=title"
)


In [2]:
from fetch_data import fetch_paginated_data
from utils import get_latest_file, log_message

import pandas as pd
import os
from datetime import datetime
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.neighbors import NearestNeighbors
import joblib

In [3]:
# Capture a single timestamp for the whole run; the values below are reused in
# the file names of every artefact this notebook writes.
now = datetime.now()
timestamp_str = now.strftime("%Y%m%d_%H%M%S")

# Take the year from the ISO calendar too: the ISO week number belongs to the
# ISO year, which differs from the calendar year around New Year
# (e.g. 2024-12-30 is ISO week 1 of 2025). Mixing `now.year` with the ISO week
# would label such a run "2024_week1".
iso_calendar = now.isocalendar()
year = iso_calendar[0]
week_number = iso_calendar[1]

In [4]:
# Log file path; only its parent directory is created by this notebook.
# NOTE(review): presumably the file utils.log_message writes to — confirm.
LOG_FILE = "./logs/processing.log"

# Output directories, relative to the working directory.
DATA_DIR = "./data/"              # Parquet snapshots of the fetched logs
MODEL_DIR = "./models/"           # pickled ALS / KNN models
MAP_FOLDER = "./map_data/"        # pickled audio-id <-> index maps
MATRIX_FOLDER = "./matrix_data/"  # pickled sparse interaction matrix

In [5]:
# Make sure every output directory exists before anything is written to it.
# exist_ok=True keeps the call idempotent and removes the check-then-create
# race of a separate os.path.exists() test.
for folder in [MODEL_DIR, DATA_DIR, MAP_FOLDER, MATRIX_FOLDER]:
    os.makedirs(folder, exist_ok=True)

In [6]:
# Create the directories if they don't exist
os.makedirs(MAP_FOLDER, exist_ok=True)
log_dir = os.path.dirname(LOG_FILE)

# os.path.dirname() returns "" when LOG_FILE has no directory component, and
# os.makedirs("") raises; in that case there is nothing to create.
if log_dir and not os.path.exists(log_dir):
    os.makedirs(log_dir, exist_ok=True)
    print(f"Created log directory: {log_dir}")

Created log directory: ./logs


In [7]:
def fetch_and_save_weekly_data():
    """Download the audio logs from the API and store them as a Parquet file.

    The records returned by ``fetch_paginated_data(uri)`` are flattened into
    one row per log entry with the columns ``log_id``, ``user_id``,
    ``audio_id``, ``title`` and ``createdAt``. Entries whose user or audio
    relation is ``None`` are skipped.

    The output file name is built from the module-level ``year``,
    ``week_number`` and ``timestamp_str`` values, i.e. from the moment those
    were computed, not from the moment this function is called.

    Returns:
        str: Path of the Parquet file that was written.
    """
    log_message("Fetching data from API...")
    logs_data = fetch_paginated_data(uri)
    log_message(f"Successfully fetched {len(logs_data)} records from the API.")

    columns = ["log_id", "user_id", "audio_id", "title", "createdAt"]
    json_logs_data = []
    for item in logs_data:
        log_id = item['id']
        attributes = item["attributes"]
        user = attributes['user']['data']
        audio = attributes['audio']['data']
        # Logs without a resolvable user or audio cannot form an interaction.
        if user is None or audio is None:
            continue
        json_logs_data.append({
            "log_id": log_id,
            "user_id": user['id'],
            "audio_id": audio['id'],
            "title": audio['attributes']['title'],
            "createdAt": attributes['createdAt']
        })

    # Passing the columns explicitly keeps the schema even when no record
    # survives the filter; otherwise the snapshot would have no columns and
    # selecting 'user_id' / 'audio_id' from it later would raise a KeyError.
    df = pd.DataFrame(json_logs_data, columns=columns)

    filename = f"{DATA_DIR}/logs_{year}_week{week_number}_{timestamp_str}.parquet"
    df.to_parquet(filename, index=False)
    log_message(f"Saved data to Parquet file: {filename}")
    return filename

In [8]:
def read_latest_weekly_data(filename = None):
    """Load a logs snapshot from Parquet.

    Args:
        filename: Path of a specific Parquet file. When given (truthy), that
            file is read directly and any read error propagates to the caller.

    Returns:
        The loaded DataFrame. When ``filename`` is not given, the file reported
        by ``get_latest_file(DATA_DIR, 'logs', 'parquet')`` is read instead;
        ``None`` is returned if no such file is found or if reading it fails.
    """
    # Explicit file requested: read it without any error handling.
    if filename:
        log_message(f"Reading data from specified Parquet file: {filename}")
        return pd.read_parquet(filename)

    most_recent_file = get_latest_file(DATA_DIR, 'logs', 'parquet')
    if not most_recent_file:
        return None

    log_message(f"Reading the latest Parquet file: {most_recent_file}")
    try:
        latest_frame = pd.read_parquet(most_recent_file)
    except Exception as e:
        # Best effort: report the failure and signal "no data" to the caller.
        log_message(f"Error reading Parquet file {most_recent_file}: {e}")
        return None
    return latest_frame

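**Note:** the next cell downloads the full log history from the API, which takes several minutes. If a Parquet snapshot has already been downloaded into the data directory, change the cell to load it instead of fetching again:
- Fresh download: `log_file = fetch_and_save_weekly_data()` followed by `latest_df = read_latest_weekly_data(log_file)`
- Reuse the most recent snapshot already in the directory: `latest_df = read_latest_weekly_data()`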

In [9]:
# Download a fresh snapshot, then load that exact file back into memory.
log_file = fetch_and_save_weekly_data()
latest_df = read_latest_weekly_data(filename=log_file)

2025-05-16 14:45:04,790 - INFO - Total pages: 40 | Total records: 398073
2025-05-16 14:45:18,106 - INFO - Page 2 processed successfully.
2025-05-16 14:45:32,583 - INFO - Page 3 processed successfully.
2025-05-16 14:45:47,491 - INFO - Page 4 processed successfully.
2025-05-16 14:46:02,152 - INFO - Page 5 processed successfully.
2025-05-16 14:46:16,883 - INFO - Page 6 processed successfully.
2025-05-16 14:46:32,076 - INFO - Page 7 processed successfully.
2025-05-16 14:46:46,659 - INFO - Page 8 processed successfully.
2025-05-16 14:47:01,718 - INFO - Page 9 processed successfully.
2025-05-16 14:47:17,005 - INFO - Page 10 processed successfully.
2025-05-16 14:47:32,667 - INFO - Page 11 processed successfully.
2025-05-16 14:47:47,789 - INFO - Page 12 processed successfully.
2025-05-16 14:48:02,403 - INFO - Page 13 processed successfully.
2025-05-16 14:48:17,167 - INFO - Page 14 processed successfully.
2025-05-16 14:48:31,744 - INFO - Page 15 processed successfully.
2025-05-16 14:48:47,163 -

In [10]:
# Preview the first five rows of the loaded snapshot.
latest_df.head(n=5)

Unnamed: 0,log_id,user_id,audio_id,title,createdAt
0,1,1475,51450,JAAN-E-GAZAB (Vocals only),2024-12-30T20:21:36.390Z
1,2,1475,604,Ya Aqsa,2024-12-30T20:30:39.000Z
2,17,2333,51450,JAAN-E-GAZAB (Vocals only),2025-01-02T12:14:01.832Z
3,18,2381,22692,Salam us par ke jisne bekaso ki | Reel,2025-01-02T12:16:36.017Z
4,19,2381,22691,Ek din Jibreel se kehne lage | The City School...,2025-01-02T12:17:06.251Z


In [11]:
# Keep only the two columns needed to build interactions. Copying the column
# selection (rather than selecting from a copy) gives an independent frame, so
# the assignments below do not trigger pandas' SettingWithCopyWarning.
df = latest_df[['user_id', 'audio_id']].copy()
df['user_id'] = pd.to_numeric(df['user_id'])
df['audio_id'] = pd.to_numeric(df['audio_id'])

In [12]:
# Aggregate play counts: one row per (user, audio) pair.
log_message("Processing interaction data...")
interactions = df.groupby(['user_id', 'audio_id']).size().reset_index(name='count')
# Confidence weight: 1 + 40 * log(1 + count).
interactions['confidence'] = 1 + 40 * np.log1p(interactions['count'])

# Keep only audios with at least MIN_LISTENERS_PER_AUDIO distinct listeners.
# With the value 1 no audio is dropped, because every audio present in
# `interactions` has at least one listener; raise it to 2 to actually filter
# out audios that only a single user listened to.
MIN_LISTENERS_PER_AUDIO = 1
item_user_counts = interactions.groupby('audio_id')['user_id'].nunique()
valid_audio_ids = item_user_counts[item_user_counts >= MIN_LISTENERS_PER_AUDIO].index
# .copy() makes the filtered frame independent, so adding the index columns
# below does not trigger SettingWithCopyWarning.
interactions = interactions[interactions['audio_id'].isin(valid_audio_ids)].copy()

# Map user/audio IDs to contiguous 0-based indices (order of first appearance).
user_map = {user_id: idx for idx, user_id in enumerate(interactions['user_id'].unique())}
item_map = {audio_id: idx for idx, audio_id in enumerate(interactions['audio_id'].unique())}
inv_item_map = {idx: audio_id for audio_id, idx in item_map.items()}

ITEM_MAP_FILE = f"{MAP_FOLDER}/item_map_{year}_week{week_number}_{timestamp_str}.pkl"
INV_ITEM_MAP_FILE = f"{MAP_FOLDER}/inv_item_map_{year}_week{week_number}_{timestamp_str}.pkl"

# Persist the audio-id maps next to the models; user_map is kept in memory only.
joblib.dump(item_map, ITEM_MAP_FILE)
log_message(f"Saved item_map to: {ITEM_MAP_FILE}")
joblib.dump(inv_item_map, INV_ITEM_MAP_FILE)
log_message(f"Saved inv_item_map to: {INV_ITEM_MAP_FILE}")

interactions['user_index'] = interactions['user_id'].map(user_map)
interactions['item_index'] = interactions['audio_id'].map(item_map)

# Create sparse item-user matrix: rows are audios, columns are users.
# The shape is stated explicitly so the orientation is visible; it equals the
# shape scipy would infer because both index ranges are contiguous.
log_message("Creating sparse matrix...")
matrix = coo_matrix(
    (interactions['confidence'], (interactions['item_index'], interactions['user_index'])),
    shape=(len(item_map), len(user_map)),
).tocsr()

MATRIX_FILE = f"{MATRIX_FOLDER}/matrix_{year}_week{week_number}_{timestamp_str}.pkl"
joblib.dump(matrix, MATRIX_FILE)
log_message(f"Saved matrix to: {MATRIX_FILE}")

In [13]:
log_message("Training ALS model...")
# num_threads=0 tells implicit to use all available cores; it does not make the
# run reproducible. The fixed random_state does, by seeding the initial factors.
# NOTE(review): `matrix` is item x user. implicit >= 0.5 expects a user x item
# matrix in fit(), while older releases expected item x user — confirm that the
# installed version matches this orientation.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.1,
    iterations=20,
    num_threads=0,
    random_state=42,
)
als_model.fit(matrix)
als_model_name = f"{MODEL_DIR}/als_model_{year}_week{week_number}_{timestamp_str}.pkl"
joblib.dump(als_model, als_model_name)
log_message(f"Trained and saved ALS model: {als_model_name}")

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
log_message("Training KNN fallback model...")

# Cosine-distance neighbour search over the rows of `matrix`, used when the ALS
# lookup fails or yields nothing.
knn_model = NearestNeighbors(
    n_neighbors=10,
    metric='cosine',
    algorithm='auto',
)
knn_model.fit(matrix)

# Persist the fitted model under the same run suffix as the other artefacts.
knn_model_name = f"{MODEL_DIR}/knn_model_{year}_week{week_number}_{timestamp_str}.pkl"
joblib.dump(knn_model, knn_model_name)
log_message(f"Trained and saved KNN model: {knn_model_name}")

In [16]:
def _indices_to_audio_ids(indices, item_index: int, top_n: int) -> list:
    """Translate matrix row indices to audio IDs, skipping the query item itself."""
    audio_ids = [
        inv_item_map[int(idx)]
        for idx in indices
        if int(idx) != item_index and int(idx) in inv_item_map
    ]
    return [int(audio_id) for audio_id in audio_ids[:top_n]]


def recommend_similar_audios(audio_id: int, top_n: int = 5) -> dict:
    """Return top N similar audio IDs using ALS with KNN fallback.

    The latest ALS and KNN model files are looked up in ``MODEL_DIR`` and
    loaded on every call. The ID/index maps (``item_map``, ``inv_item_map``)
    and ``matrix`` are taken from the module-level variables of the current
    session.

    NOTE(review): the models come from disk while the maps and matrix come from
    memory; if the newest model files were written by a different run, the
    indices may not line up — confirm, or load the matching map files as well.

    Args:
        audio_id: Audio ID to find neighbours for.
        top_n: Maximum number of recommendations to return.

    Returns:
        dict: ``{'data': [audio_id, ...]}``. The list is empty when ``top_n``
        is not positive, the audio is unknown, the models cannot be found or
        loaded, or both ALS and KNN fail.
    """
    if top_n <= 0:
        return {'data': []}

    # Cheap membership test first: no need to load models for an unknown audio.
    if audio_id not in item_map:
        log_message(f"Audio ID {audio_id} not found in training data.")
        return {'data': []}

    # Look in the same directory the training cells write to.
    als_model_path = get_latest_file(MODEL_DIR, 'als', 'pkl')
    knn_model_path = get_latest_file(MODEL_DIR, 'knn', 'pkl')

    if not als_model_path or not knn_model_path:
        log_message("Could not load the latest ALS or KNN model.")
        return {'data': []}

    try:
        # joblib.load unpickles arbitrary objects: only load trusted model files.
        als_model = joblib.load(als_model_path)
        knn_model = joblib.load(knn_model_path)
        log_message(f"Loaded ALS model from: {als_model_path}")
        log_message(f"Loaded KNN model from: {knn_model_path}")
    except Exception as e:
        log_message(f"Error loading models: {e}")
        return {'data': []}

    item_index = item_map[audio_id]

    # Try ALS-based recommendations. One extra neighbour is requested because
    # the query item itself is filtered out of the result.
    # NOTE(review): unpacking into (ids, scores) matches the return shape of
    # implicit >= 0.5; if an older release returns (id, score) pairs this
    # raises and the KNN fallback below is used — confirm installed version.
    try:
        item_indices, _ = als_model.similar_items(item_index, N=top_n + 1)
        recs = _indices_to_audio_ids(item_indices, item_index, top_n)
        if recs:
            return {'data': recs}
    except Exception as e:
        log_message(f"ALS failed for Audio ID {audio_id}: {e}, falling back to KNN.")

    # Fallback to KNN if ALS fails or yields no useful results
    try:
        distances, indices = knn_model.kneighbors(matrix[item_index], n_neighbors=top_n + 1)
        return {'data': _indices_to_audio_ids(indices.flatten(), item_index, top_n)}
    except Exception as e:
        log_message(f"KNN also failed for Audio ID {audio_id}: {e}.")
        return {'data': []}



In [17]:
# Smoke test: request the default number of neighbours for audio 625.
recs = recommend_similar_audios(audio_id=625)
print(recs)

{'data': [18795, 18794, 18800, 1384, 722]}
