# EDA - Data Distribution

## System Settings

In [1]:
!pip install -r requirements.txt



In [2]:
# Invoke split_data.py's `run` command on res/train.json. The log it prints
# reports an original train/val split plus a masked val set.
# NOTE(review): presumably the results are written under arena_data/ (read
# later in this notebook) — confirm against split_data.py.
!python split_data.py run res/train.json

Reading data...

Total playlists: 115071
Splitting data...
Original train...
Original val...
Masked val...
Total: 23015, Song only: 6904, Song & Tags: 11508, Tags only: 3452, Title only: 1151


## Define functions

In [3]:
from utils import read_json

In [4]:
import enum


class QuestionType(enum.Enum):
    """Which of a playlist question's fields (songs, tags, title) are filled in.

    Members are numbered consecutively from 1 in declaration order.
    """

    # All three fields present.
    ALL = 1
    # Exactly two fields present.
    SONG_TAG = 2
    SONG_TITLE = 3
    TAG_TITLE = 4
    # Exactly one field present.
    SONG_ONLY = 5
    TAG_ONLY = 6
    TITLE_ONLY = 7
    # No field present.
    NOTHING = 8


def count_questions_by_type(questions):
    """Tally questions by which of their fields are filled in.

    A question is classified by three flags: whether ``songs`` is non-empty,
    whether ``tags`` is non-empty, and whether ``plylst_title`` differs from
    the empty string.

    Args:
        questions: Iterable of mappings, each providing the keys
            ``'songs'``, ``'tags'`` and ``'plylst_title'``.

    Returns:
        dict: One entry per ``QuestionType`` member, in declaration order,
        mapped to the number of questions of that type. Types that never
        occur are still present with a count of 0.

    Raises:
        KeyError: If a question lacks one of the three keys.
    """
    # Lookup keyed by the presence triple (has_songs, has_tags, has_title).
    presence_to_type = {
        (True, True, True): QuestionType.ALL,
        (True, True, False): QuestionType.SONG_TAG,
        (True, False, True): QuestionType.SONG_TITLE,
        (False, True, True): QuestionType.TAG_TITLE,
        (True, False, False): QuestionType.SONG_ONLY,
        (False, True, False): QuestionType.TAG_ONLY,
        (False, False, True): QuestionType.TITLE_ONLY,
        (False, False, False): QuestionType.NOTHING,
    }

    # Start every type at zero so absent types still show up in the result.
    tally = dict.fromkeys(QuestionType, 0)

    for entry in questions:
        # Fetch all three fields first, then inspect them.
        songs, tags, title = entry['songs'], entry['tags'], entry['plylst_title']
        presence = (len(songs) > 0, len(tags) > 0, title != "")
        tally[presence_to_type[presence]] += 1

    return tally


def print_question_type_counts(counts):
    """Print one ``NAME: count`` line per entry of ``counts``.

    Args:
        counts: Mapping whose keys expose a ``name`` attribute (e.g. enum
            members) and whose values are the counts to display. Entries are
            printed in the mapping's iteration order.
    """
    for question_type, total in counts.items():
        line = f"{question_type.name}: {total}"
        print(line)

In [5]:
def print_question_types(filename):
    """Print the question-type breakdown of one JSON file.

    Loads ``filename`` with ``read_json``, tallies its entries with
    ``count_questions_by_type`` and prints the tallies with
    ``print_question_type_counts``.

    Args:
        filename: Path handed unchanged to ``read_json``.
    """
    print_question_type_counts(count_questions_by_type(read_json(filename)))

## Check question-type distribution per dataset

In [6]:
# Question-type breakdown of res/train.json.
print_question_types('res/train.json')

ALL: 115071
SONG_TAG: 0
SONG_TITLE: 0
TAG_TITLE: 0
SONG_ONLY: 0
TAG_ONLY: 0
TITLE_ONLY: 0
NOTHING: 0


In [7]:
# Question-type breakdown of res/val.json.
print_question_types('res/val.json')

ALL: 0
SONG_TAG: 8975
SONG_TITLE: 0
TAG_TITLE: 2628
SONG_ONLY: 9661
TAG_ONLY: 2
TITLE_ONLY: 1745
NOTHING: 4


In [8]:
# Question-type breakdown of res/test.json.
print_question_types('res/test.json')

ALL: 0
SONG_TAG: 4190
SONG_TITLE: 0
TAG_TITLE: 1232
SONG_ONLY: 4507
TAG_ONLY: 0
TITLE_ONLY: 809
NOTHING: 2


In [9]:
# Question-type breakdown of arena_data/questions/val.json.
# NOTE(review): presumably this file is produced by the split_data.py run at
# the top of the notebook — confirm. In the recorded output every entry keeps
# its title (no SONG_TAG / SONG_ONLY / TAG_ONLY), unlike res/val.json; verify
# whether the split is expected to mask titles as well.
print_question_types('arena_data/questions/val.json')

ALL: 8859
SONG_TAG: 0
SONG_TITLE: 9550
TAG_TITLE: 2618
SONG_ONLY: 0
TAG_ONLY: 0
TITLE_ONLY: 1988
NOTHING: 0
