# Identify most popular tags across countries

In [1]:
# common imports
import csv
import json
import random
from os import getenv
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from pymongo import MongoClient

In [2]:
# Read the MongoDB credentials from the environment and fail early with a
# clear message instead of silently connecting as the literal user "None".
mongo_username = getenv('MONGO_USERNAME')
mongo_password = getenv('MONGO_PASSWORD')
if mongo_username is None or mongo_password is None:
    raise RuntimeError("MONGO_USERNAME and MONGO_PASSWORD must be set in the environment")

# Credentials embedded in a MongoDB URI must be percent-escaped; otherwise
# characters such as '@', ':', '/' or '%' in the password corrupt the URI.
mongo_uri = (
    f"mongodb://{quote_plus(mongo_username)}:{quote_plus(mongo_password)}"
    "@youtube-trends-mongodb:27017"
)
client = MongoClient(mongo_uri)
# Listing the databases forces a round trip, which verifies the connection.
print(client.list_database_names())

['admin', 'config', 'local', 'youtube_trend_db']


In [3]:
# Handle to the database holding one collection per country code.
db = client["youtube_trend_db"]

In [23]:
def get_tags(data, country):
    """Count how often each tag occurs in the ``"tags"`` entries of *data*.

    Every entry of ``data["tags"]`` is treated as a ``|``-separated string of
    tag names. Each name is lower-cased and stripped of surrounding
    whitespace before counting; names that are empty after stripping are
    ignored, as are entries that are not strings (e.g. ``None``/``NaN`` for
    records without tags).

    Args:
        data: Mapping or DataFrame whose ``"tags"`` item is an iterable of
            pipe-separated tag strings.
        country: Not used by the computation; kept so existing callers
            continue to work.

    Returns:
        A dict mapping each tag (``str``) to its number of occurrences
        (``int``), with keys in sorted order.
    """
    names = []
    for raw_tag in data["tags"]:
        # Missing values are not strings and carry no tags.
        if not isinstance(raw_tag, str):
            continue
        for item in raw_tag.split("|"):
            name = item.lower().strip()
            # Filter AFTER stripping so whitespace-only segments are dropped.
            if name:
                names.append(name)

    if not names:
        return {}

    tags, counts = np.unique(names, return_counts=True)

    # Convert numpy scalars to built-in types so the result is plain Python
    # (and therefore directly JSON-serialisable).
    return {str(tag): int(count) for tag, count in zip(tags, counts)}

# Tree-shaped result: root -> one child per country -> that country's top tags.
response = {"name": "Top Tags", "children": []}
countries = ['us']

for country in countries:
    # Pull every document of the country's collection into a DataFrame.
    data = pd.DataFrame.from_records(db[country].find({}))

    # Tag -> count mapping, reshaped into two parallel columns.
    tags = get_tags(data, country)
    data = {"name": list(tags), "count": list(tags.values())}

    # Keep only the 20 most frequent tags, highest count first.
    df = (
        pd.DataFrame(data=data)
        .sort_values(by=['count'], ascending=False)
        .head(20)
    )

    country_node = {"name": country, "children": df.to_dict(orient="records")}
    response["children"].append(country_node)

print(response)

{'name': 'Top Tags', 'children': [{'name': 'us', 'children': [{'name': '[none]', 'count': 3614}, {'name': 'funny', 'count': 1457}, {'name': 'minecraft', 'count': 1015}, {'name': 'comedy', 'count': 933}, {'name': 'football', 'count': 589}, {'name': 'challenge', 'count': 580}, {'name': 'rap', 'count': 568}, {'name': 'highlights', 'count': 568}, {'name': 'news', 'count': 554}, {'name': 'vlog', 'count': 515}, {'name': 'music', 'count': 506}, {'name': 'nba', 'count': 485}, {'name': 'tiktok', 'count': 451}, {'name': 'hip hop', 'count': 448}, {'name': 'gaming', 'count': 436}, {'name': 'sports', 'count': 408}, {'name': 'fortnite', 'count': 400}, {'name': 'animation', 'count': 398}, {'name': 'basketball', 'count': 366}, {'name': 'family', 'count': 357}]}]}
