# Read from Query Resultset

In [20]:
import numpy as np
import pandas as pd

# get and preprocess wikidata
cols = ["link", "name", "sitelinks"]
data_groups = pd.read_json("../data/wikidata/groups.json")
data_groups.musicalGroupLabel = data_groups.musicalGroupLabel.str.lower()
data_groups.columns = cols
data_groups["type"] = "Group"


# Read from Dump

In [184]:
import numpy as np
import pandas as pd

# read dataset
data = pd.read_parquet("../data/dataset/shs100k2/data.parquet")
data["performer_single"] = data.performer.apply(lambda x: x[0] if len(x) > 0 else '')
cols = ['set_id', 'yt_id', 'title', 'performer', 'performer_single', 'viewcount', 'split', 'TEXT', 'IOB']
data = data[cols]


In [185]:
def preprocess_dump(data: pd.DataFrame) -> pd.DataFrame:
    # basic preprocessing
    def get_english_name_or_first(label_dict):
        en_name = label_dict.get("en")
        if en_name:
            name = en_name.get("value").lower()
        else:
            other_names = list(label_dict.values())
            if len(other_names) > 0:
                name = list(label_dict.values())[0]["value"]
            else:
                name = ""
        return name
    data["name"] = data.labels.apply(get_english_name_or_first)
    data["nsitelinks"] = data.sitelinks.apply(len)
    data = data[["id", "name", "nsitelinks"]]
    data["type"] = "Work"
    
    # rank
    def get_rank(data: pd.DataFrame, column: str) -> pd.Series:
        data = data.sort_values(by=column, ascending=False)
        ranks = data[column].rank(method='min', ascending=False)
        return ranks
    data["rank"] = get_rank(data, "nsitelinks")

    #  exposure
    S = len(data)
    def exposure(rank):
        return np.log(S) - np.log(rank)
    data["exposure"] = data["rank"].apply(exposure)

    return data

# groups
data_groups = pd.read_json("../data/wikidata/groups.jsonl")
data_groups = preprocess_dump(data_groups)

# humans
data_humans = pd.read_json("../data/wikidata/soloartists.jsonl")
data_humans = preprocess_dump(data_humans)

data_wikidata = pd.concat([data_groups, data_humans], axis=0, ignore_index=True).drop_duplicates(subset="id")
data_wikidata


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["type"] = "Work"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["rank"] = get_rank(data, "nsitelinks")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["exposure"] = data["rank"].apply(exposure)


ValueError: Expected object or value

In [None]:
data_wikidata[["type"]].groupby("type").value_counts()


type
Group      10000
Person    119054
Name: count, dtype: int64

In [None]:
data = pd.merge(data, data_wikidata, how="left", left_on="performer_single", right_on="name")


# Works

In [179]:
import pandas as pd

data_works = pd.read_json("../data/wikidata/works.jsonl").dropna(subset="labels")


In [183]:

data = preprocess_dump(data_works)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["type"] = "Work"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["rank"] = get_rank(data, "nsitelinks")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["exposure"] = data["rank"].apply(exposure)
