In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Create the map between the DBpedia categories and presentation categories:

In [2]:
# Text file pairing each presentation category with its DBpedia categories.
presentation_dbpedia_map_file = (
    "../../data/presentation_categories_filtered_with_dbpedia_map.txt"
)

In [3]:
# Build a lookup from DBpedia category name -> presentation category name.
# Each mapping line is expected to look like:
#     "<presentation category> ... , <DBpedia cat> <DBpedia cat> ..."
dbpedia_cat_map = {}
with open(presentation_dbpedia_map_file, "r") as f:
    for cur_line in f:
        # Split once on the exact " , " delimiter. Guarding on the same
        # delimiter that is used for splitting means a line with a stray
        # comma (but no " , ") is skipped instead of raising IndexError.
        parts = cur_line.split(" , ")
        if len(parts) < 2:
            continue

        # Only the first whitespace-separated token on the left-hand side
        # names the presentation category.
        present_tokens = parts[0].split()
        if not present_tokens:
            continue
        base_present_cat = present_tokens[0]

        # split() without arguments drops the trailing newline and any
        # repeated spaces, so no empty-string key ends up in the map.
        for cur_cat in parts[1].split():
            dbpedia_cat_map[cur_cat] = base_present_cat

In [4]:
# Show the DBpedia-category -> presentation-category lookup as a sanity check.
print(dbpedia_cat_map)

{'Animal': 'nature', 'Plant': 'nature', 'BodyOfWater': 'nature', 'FloweringPlant': 'nature', 'Artist': 'art', 'EducationalInstitution': 'education', 'Infrastructure': 'engineering', 'Cartoon': 'games', 'Comic': 'games', 'SportsTeam': 'sports', 'SportsEvent': 'sports', 'SportsTeamSeason': 'sports', 'Software': 'technology', 'CelestialBody': 'astrology-astronomy', 'Satellite': 'astrology-astronomy', 'PeriodicalLiterature': 'history', 'RouteOfTransportation': 'transportation', 'MusicalWork': 'music', 'Song': 'music', 'MusicalArtist': 'music'}


### Filter the DBpedia data:

In [5]:
def filterData(input_file_name, dbpedia_cat_map, sample_size=100, le=None, random_state=None):
    """Load a DBpedia CSV, keep only mapped categories, and downsample per topic.

    Parameters
    ----------
    input_file_name : str or file-like
        Anything ``pd.read_csv`` accepts. The CSV must provide a "text"
        column and an "l2" column holding the DBpedia category.
    dbpedia_cat_map : dict
        Maps values of the "l2" column to a topic name. Rows whose "l2"
        value is not a key of this dict are dropped.
    sample_size : int, default 100
        Maximum number of rows kept for each topic.
    le : object with a ``transform`` method, optional
        Already-fitted label encoder used to produce "topic_label". When
        ``None``, a new ``LabelEncoder`` is fitted on this file's topics.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible splits.

    Returns
    -------
    (pandas.DataFrame, encoder)
        A frame with columns "text", "topic" and "topic_label" holding at
        most ``sample_size`` rows per topic (original row index preserved),
        and the encoder that produced "topic_label".
    """
    df_raw = pd.read_csv(input_file_name)

    # Build a brand-new frame rather than adding columns to a filtered slice,
    # which avoids chained-assignment warnings. Series.map does the dictionary
    # lookup without a Python-level row-wise apply.
    in_scope = df_raw["l2"].isin(list(dbpedia_cat_map))
    df_cur = pd.DataFrame({
        "text": df_raw.loc[in_scope, "text"],
        "topic": df_raw.loc[in_scope, "l2"].map(dbpedia_cat_map),
    })

    if le is None:
        le = LabelEncoder()
        le.fit(df_cur["topic"])
    df_cur["topic_label"] = le.transform(df_cur["topic"])

    # Downsample each topic to at most sample_size rows. Pieces are collected
    # in a list and concatenated once instead of growing a frame in the loop.
    sampled_parts = []
    for topic, topic_count in df_cur["topic"].value_counts().items():
        topic_rows = df_cur[df_cur["topic"] == topic]
        sampled_parts.append(
            topic_rows.sample(min(topic_count, sample_size), random_state=random_state)
        )

    if not sampled_parts:
        # No row matched the category map: pd.concat([]) would raise, so
        # return the (empty) frame with the expected columns instead.
        return df_cur, le

    return pd.concat(sampled_parts), le

In [6]:
# Locations of the raw DBpedia train / validation / test CSV files.
train_data_path, val_data_path, test_data_path = (
    "../../data/dbpedia_data/DBPEDIA_train.csv",
    "../../data/dbpedia_data/DBPEDIA_val.csv",
    "../../data/dbpedia_data/DBPEDIA_test.csv",
)

In [7]:
# Training split: no encoder is passed in, so filterData fits a new one on
# the training topics; it is kept in `le` for the later splits.
df_train, le = filterData(
    input_file_name=train_data_path,
    dbpedia_cat_map=dbpedia_cat_map,
    le=None,
    sample_size=500,
)
df_train.to_csv(
    "../../data/topic_classification_data/train.csv",
    index=False,
)

In [8]:
# Validation split: pass the existing encoder `le` so labels are produced by
# the same encoder as the earlier split.
df_val, le = filterData(
    input_file_name=val_data_path,
    dbpedia_cat_map=dbpedia_cat_map,
    le=le,
    sample_size=100,
)
df_val.to_csv(
    "../../data/topic_classification_data/val.csv",
    index=False,
)

In [9]:
# Test split: pass the existing encoder `le` so labels are produced by the
# same encoder as the earlier splits.
df_test, le = filterData(
    input_file_name=test_data_path,
    dbpedia_cat_map=dbpedia_cat_map,
    le=le,
    sample_size=100,
)
df_test.to_csv(
    "../../data/topic_classification_data/test.csv",
    index=False,
)

In [10]:
# Report the (rows, columns) shape of each sampled split.
print(df_train.shape, df_val.shape, df_test.shape)

(5500, 3) (1100, 3) (1100, 3)
