## Getting topics from FITS dataset

In [1]:
## opening txt files inside a folder
import os

relative_path = '/fits/'
# Resolve the dataset folder against the directory the notebook was started from.
current_path = os.getcwd()
absolute_path = current_path + relative_path
print(absolute_path)

# Collect every .txt file located one level down, i.e. inside a sub-folder of the dataset folder.
# os.listdir returns entries in arbitrary order, so both listings are sorted: this keeps the
# order of `files` (and of every record loaded from them later) reproducible across runs/machines.
files = []
for folder in sorted(os.listdir(absolute_path)):
    folder_path = os.path.join(absolute_path, folder)
    # entries that are not directories (plain files directly under the dataset folder) are skipped
    if not os.path.isdir(folder_path):
        continue
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.txt'):
            files.append(os.path.join(folder_path, file))
    

/home/rlazzaroni/large_conspiracy_model/fits_personas/fits/


In [4]:
import pandas as pd
import json

# read all files
# Each file holds one JSON document per line; all parsed records are pooled into `data`.
data = []
for file in files:
    # JSON text is UTF-8; naming the encoding avoids depending on the platform default.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            # tolerate blank lines (e.g. an empty last line), which json.loads would reject
            if not line.strip():
                continue
            data.append(json.loads(line))

In [6]:
# build one dataframe out of the parsed records: one row per record
df = pd.DataFrame(data=data)

In [8]:
# how many distinct domains appear in the data (missing values are not counted)
print(df["domain"].nunique(dropna=True))

56


In [13]:
# how many distinct generic topics appear in the data (missing values are not counted)
print(df["generic_topic"].nunique(dropna=True))

348


In [21]:
# some random topics
# A fixed random_state makes the printed sample identical on every run; the sample size is
# capped at the number of rows so the cell also works on frames with fewer than 10 rows.
print(df['generic_topic'].sample(n=min(10, len(df)), random_state=42).values)

['cooking' 'sex in the city' 'United Nations' 'flowers'
 'Harvard University' 'American sights' 'picking schools' 'running'
 'Ken Burns' 'King Henry V']


In [23]:
# print some domains
# A fixed random_state makes the printed sample identical on every run; the sample size is
# capped at the number of rows so the cell also works on frames with fewer than 10 rows.
print(df['domain'].sample(n=min(10, len(df)), random_state=42).values)

['Sports' 'Cooking Recipes' 'Parenting' 'Philosophy/Psychology'
 'Relationship' 'Travel' 'Literature' 'History' 'Cooking Recipes'
 'Science/STEM']


In [33]:
# keep only the (generic_topic, domain) columns and collapse repeated pairs,
# retaining the first occurrence of each pair
df_topics = df.loc[:, ['generic_topic', 'domain']].drop_duplicates(keep='first')

In [37]:
# summary counts: distinct (topic, domain) pairs, distinct topics, distinct domains
# (a single print call; sep="\n" puts each figure on its own line)
print(
    f"There are {df_topics.shape[0]} unique pairs of generic topics and domains",
    f"N topics: {df['generic_topic'].nunique()}",
    f"N domains: {df['domain'].nunique()}",
    sep="\n",
)

There are 379 unique pairs of generic topics and domains
N topics: 348
N domains: 56


In [38]:
# number of generic topics listed under each domain, largest domains first
topics_per_domain = df_topics.groupby('domain')['generic_topic'].size()
topics_per_domain.sort_values(ascending=False)

domain
History                         30
Science/STEM                    21
Cooking Recipes                 17
Travel                          17
Home deco/repairs               15
Gardening                       15
Fitness                         15
Movie                           13
Celebrity                       13
Pets                            13
Medical                         12
Music                           12
Finance                         10
Literature                      10
Sports                          10
Tech/Electronics                 9
Movie/TV                         8
Personal investment              8
Politics                         8
Music songs                      7
Car                              7
Food/Nutrition                   7
Philosophy                       6
Games                            6
Business                         5
Gaming                           5
Food/Drink                       5
Relationship                     5
Education/Car

In [42]:
# sanity check: rows that repeat an earlier (topic, domain) pair; an empty frame means none
df_topics.loc[df_topics.duplicated(keep='first')]

Unnamed: 0,generic_topic,domain


In [43]:
# sanity check: count of missing values in each column
df_topics.isna().sum(axis=0)

generic_topic    0
domain           0
dtype: int64

In [45]:
# write the (topic, domain) pairs to disk as a JSON list with one object per row
df_topics.to_json(path_or_buf='topics.json', orient='records')

## Generated personas

In [4]:
## get personas from jsonl
import pandas as pd
# JSON Lines file to load: each line is parsed as one JSON record
filepath = "generated_personas.jsonl"
personas_df = pd.read_json(path_or_buf=filepath, lines=True)

In [5]:
# show the loaded personas dataframe as this cell's output
personas_df

Unnamed: 0,persona,topics,domain
0,$name$ moves through life with the precision o...,"[cleaning carpet, reflooring a room, World War I]",Health/Nutrition
1,$name$ moves through the world with a curiosit...,"[F. Scott Fitzgerald, movie cast members]",Politics
2,$name$ has always had a precise and analytical...,"[Gay Marriage, computers, Skin Cancer]",Finance
3,$name$ has always found solace in the unexpect...,"[Anne Sexton, Mountain Biking, Travel, candy]",Games
4,$name$ carries the spirit of curiosity and lea...,[Healthy foods],Education
...,...,...,...
788,$name$ has always been one to follow the rhyth...,"[Musical trends, Apple Company]",Health/Nutrition
789,$name$ walks through life with a profound resp...,[landscaping],Health/Nutrition
790,$name$ is known in their circle for a unique b...,"[earth's moon, Literature, potty training, Pur...",Clothing/Crocheting/knitting
791,"$name$ wakes up to the soft glow of dawn, thei...","[Home education, Stretching, electric cars, Va...",Sports


In [7]:
# preview the first five personas
personas_df.head(n=5)

Unnamed: 0,persona,topics,domain
0,$name$ moves through life with the precision o...,"[cleaning carpet, reflooring a room, World War I]",Health/Nutrition
1,$name$ moves through the world with a curiosit...,"[F. Scott Fitzgerald, movie cast members]",Politics
2,$name$ has always had a precise and analytical...,"[Gay Marriage, computers, Skin Cancer]",Finance
3,$name$ has always found solace in the unexpect...,"[Anne Sexton, Mountain Biking, Travel, candy]",Games
4,$name$ carries the spirit of curiosity and lea...,[Healthy foods],Education
