## Libraries

In [50]:
# import libraries
from fileinput import filename
from json import load
from typing import List
import string
import pandas as pd
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import random
from datetime import datetime, timedelta
import numbers
from recommender_system_simple import cleaner


## (ABC) Dataset

In [51]:
# Load data: directory holding the ABC pickles, and one pickle file per genre
pathABC = 'data/ABC/'
fileNamesABC = [
  'arts.pkl',
  'comedy.pkl',
  'documentary.pkl',
  'drama.pkl',
  'education.pkl',
  'family.pkl',
  'kids.pkl',
  'movies.pkl',
  'news.pkl',
  'panel-discussion.pkl',
]

def loadMultiplePKLsIntoOneDF(fileNames: List[str], path: str) -> pd.DataFrame:
  """Load several pickled dataframes from one directory and stack them row-wise.

  Args:
    fileNames: names of the pickle files to read, relative to ``path``.
    path: directory containing the files (with or without a trailing separator).

  Returns:
    A single dataframe with the rows of every file, in the order given, and a
    fresh 0..n-1 index.

  Raises:
    ValueError: raised by ``pd.concat`` when ``fileNames`` is empty.
  """
  import os  # local import keeps this block self-contained

  # Use the ``path`` argument (previously the global ``pathABC`` was read instead,
  # so passing any other directory had no effect).
  # NOTE: unpickling can execute arbitrary code; only load trusted files.
  dfs = [pd.read_pickle(os.path.join(path, file)) for file in fileNames]
  return pd.concat(dfs, ignore_index=True)

# Build the working dataframe in one pass:
#   1. load every genre file into a single frame
#   2. rename the 'category' column to 'genre'
#   3. run the project cleaner over the result
dfABC = cleaner(
  loadMultiplePKLsIntoOneDF(fileNamesABC, pathABC).rename(columns={'category': 'genre'})
)

# persist the cleaned, combined frame so later cells can reload it
dfABC.to_pickle('data/ABC/combined.pkl')


len big_df step 0 13885
len big_df step 1 1014
len big_df step 2 1014
len big_df step 3 1014
len big_df step 4 1014


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df['tags'] = big_df['tags'].apply(correct_and_lower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df['tags2'] = big_df['tags2'].apply(correct_and_lower)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  big_df['deduplicated_description'] = big_df.apply(lambda x: combine_and_deduplicate(x['de

In [54]:
# write the combined dataframe to disk again (same target file as the earlier save)
dfABC.to_pickle('data/ABC/combined.pkl')

# get list of dataframe columns
dfABC.columns

Index(['genre', 'title', 'series', 'episode_name', 'description',
       'description2', 'tags', 'image', 'more', 'tags2', 'publication_date',
       'rating', 'duration_sec', 'deduplicated_description',
       'deduplicated_tags'],
      dtype='object')

## Data exploration

In [11]:
# show the first rows of the ABC dataset, restricted to the key columns
dfABC[['genre', 'title', 'description', 'rating']].head()

Unnamed: 0,genre,title,description,rating
0,Comedy,Bangarra's World,Join the dancers and creatives during Bangarra...,G
1,Comedy,Mission Songs Project,Jessie Lloyd's Mission Songs Project revives s...,G
2,Comedy,The Story of Film: An Odyssey,A worldwide guided tour of the greatest movies...,M
3,Comedy,Ballet Now,This film captures the creative process behind...,M
4,Comedy,Julia Zemiro's Home Delivery,Julia Zemiro invites notable personalities to ...,G


In [12]:
# dataframe dimensions as (rows, columns)
dfABC.shape

(13885, 13)

In [10]:
# descriptive statistics per column (count / unique / top / freq for the text columns)
dfABC.describe()

Unnamed: 0,category,title,series,episode_name,description,description2,tags,image,more,tags2,publication_date,rating,duration_sec
count,13885,13885,13885,13885,13885,13885,13885,13885,13885,13885,13885,13885,13885
unique,9,1014,658,9400,1004,10242,175,11342,446,1712,6020,5,2528
top,Kids,Australian Story,No data found,Series 1 Episode 1,"Putting the 'real' back into reality TV, the a...",Bluey is an inexhaustible six-year-old Blue He...,[ABC Kids],https://cdn.iview.abc.net.au/thumbs/i/nu/NU222...,No more information found,[abc4kids],2021-06-11 07:00:00,G,420
freq,4298,284,599,149,284,50,3224,44,9355,425,280,8906,697


In [16]:
# number of rows per genre (the 'genre' column is the renamed 'category' column)
dfABC['genre'].value_counts()

category
Kids                4298
Education           2529
Family              2056
Drama               1175
News                1173
Documentary         1122
Comedy               912
Panel Discussion     477
Movies               143
Name: count, dtype: int64

In [None]:
# word cloud over the description column: all descriptions joined into one text
description_text = ' '.join(dfABC['description'])
wordcloud = WordCloud().generate(description_text)

# render the cloud as an image without axis ticks or frame
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

## Create Users

In [56]:
# genre labels exactly as stored in the dataframe; they are deliberately not
# lower-cased because the persona rules below compare against these spellings
genres = dfABC['genre'].unique()

# each persona maps every genre to a selection weight: a higher weight makes
# shows of that genre more likely to be drawn for users of that persona
myPersonas = {}

myPersonas['comedy lover'] = {
  genre: (0.6 if genre == 'Comedy' else 0.2)
  for genre in genres
}

myPersonas['young families'] = {
  genre: (0.8 if genre in ['Family', 'Kids'] else 0.3)
  for genre in genres
}

myPersonas['information seeker'] = {
  genre: (0.9 if genre in ['News', 'Documentary', 'Panel Discussion', 'Education'] else 0.1)
  for genre in genres
}

myPersonas['drama hater'] = {
  genre: (0.1 if genre == 'Drama' else 0.5)
  for genre in genres
}

myPersonas['movie buff'] = {
  genre: (0.7 if genre == 'Movies' else 0.1)
  for genre in genres
}

In [57]:
# functions to create user data based on the ABC show dataset, using persona preferences

# NOTE(review): the two names imported below do not appear to be used by any
# visible cell; they look like editor auto-imports — confirm before removing.
from matplotlib.style import available
from numpy import minimum


# Seed Python's `random` module so the generated user data is reproducible.
# Only `random` is seeded here; numpy's generator is left untouched.
random.seed(42)


def assign_movie_languages(movies):
    """Tag every row of ``movies`` with the language code "EN".

    The column ``available_languages`` is written onto the dataframe that was
    passed in (no copy is made), and that same dataframe is returned.
    """
    language_code = "EN"
    movies['available_languages'] = language_code
    return movies

# NOTE: assign_movie_languages writes the new column onto dfABC itself and returns
# that same object, so movies_with_languages and dfABC refer to one dataframe.
movies_with_languages = assign_movie_languages(dfABC)

def selectShowForPersona(persona, selected_movies):
  """Draw one not-yet-selected show at random, weighted by the persona's genre preferences.

  Args:
    persona: key into the module-level ``myPersonas`` mapping.
    selected_movies: show records (dicts) already chosen; any record equal to
      one of these is excluded from the draw.

  Returns:
    One show record (a dict as produced by ``DataFrame.to_dict(orient='records')``),
    or ``None`` when every show has already been selected.

  Raises:
    KeyError: if ``persona`` is unknown, or a show's genre has no weight for it.
  """
  preferences = myPersonas[persona]

  all_shows = movies_with_languages.to_dict(orient='records')
  available_shows = [show for show in all_shows if show not in selected_movies]

  # random.choices raises IndexError on an empty population; report an
  # exhausted catalogue as None instead, which is what the caller checks for.
  if not available_shows:
    return None

  # one weight per remaining show, looked up by that show's genre
  weights = [preferences[show['genre']] for show in available_shows]
  return random.choices(available_shows, weights=weights, k=1)[0]

def randomWatchTimePercentage():
    """Return a random whole-number watch percentage between 30 and 100, both inclusive."""
    lowest, highest = 30, 100
    watched = random.randint(lowest, highest)
    return watched

def randomPersonaPercentageDistribution(personas):
    """Split 100 percent at random across the given personas.

    Every persona receives a whole-number share of at least 1, and the shares
    always add up to exactly 100 (the last persona takes whatever remains;
    previously it received another random draw, so the total could fall short).

    Args:
        personas: mapping whose keys are the persona names; values are ignored.

    Returns:
        dict mapping each persona name to its percentage, in the mapping's key
        order. An empty mapping yields an empty dict.

    Raises:
        ValueError: if there are more than 100 personas, since each one needs
            at least 1 percent.
    """
    personasKeys = list(personas.keys())
    numberOfPersonas = len(personasKeys)
    if numberOfPersonas > 100:
        raise ValueError("cannot give at least 1 percent to more than 100 personas")

    personaDistributionPercentage = {}
    remaining = 100
    for i, key in enumerate(personasKeys):
        personasLeft = numberOfPersonas - i - 1
        if personasLeft == 0:
            # last persona absorbs the remainder so the total is exactly 100
            percentage = remaining
        else:
            # leave at least 1 percent for each persona still to be served
            percentage = random.randint(1, remaining - personasLeft)
        personaDistributionPercentage[key] = percentage
        remaining -= percentage

    return personaDistributionPercentage


# Fixed share of users per persona, in percent (the five values sum to 100).
# generateUsersData reads this mapping to decide how many users each persona gets.
distribution = {
    'comedy lover': 20,
    'young families': 30,
    'information seeker': 15,
    'drama hater': 10,
    'movie buff': 25,
}


def generateUsersData(numUsers=100): #set number of users here
    """Generate synthetic user/show interactions for every persona.

    Users are split across personas according to the module-level
    ``distribution`` mapping (percent per persona). Each user gets a random
    number of interactions (40 to 200), each with a show drawn by
    ``selectShowForPersona`` that the user has not been given before.

    Args:
        numUsers: total number of users to spread over the personas. Each
            persona's count is rounded down, so the sum can be below ``numUsers``.

    Returns:
        pandas DataFrame with one row per interaction and the columns
        ``user_id``, ``genre``, ``title``, ``description``, ``age_rating``,
        ``duration_sec``, ``persona``, ``rating`` and ``watched_percentage``.
    """
    personaDistribution = distribution #randomPersonaPercentageDistribution(myPersonas)

    userData = []

    for persona, percent in personaDistribution.items():
        # Multiply before dividing and floor-divide: the float form
        # int((percent / 100.0) * numUsers) can undershoot by one
        # (e.g. 29 percent of 100 evaluates to 28.999... and truncates to 28).
        numPersonaUsers = int(percent * numUsers // 100)

        for user_index in range(numPersonaUsers):
            # randomize the number of interactions (i.e. how many movies/items a user interacted with)
            numInteractions = random.randint(40, 200)
            user_watched_movies = []
            for _ in range(numInteractions):
                movie = selectShowForPersona(persona, user_watched_movies)

                # nothing was selected: skip before recording anything
                if movie is None:
                    continue

                # remember the show so it doesn't get selected again for this user
                user_watched_movies.append(movie)

                userInteraction = {
                    "user_id": f"user_{persona}_{user_index+1}",
                    "genre": movie["genre"],
                    "title": movie["title"],
                    "description": movie["description"],
                    "age_rating" : movie["rating"],
                    "duration_sec" : movie["duration_sec"],
                    "persona": persona,
                    # no rating (NaN) when the draw exceeds 0.7, otherwise a score from 0 to 5
                    "rating": np.nan if random.random() > 0.7 else random.randint(0, 5),
                    "watched_percentage": randomWatchTimePercentage()
                }
                userData.append(userInteraction)

    return pd.DataFrame(userData)


# generate the synthetic user data (default number of users); nothing is displayed here
userData = generateUsersData() 

# save user data to a new file
userData.to_pickle('data/ABC/userData.pkl')


In [58]:
# reload the combined ABC dataframe from disk, ordered alphabetically by title
# with a fresh 0..n-1 index
combined_data = (
    pd.read_pickle('data/ABC/combined.pkl')
    .sort_values(by='title', ignore_index=True)
)

# preview the first ten rows
combined_data.head(10)

Unnamed: 0,genre,title,series,episode_name,description,description2,tags,image,more,tags2,publication_date,rating,duration_sec,deduplicated_description,deduplicated_tags
0,Drama,100 Bloody Acres,No data found,100 Bloody Acres,Organic fertilizer producers and brothers are ...,Organic fertilizer producers and brothers are ...,"[abc tv, abc tv plus, comedy, drama, movies]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW297...,"Director Colin Caires, Cameron Cairnes","[abc1, abc2, australia, comedy, drama, film, h...",2021-06-11 07:00:00,MA,5207,brothers comes When problem. supply the radica...,australia ns:relieve-boredom drama ns:be-enter...
1,Drama,13 Assassins,No data found,13 Assassins,When the sadistic excesses of Lord Matsudaira ...,When the sadistic excesses of Lord Matsudaira ...,"[abc tv, abc tv plus, drama, movies]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW284...,Director Takashi Miike,"[abc1, abc2, action, drama, film, period-drama...",2021-06-01 07:00:00,MA,7157,When the Matsudaira action samurai deadly grou...,drama action ns:be-entertained abc tv samurai ...
2,Education,199 Little Heroes,199 Little Heroes,Series 1 Switzerland,The journey to school is a very special type o...,"Enjo lives in Quinten, a forest glade in the m...","[abc me, education]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW073...,No more information found,"[abc3, education, environment-day, primary-hum...",2021-06-01 06:20:00,G,311,shared the very child's new experience What li...,environment-day abc3 abc me education primary-...
3,Drama,30 Days Of Night,No data found,30 Days Of Night,After an Alaskan town is plunged into darkness...,After an Alaskan town is plunged into darkness...,"[abc tv, abc tv plus, drama, movies]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW285...,Director David Slade,"[abc1, abc2, horror, thriller, drama, film, fe...",2021-06-01 07:00:00,MA,6471,townsfolk an the maniacal few band help surviv...,drama ns:be-entertained abc tv edge-of-your-se...
4,News,7.30,7.30,Monday 14/3/2022,Leigh Sales presents Australia's leading night...,The devastation caused by the recent floods in...,"[abc tv, abc news]",https://cdn.iview.abc.net.au/thumbs/i/nc/NC220...,No more information found,"[news24, abc1, news, current-affairs, abc-fave...",2022-03-14 20:00:00,No data found,1906,"issues, caused analysis program, the Tingle. M...",current-affairs 730 news24 abc1 abc tv abc-fav...
5,News,7.30 Mark Humphries Satire,7.30 Mark Humphries Satire,Behind the scenes of the India travel ban,Satirist Mark Humphries brings his unique pers...,Satirist Mark Humphries goes inside the Health...,[abc news],https://cdn.iview.abc.net.au/thumbs/i/nn/NN211...,No more information found,"[news24, news, satire]",2021-05-11 14:00:00,No data found,145,inside India. or the Mark would government's a...,satire news news24 abc news
6,News,7.30 Special: The Shane Warne Interview,No data found,7.30 Special: The Shane Warne Interview,Shane Warne has died of a suspected heart atta...,Shane Warne has died of a suspected heart atta...,"[abc news, sport]",https://cdn.iview.abc.net.au/thumbs/i/ns/NS188...,No more information found,"[news, news24, sport, shane-warne, cricket, in...",2022-03-05 02:15:00,No data found,1470,2018 the revisit aged life died Shane Thailand...,news24 australian revealing sport biography in...
7,News,7.30: The Interviews,7.30: The Interviews,Dave Grohl: The Extended Interview,Revisit some of 7.30's most compelling convers...,Musician Dave Grohl sits down with 7.30's Leig...,[abc news],https://cdn.iview.abc.net.au/thumbs/i/ns/NS224...,No more information found,"[news, news24]",2022-03-08 20:00:00,No data found,1030,"stage the politicians, Musician Nirvana Austra...",news24 news abc news
8,Family,72 Cutest Animals,72 Cutest Animals,Series 1 Episode 6 Footpads and Flippers,The animal kingdom continually astounds us wit...,Cute is a notion that can mean many things to ...,"[abc me, family]",https://cdn.iview.abc.net.au/thumbs/i/zw/ZW243...,No more information found,"[abc3, family-viewing-abcme, family-viewing]",2022-03-05 15:03:06,G,1616,How we an the rankings seventy-two creatures o...,family-viewing abc3 abc me family-viewing-abcm...
9,News,9/11 Stories,No data found,9/11 Stories,"20 Years after the 9/11 Attacks, John Barron s...","20 Years after the 9/11 Attacks, John Barron s...","[abc news, abc tv]",https://cdn.iview.abc.net.au/thumbs/i/ns/NS215...,No more information found,"[news, news24, current-affairs, history, histo...",2021-09-11 20:00:11,No data found,3510,"speaks the Barron were fateful Hear love, Atta...",somber current-affairs news24 abc1 new-york hi...


In [62]:
# reload the synthetic user data that was saved to disk
userData = pd.read_pickle('data/ABC/userData.pkl')
# print(userData.shape)

# give the different options of the user_id column from the userData dataframe
# print(userData['user_id'].unique())

# dimensions of the show dataframe (dfABC), not of userData
dfABC.shape


(1014, 16)