# Emotions Episodes Votes

In [11]:
import pandas as pd

file_path = './raw_data/emotions_episode_votes.csv'
file_path_new = './data/emotions_episode_votes_modified.csv'

# Read the raw votes export. sep=None with the python engine makes pandas
# sniff the delimiter instead of assuming a comma.
df_new_corrected = pd.read_csv(file_path, sep=None, engine='python')

# Normalise every header to lowercase so the selection below matches
# regardless of the casing used in the raw file.
df_new_corrected.columns = [str.lower(name) for name in df_new_corrected.columns]

# Keep only the columns of interest.
columns_to_keep_corrected = ['episode_id', 'emotion_id', 'user_id', 'created']
df_new_filtered = df_new_corrected.loc[:, columns_to_keep_corrected]

# Preview the trimmed frame (last live expression of the cell, so it renders).
df_new_filtered.head()

# # Save the modified dataframe to a new CSV file
# df_new_filtered.to_csv(file_path_new, index=False)


Unnamed: 0,episode_id,emotion_id,user_id,created
0,9279985,33,22659137,2023-09-04 12:31:12 +0000 UTC
1,8355786,34,22659137,2023-01-28 21:45:44 +0000 UTC
2,8987962,31,22659137,2023-05-10 19:18:18 +0000 UTC
3,9333705,30,22659137,2022-10-08 12:51:28 +0000 UTC
4,9138258,30,22659137,2022-11-17 11:56:07 +0000 UTC


# Episode Comments

In [19]:
import pandas as pd

file_path = './raw_data/episode_comment.csv'
file_path_new = './data/episode_comment_modified.csv'

# Load the raw comments export (comma-separated).
df = pd.read_csv(file_path, delimiter=',')

# Keep only top-level comments (depth < 1). Rows with a greater depth are
# replies to other comments and are not wanted here.
df_filtered = df[df['depth'] < 1]

# Select only specified columns
df_filtered = df_filtered[['episode_id', 'tv_show_name', 'episode_season_number', 'episode_number', 'user_id', 'comment', 'nb_likes', 'created_at', 'updated_at']]

# Save the modified dataframe to a new CSV file
df_filtered.to_csv(file_path_new, index=False)

# Preview the result. This has to be the last expression of the cell: a bare
# `.head()` followed by further statements is evaluated and then discarded,
# so nothing would be rendered.
df_filtered.head()

# Followed TV Show preprocessing

In [6]:
import pandas as pd

file_path = './raw_data/followed_tv_show.csv'
file_path_new = './data/followed_tv_show_modified.csv'

# Parse the CSV file, letting pandas detect the delimiter automatically
# (sep=None requires the python engine).
df_new_corrected = pd.read_csv(file_path, sep=None, engine='python')

# Retain only the specified columns. A KeyError here means the raw file's
# header does not contain one of these names.
columns_to_keep_corrected = ['tv_show_name', 'tv_show_id', 'created_at', 'active', 'archived', 'user_id']
df_new_filtered = df_new_corrected[columns_to_keep_corrected]

# Save the modified dataframe to a new CSV file
df_new_filtered.to_csv(file_path_new, index=False)

# Preview the modified dataframe. Only the last expression of a cell is
# rendered, so the preview is placed at the end; the earlier mid-cell
# `.head()` calls were evaluated and discarded without displaying anything.
df_new_filtered.head()


# Rating Episode preprocessing

In [14]:
import pandas as pd

# Mapping from a rating id (as the string token found in the 'order' column)
# to the note value assigned to it.
order_mapping = {'1': 1, '27': 2, '28': 3, '29': 4, '3': 5}


def convert_to_note(row):
    """Translate a row's RATING_ID into a note using ``order_mapping``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide ``row['order']`` (comma-separated integer ids as a
        string) and ``row['RATING_ID']``.

    Returns
    -------
    int or float
        The mapped note, or NaN when the note cannot be determined:
        ``order`` is missing / not a string, RATING_ID is not listed in
        ``order``, or the matched id has no entry in ``order_mapping``.

    Raises
    ------
    ValueError
        If ``order`` contains a token that is not an integer; malformed data
        is surfaced rather than hidden.
    """
    raw_order = row['order']
    # A missing value read from CSV is a float NaN, which has no .split().
    if not isinstance(raw_order, str):
        return float('nan')

    # Split the 'order' string into integer ids.
    order_values = [int(token) for token in raw_order.split(',')]

    try:
        rating_index = order_values.index(row['RATING_ID'])
    except ValueError:
        # RATING_ID does not appear in the order list.
        return float('nan')

    # Look the id up via the parsed int (not RATING_ID itself) so a float such
    # as 27.0 still maps through the key '27'. Unknown ids yield NaN instead
    # of an uncaught KeyError.
    return order_mapping.get(str(order_values[rating_index]), float('nan'))

file_path = './raw_data/ratings_episode_votes.csv'
# sep=None with the python engine lets pandas detect the delimiter.
df = pd.read_csv(file_path, sep=None, engine='python')

# Compute the 'note' column row by row with convert_to_note.
df['note'] = df.apply(convert_to_note, axis=1)

# Drop the specified columns (reassigning instead of mutating in place), then
# lowercase the remaining column names.
columns_to_drop = ['order', 'RATING_ID', 'VOTE_KEY', 'IS_DELETED', 'DB_UPDATE_TS', 'set']
df = df.drop(columns=columns_to_drop)
df.columns = [str.lower(name) for name in df.columns]

# Save the modified dataframe to a new CSV file
output_file_path = './data/ratings_episode_votes_modified.csv'
df.to_csv(output_file_path, index=False)

# Preview the modified dataframe. Kept as the last expression of the cell so
# it is actually rendered; a mid-cell `.head()` is evaluated and discarded.
df.head()


# Seen Episodes

In [1]:
import pandas as pd

# Load the datasets
rewatched_episode_df = pd.read_csv('./raw_data/rewatched_episode.csv')
seen_episode_df = pd.read_csv('./raw_data/seen_episode.csv')

# Drop updated_at and tweet_id from seen_episode_df
seen_episode_df = seen_episode_df.drop(columns=['updated_at', 'tweet_id'])
# Drop created_at of rewatched_episode_df
rewatched_episode_df = rewatched_episode_df.drop(columns=['created_at'])

# Left-join the rewatch counter ('cpt') onto the seen episodes so that every
# seen episode is kept, whether or not it has a rewatch row.
merge_columns = ['tv_show_name', 'episode_season_number', 'episode_number', 'user_id', 'episode_id']
final_df = pd.merge(
    seen_episode_df,
    rewatched_episode_df[merge_columns + ['cpt']],
    on=merge_columns,
    how='left',
)

# Episodes with no matching rewatch row have NaN in 'cpt' after the left
# join; store those as a rewatched_count of 0.
final_df['rewatched_count'] = final_df['cpt'].fillna(0).astype(int)
final_df = final_df.drop(columns=['cpt'])  # 'cpt' is no longer needed

# Remove the rows where tv_show_name is NaN. With read_csv's default NA
# handling, empty fields in the raw CSV are already parsed as NaN, so blank
# names are removed by this check as well.
final_df = final_df[final_df['tv_show_name'].notna()]

# Save the final dataset to a new CSV file
file_path_new = './data/seen_episode_modified.csv'
final_df.to_csv(file_path_new, index=False)

# Display the first few rows of the final dataset. Kept as the last
# expression of the cell so it is rendered; a mid-cell `.head()` is evaluated
# and discarded.
final_df.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
