In [2]:
import pandas as pd

In [3]:
# Read the four source tables from the local data/ directory, one DataFrame each.
actors, appearances, episodes, tenure = map(
    pd.read_csv,
    [
        'data/actors.csv',
        'data/appearances.csv',
        'data/episodes.csv',
        'data/tenure.csv',
    ],
)

In [4]:
# Keep only the actor rows whose type is 'cast'.
cast_members = actors.loc[actors['type'].eq('cast')]

# Attach the cast-member columns to every tenure row, matching on 'aid'.
# This is a left join: tenure rows without a matching cast member are kept
# and simply get NaN in the columns that come from `actors`.
combined_data = pd.merge(tenure, cast_members, how='left', on='aid')

In [5]:
# Prepare the summary DataFrame: one row per tenure record with the season and
# episode counts under presentation-friendly column names.
# The explicit .copy() makes `summary` an independent frame, so adding columns
# to it later does not raise SettingWithCopyWarning against `combined_data`.
# Renaming by name (rather than overwriting `.columns` positionally) keeps the
# labels correct even if the selection order above is ever changed.
summary = (
    combined_data[['aid', 'n_seasons', 'n_episodes']]
    .copy()
    .rename(columns={
        'n_seasons': 'Number_of_Seasons',
        'n_episodes': 'Number_of_Episodes',
    })
)

In [None]:
# Extract first and last names from the 'aid' string.
# The name is split once, at the first space only: the leading token becomes
# First_Name and everything after it becomes Last_Name. A three-word value
# such as 'A B C' therefore yields Last_Name 'B C' (taking token [1] of a full
# split would silently truncate it to 'B'). Values without a space get NaN as
# Last_Name.
name_parts = summary['aid'].str.split(' ', n=1)

# assign() returns a new frame instead of writing into `summary` in place, so
# no SettingWithCopyWarning is raised and re-running the cell is harmless.
summary = summary.assign(
    First_Name=name_parts.str[0],
    Last_Name=name_parts.str[1],
)

summary.head()

In [7]:
# Keep only the identifier, the two name parts and the season count, in that
# order; every other column of `summary` is discarded.
summary = summary.loc[:, ['aid', 'First_Name', 'Last_Name', 'Number_of_Seasons']]

In [None]:
# Preview the first rows of the trimmed summary table.
summary.head()

In [None]:
# Attach the episode columns (including the air date) to every appearance row.
# Left join on 'epid': each appearance is kept even if no episode matches.
merged_data = pd.merge(appearances, episodes, how='left', on='epid')

merged_data.head()

In [10]:
# Parse the textual air date (e.g. 'April 10, 2021') into real datetimes,
# using an explicit month-name / day / four-digit-year format.
merged_data['aired'] = merged_data['aired'].pipe(pd.to_datetime, format='%B %d, %Y')

In [11]:
# Derive a 'Year' column holding the calendar year of each air date.
merged_data = merged_data.assign(Year=lambda frame: frame['aired'].dt.year)

In [None]:
# Preview the appearance/episode rows after the date parsing and Year column.
merged_data.head()

In [None]:
# Join the tenure columns onto every appearance row (left join on 'aid'), then
# keep only the rows whose capacity is 'cast'.
# Note: this rebinds `combined_data`, replacing the tenure/actors frame that an
# earlier cell stored under the same name.
combined_data = (
    pd.merge(merged_data, tenure, how='left', on='aid')
    .loc[lambda frame: frame['capacity'].eq('cast')]
)

combined_data.head()

In [None]:
# For every actor ('aid'), find the earliest and latest 'Year' among their
# rows. `as_index=False` keeps 'aid' as a regular column in the result.
first_last_years = combined_data.groupby('aid', as_index=False).agg(
    First_Year=pd.NamedAgg(column='Year', aggfunc='min'),
    Last_Year=pd.NamedAgg(column='Year', aggfunc='max'),
)

first_last_years

In [None]:
# Add the number of seasons for each actor.
# `combined_data` has one row per appearance, so the (aid, n_seasons) pairs are
# de-duplicated before joining them onto the per-actor year table.
number_of_seasons = combined_data[['aid', 'n_seasons']].drop_duplicates()

# Drop any 'n_seasons' column left over from a previous run of this cell
# before merging. Without this, re-running the cell would produce
# 'n_seasons_x' / 'n_seasons_y' and the later selection of 'n_seasons' would
# fail. On a first run the drop is a no-op (errors='ignore').
first_last_years = (
    first_last_years
    .drop(columns='n_seasons', errors='ignore')
    .merge(number_of_seasons, on='aid', how='left')
)

first_last_years

In [None]:
# Bring in the columns of `summary` for each actor in the year table.
# Left join on 'aid': actors missing from `summary` are kept with NaN in the
# columns that come from it.
final_summary = pd.merge(first_last_years, summary, how='left', on='aid')

final_summary.head()

In [17]:
# Restrict the final table to the identifier, name parts, first/last year and
# season count, in that order.
final_summary = final_summary.loc[
    :,
    ['aid', 'First_Name', 'Last_Name', 'First_Year', 'Last_Year', 'n_seasons'],
]

In [18]:
# Discard every row that has a missing value in at least one column.
final_summary = final_summary.dropna(axis='index', how='any')

In [None]:
# Inspect the final DataFrame. Ending the cell with the bare expression (not
# print) lets the notebook render the table with its rich display, like the
# other preview cells.
final_summary.head()

In [20]:
# Export the final per-actor table as a CSV file for use in Tableau; the
# DataFrame index is not written.
final_summary.to_csv(path_or_buf='snl_cast_years_summary.csv', index=False)

In [None]:
# Re-check the appearance/episode table before reshaping it for the second export.
merged_data.head()

In [32]:
# Rename 'sid_x' to plain 'sid'.
# NOTE(review): 'sid_x' presumably is the left-hand (appearances) 'sid' that
# pandas suffixed during the appearances/episodes merge — confirm.
# rename() ignores labels that are absent, so re-running this cell after
# 'sid_x' is already gone is a harmless no-op instead of a KeyError.
merged_data = merged_data.rename(columns={'sid_x': 'sid'})


In [34]:
# Narrow the appearance table to the columns needed for the export, in this
# order; all other columns are dropped.
merged_data = merged_data.loc[
    :,
    ['aid', 'tid', 'capacity', 'epid', 'epno', 'sid', 'Year'],
]

In [None]:
# Preview the reduced appearance table that is about to be exported.
merged_data.head()

In [36]:
# Write the reduced appearance table to CSV without the DataFrame index.
merged_data.to_csv(path_or_buf='cast_summary.csv', index=False)