# Visualizations

In this file, I make all of the visualizations needed for the project. I make modified presence charts, true presence charts, and XX.

In [516]:
import altair as alt
import pandas as pd
from itertools import combinations

In [517]:
# add colors scheme
colors = ['#B2FD52','#3C91E6','#000000','#0C0A3E','#989788',
          '#8A2BE2','#1B4D3E','#9B2C2C','#C4A000','#A0A5A9']

In [518]:
# open complete csv
complete = pd.read_csv('https://scmcqueen.github.io/StarTrekScriptData/complete_data.csv')
complete.columns = ['index', 'character', 'quote', 'scene', 'location', 'view',
       'episode', 'date', 'series', 'file']

complete['character'] = complete['character'].apply(lambda text: " ".join(str(text).split()))
complete['quote']=complete['quote'].apply(lambda text: " ".join(text.split()))

In [519]:
# split the data
ds9 = complete[complete.series == 'Deep Space Nine']
tng= complete[complete.series == 'The Next Generation']
# put into a dict for easy comprehension!
data = {'tng': {
    'data':tng
    },
        'ds9': {
            'data':ds9
        }
        }

In [520]:
# get the counts of lines per character
for show in data.keys():
    df = data[show]['data'].copy()
    # get counts of quotes
    df = df.groupby(['file','episode','character']).count()['quote'].reset_index()
    # get the pct of quotes per episode
    df['pct_q_ep']=df['quote']/df.groupby('episode')['quote'].transform('sum')
    # make the character nicer
    df['character'] = df['character'].apply(lambda x: x[0]+x[1:].lower())
    # get ep id
    unique_eps = df['episode'].unique()
    mapping = {val: i for i, val in enumerate(unique_eps)}
    df['ep_id'] = df['episode'].map(mapping)

    # get max pct
    max_pct = df.pct_q_ep.max()

    # get the top 10 characters
    top_10 = list(df.groupby('character').sum()['quote'].reset_index().sort_values('quote',ascending=False)[:10]['character'])

    # filter the df by top 10
    df_filtered = df[df.character.isin(top_10)]
    # rename columns
    df_filtered.columns=['file', 'Episode', 'character', 'Number of Lines', 'pct_q_ep', 'ep_id']

    # add new info to dict
    data[show]['top10']=top_10
    data[show]['filtered_data']=df_filtered
    data[show]['max']=max_pct

In [529]:
alt.Chart(data['tng']['filtered_data']).transform_calculate(
    x2 = 'datum.pct_q_ep * -1'
).mark_rect(cornerRadius=3
).encode(
    #x =  alt.X('character:N'),
    y= alt.Y('ep_id:N',axis=None),
    x=alt.X('pct_q_ep', type="quantitative",axis=None,scale=alt.Scale(domain=[-max,max])),
    x2=alt.X2('x2'),
    order=alt.Order('ep_id',sort='descending'),
    color = alt.Color('character:N',legend=None,scale=alt.Scale(range=colors)),
    tooltip=['character','Episode:N','Number of Lines']
).properties(width=70,height=900).facet(
    column=alt.Column('character:N',sort=data['tng']['top10'],title=None),
    padding=0,
    spacing = 0,
).configure_view(stroke=None).resolve_axis(y='shared').properties(title='Star Trek: The Next Generation')

In [528]:
alt.Chart(data['ds9']['filtered_data']).transform_calculate(
    x2 = 'datum.pct_q_ep * -1'
).mark_rect(cornerRadius=3
).encode(
    #x =  alt.X('character:N'),
    y= alt.Y('ep_id:N',axis=None),
    x=alt.X('pct_q_ep', type="quantitative",axis=None,scale=alt.Scale(domain=[-max,max])),
    x2=alt.X2('x2'),
    order=alt.Order('ep_id',sort='descending'),
    color = alt.Color('character:N',legend=None,scale=alt.Scale(range=colors)),
    tooltip=['character','Episode:N','Number of Lines']
).properties(width=70,height=900).facet(
    column=alt.Column('character:N',sort=data['ds9']['top10'],title=None),
    padding=0,
    spacing = 0,
).configure_view(stroke=None).resolve_axis(y='shared').properties(title='Star Trek: Deep Space Nine')

## True Presence Charts

Above, I made modified presence charts that represent the entire television series. 

In [523]:
df =data['tng']['data']
# filter data for episode
eaf = df[df.episode=='Encounter at Farpoint']
# get top 10 characters
eaf['character'] = eaf['character'].apply(lambda x: x[0]+x[1:].lower())
# top_10 = list(eaf.groupby('character').sum()['quote'].reset_index().sort_values('quote',ascending=False)[:10]['character'])

# # filter the df by top 10
eaf = eaf[eaf.character.isin(data['tng']['top10'])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eaf['character'] = eaf['character'].apply(lambda x: x[0]+x[1:].lower())


In [524]:
combination_outputs = []
# output is a list [scene,char1,char2]
char_outputs = []
# this is a list of [scene, char]

for sc in eaf['scene']:
    unique_char = eaf[eaf.scene==sc]['character'].unique()
    for char in unique_char:
        if [sc,char] not in char_outputs:
            char_outputs.append([sc,char])
    for comb in combinations(unique_char,2):
        if [sc]+list(comb) not in combination_outputs:
            combination_outputs.append([sc]+list(comb))

In [525]:
def remove_duplicates_maintain_order(li:list)-> list:
    '''Get a list without duplicates while maintaining order'''
    output = list(dict.fromkeys(li))
    output.reverse()
    return output  # Or [*dict.fromkeys(items)] if you prefer

scene_maps = remove_duplicates_maintain_order([x[0] for x in char_outputs])

In [526]:
# make df now
combo_df = pd.DataFrame(combination_outputs,columns=['scene','char1','char2'])
char_df = pd.DataFrame(char_outputs,columns=['scene','character'])

#                   map index to value
top_10=data['tng']['top10'] # make this easier to reference

# char data
char_df['x']=char_df['character'].apply(lambda x: top_10.index(x))
char_df['left']=char_df['x'].apply(lambda x: x-.45)
char_df['right']=char_df['x'].apply(lambda x: x+.45)
char_df['left2']=char_df['x'].apply(lambda x: x-.1)
char_df['right2']=char_df['x'].apply(lambda x: x+.1)
char_df['y1']=char_df['scene'].apply(lambda x: scene_maps.index(x)+.125)
char_df['y2']=char_df['scene'].apply(lambda x: scene_maps.index(x)+.875)

# combination data
combo_df['x1']=combo_df['char1'].apply(lambda x: top_10.index(x))
combo_df['x2']=combo_df['char2'].apply(lambda x: top_10.index(x))
combo_df['y1']=combo_df['scene'].apply(lambda x: scene_maps.index(x)+.375)
combo_df['y2']=combo_df['scene'].apply(lambda x: scene_maps.index(x)+.625)

In [527]:
axis_labels = (
    f"""datum.label == 0 ? '{top_10[0]}'
    : datum.label == 1 ? '{top_10[1]}'
    : datum.label == 2 ? '{top_10[2]}'
    : datum.label == 3 ? '{top_10[3]}'
    : datum.label == 4 ? '{top_10[4]}'
    : datum.label == 5 ? '{top_10[5]}'
    : datum.label == 6 ? '{top_10[6]}'
    : datum.label == 7 ? '{top_10[7]}'
    : datum.label == 8 ? '{top_10[8]}'
    : datum.label == 9 ? '{top_10[9]}'
    : ' '
    """
)

base_line = alt.Chart(char_df).mark_rect(cornerRadius=3).encode(
    x=alt.X('left2:Q',axis=alt.Axis(labelExpr=axis_labels)), # add info on top 10
    x2 = alt.X2('right2:Q'),
    color=alt.Color('character:N',scale=alt.Scale(range=colors)) #,axis=alt.Axis(labelExpr=color_labels))
)

horizontal_line = alt.Chart(combo_df).mark_rect().encode(
    x = alt.X('x1:Q'),
    x2=alt.X2('x2:Q'),
    y=alt.Y('y1:Q',axis=None,scale=alt.Scale(domain=[0,len(scene_maps)+1])),
    y2=alt.Y2('y2:Q'),
    color=alt.Color('char1',legend=alt.Legend(title='Character'),scale=alt.Scale(range=colors))
)

scene_presence = alt.Chart(char_df).mark_rect(cornerRadius=3).encode(
    # y=alt.Y('scene',axis=None),
    y=alt.Y('y1:Q',axis=None,scale=alt.Scale(domain=[0,len(scene_maps)])),
    y2=alt.Y2('y2:Q'),
    x=alt.X('left:Q',axis=alt.Axis(labelExpr=axis_labels,ticks=False,domain=False),title=None), # add info on top 10
    x2 = alt.X2('right:Q'),
    tooltip=['scene:N','character:N'],
    color=alt.Color('character:N',scale=alt.Scale(range=colors)) #,axis=alt.Axis(labelExpr=color_labels))
)

alt.layer(horizontal_line,base_line,scene_presence).properties(title='Encounter at Farpoint',height=800,width=450).configure_axis(
    grid=False
).configure_view(
    stroke=None
)