In [10]:
%load_ext autoreload
%autoreload 2
import sys
import os
# Put the parent of the current working directory on sys.path before the
# `helpers` import below (presumably that is where the package lives -- confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from helpers import utils
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Make the dark template the default for every Plotly figure created afterwards.
pio.templates.default = 'plotly_dark'
import re

# Parent of the working directory (not normalised, the string still ends in '..');
# used as the base when building the dataset path.
dir_path = os.path.join(os.getcwd(), '..')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Column labels are supplied explicitly through `names=`, in file order.
column_names = ['id','movie_name','year','score','avaliations','duration','genres']
# The file is tab-separated. The separator is spelled as the '\t' escape rather
# than a raw tab character inside the quotes: a raw tab is invisible and is
# silently turned into spaces by many editors/formatters, which would break parsing.
# os.path.join keeps the path independent of the platform's separator.
df = pd.read_csv(os.path.join(dir_path, 'datasets', 'imdb.txt'), sep='\t', names=column_names)

In [None]:
# Preview the first rows to check that the columns were split and labelled as intended.
df.head()

In [75]:
# Derive a numeric duration column: drop the trailing "mins." unit (and the
# optional whitespace character before it) from `duration`, then cast to integer.
df['duration_in_minutes'] = (
    df['duration']
    .str.replace(r'\s?mins\.$', '', regex=True)
    .astype('int64')
)

In [None]:
# Column dtypes and non-null counts, including the derived duration_in_minutes column.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Histograms of the numeric variables on a grid with a fixed number of columns.
# The number of rows follows from how many variables are plotted, so the four
# variables below give a 2 x 2 grid and adding a variable cannot make two
# histograms land on the same cell.
variables = ["year", "score", "avaliations", "duration_in_minutes"]
subplot_titles = ("Year", "Score", "Avaliation", "Duration")
n_cols = 2
n_rows = -(-len(variables) // n_cols)  # ceiling division without importing math
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subplot_titles)

# Add one histogram per variable, filling the grid row by row
for n, var in enumerate(variables):
    grid_row, grid_col = divmod(n, n_cols)
    # Subplot positions are 1-based
    row = grid_row + 1
    col = grid_col + 1

    fig.add_trace(px.histogram(df, x=var, opacity=0.9, color_discrete_sequence=utils.color_palette).data[0], row=row, col=col)
    # Give the trace in this cell its own bin group, so each variable is binned
    # on its own scale instead of sharing bin settings with the other subplots.
    fig.update_traces(bingroup='x'+str(n), row=row, col=col)

# Update layout
fig.update_layout(
    title_text='Data Distribution',
    barmode='overlay',
    showlegend=False,
    width=900,
    height=650,
    margin= utils.tight_margin
)


# Show the figure
fig.show()

In [None]:
# Correlation heatmap restricted to the four numeric columns of the raw frame
utils.correlation_heatmap(
    df,
    columns=["year", "score", "avaliations", "duration_in_minutes"],
    title='My Correlation Heatmap',
    figsize=(900, 500),
)

In [None]:
# Display the frame as the cell output (pandas abbreviates long frames in its repr).
df

In [81]:
def adjust_duration(dur, mean):
    """Return `mean` when `dur` equals 0, otherwise return `dur` unchanged.

    Parameters:
    - dur: A single duration value.
    - mean: Replacement used when `dur` is zero.

    Returns:
    - `mean` if `dur == 0`, else `dur`.
    """
    return mean if dur == 0 else dur

In [82]:
# adjust_duration replaces zero durations with the mean, i.e. zeros are treated
# as "unknown". The mean is therefore computed over the non-zero durations only;
# averaging over the whole column would let the zeros drag the replacement down.
# This also keeps mean_duration unchanged if the cell is run a second time.
known_durations = df.loc[df['duration_in_minutes'] != 0, 'duration_in_minutes']
mean_duration = known_durations.mean()
df['duration_in_minutes'] = df['duration_in_minutes'].apply(lambda x: adjust_duration(x, mean_duration))

In [None]:
# Turn the '|'-delimited `genres` string into a list per row
df['genres_split'] = df['genres'].str.split('|')

# One row per (title, genre) -> one indicator column per genre -> collapse back
# to a single row per id by taking the column-wise maximum within each id.
df_norm = (
    pd.get_dummies(
        df.explode('genres_split'),
        columns=['genres_split'],
        prefix='',
        prefix_sep='',
    )
    .groupby('id')
    .max()
    .reset_index()
)

df_norm.head(2)


In [None]:
# Numeric variables shown along the x axis of the heatmap
x_axis = ['year', 'score', 'avaliations', 'duration_in_minutes']

# Genre indicator columns, kept in pieces so both lists below are built from the
# same names while preserving the element order each list is passed with.
first_genre = ['Action']
middle_genres = ['Animation', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family',
                 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical',
                 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Sport']
last_genres = ['Adult', 'Adventure', 'Thriller', 'War', 'Western']

# Every column handed to the correlation helper: numeric variables, then genres
columns = x_axis + first_genre + middle_genres + last_genres

# Genres shown along the y axis
y_axis = first_genre + last_genres + middle_genres

utils.correlation_heatmap(
    df_norm,
    columns=columns,
    y_axis=y_axis,
    x_axis=x_axis,
    figsize=(1000, 800),
)

In [None]:
# Aggregate per year: mean score, summed `avaliations`, and number of ids
dfPlot = (
    df_norm
    .groupby('year')[['score', 'avaliations', 'id']]
    .agg({'score': 'mean', 'avaliations': 'sum', 'id': 'count'})
    .reset_index()
)

# One bar per year, coloured by and labelled with the mean score (one decimal)
fig = px.bar(
    dfPlot,
    x='year',
    y='score',
    color='score',
    hover_data=['avaliations', 'id'],
    text=dfPlot['score'].round(1),
)
fig.update_traces(textposition='outside')
# Restrict the y axis to the 5-8 band; the returned figure is the cell output
fig.update_yaxes(range=[5, 8])

In [None]:
import plotly.graph_objects as go

def combine_px_figures(fig1, fig2, use_secondary_y=False, title="Combined Chart", figsize=(800, 600)):
    """
    Combines two Plotly Express figures into a single chart with an option for a secondary y-axis.

    The traces of fig1 are added first, then those of fig2. The combined chart takes
    its x-axis and primary y-axis titles from fig1 and, when a secondary axis is
    requested, the secondary y-axis title from fig2's y-axis. The input figures are
    not modified.

    Parameters:
    - fig1 (go.Figure): First Plotly Express figure; plotted against the primary y-axis.
    - fig2 (go.Figure): Second Plotly Express figure.
    - use_secondary_y (bool): Whether to use a secondary y-axis for fig2. Default is False.
    - title (str): Title of the combined chart.
    - figsize (tuple): Width and height of the figure (in pixels).

    Returns:
    - go.Figure: Combined Plotly figure (it is also displayed via .show() before returning).
    """
    # Create a new figure
    combined_fig = go.Figure()

    # Add traces from both figures
    for trace in fig1.data:
        combined_fig.add_trace(trace)
    for trace in fig2.data:
        combined_fig.add_trace(trace)
        if use_secondary_y:
            # Retarget the trace that now belongs to combined_fig (the last one
            # added) instead of calling trace.update(...) on fig2's trace, which
            # would leave the caller's fig2 pointing at a y2 axis it does not have.
            combined_fig.data[-1].update(yaxis="y2")

    # Extract axis labels from the original figures, with generic fallbacks
    # for figures that carry no axis title
    xaxis_title = fig1.layout.xaxis.title.text or "X-axis"
    yaxis_title = fig1.layout.yaxis.title.text or "Y-axis"
    yaxis2_title = fig2.layout.yaxis.title.text or "Y2-axis"

    # Update layout with optional secondary y-axis
    combined_fig.update_layout(
        title=dict(text=title, x=0.5, font=dict(size=18)),
        width=figsize[0],
        height=figsize[1],
        margin=dict(l=10, r=10, t=50, b=10),
        xaxis=dict(title=xaxis_title),
        yaxis=dict(title=yaxis_title),
    )

    if use_secondary_y:
        # Draw the second axis on the right, on top of the primary one, without
        # its own grid lines
        combined_fig.update_layout(
            yaxis2=dict(
                title=yaxis2_title,
                overlaying='y',
                side='right',
                showgrid=False
            )
        )

    combined_fig.show()
    return combined_fig

# Demo of combine_px_figures with two small Plotly Express figures
import plotly.express as px

# Source figures: a line chart and a bar chart over the same x values
fig1 = px.line(
    x=[1, 2, 3],
    y=[4, 5, 6],
    labels={'x': 'Time', 'y': 'Value 1'},
    title="Line Chart",
    color_discrete_sequence=utils.color_palette[3:],
)
fig2 = px.bar(
    x=[1, 2, 3],
    y=[7, 8, 9],
    labels={'x': 'Time', 'y': 'Value 2'},
    title="Bar Chart",
    color_discrete_sequence=utils.color_palette[1:],
)

# The bar figure is passed first (primary axis); the line figure is passed second
# and therefore drawn against the secondary y-axis.
combined_fig = combine_px_figures(fig2, fig1, use_secondary_y=True, title="Line and Bar Chart")
