In [1]:
import time

# Start timing
start_time = time.time()

In [2]:
import math
import os
import sys
import warnings

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import pearsonr
from datetime import datetime
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "_0_Constants_and_Utils"))


from database_utils import form_connection_params, get_dataframe_from_query
from viz_constants import (
    COMPANY_ID_TO_NAME,
    DTYPES_CONVERSATIONS,
    DTYPES_CONVERSATIONS_CATEGORY,
    DTYPES_USERS,
    QUERY_CONVERSATIONS_CATEGORY,
    QUERY_USERS,
)

from viz_helpers import get_full_language_name, get_country_name

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)


In [3]:
start_date = "2000-06-06"
end_date = "2030-02-02"

In [4]:
local = True
connection_params = form_connection_params(local, True)

start_date_sql = datetime.strptime(start_date, '%Y-%m-%d').strftime('%Y-%m-%d')
end_date_sql = datetime.strptime(end_date, '%Y-%m-%d').strftime('%Y-%m-%d')

In [5]:
df_users = get_dataframe_from_query(QUERY_USERS, connection_params, local, DTYPES_USERS, "user_id", parse_dates=["creation_time"])
df_users.size

16414937

In [6]:
query_tweets_vis =f"""
Select
    tweet_id, user_id, lang, creation_time, country_code,
    favorite_count, retweet_count, possibly_sensitive,
    reply_count, quote_count, sentiment_score
from Tweets
where tweet_id in (select tweet_id from Conversations)
AND creation_time BETWEEN '{start_date_sql}' AND '{end_date_sql}';
"""


dtypes_tweets_vis = {
    "tweet_id": "object",
    "user_id": "object",
    "lang": "category",
    "creation_time": "datetime64[ns]",
    "country_code": "category",
    "favorite_count": "int32",
    "retweet_count": "int32",
    "possibly_sensitive": "bool",
    "reply_count": "int32",
    "quote_count": "int32",
    "sentiment_score": "float32"
}
df_tweets = get_dataframe_from_query(query_tweets_vis, connection_params, local, dtypes_tweets_vis, "tweet_id", parse_dates=["creation_time"])
df_tweets["lang"] = df_tweets["lang"].apply(get_full_language_name)
df_tweets["country_code"] = df_tweets["country_code"].apply(get_country_name)
df_tweets.size


10079400

In [7]:
df_tweets_and_users = df_tweets.rename(columns={'creation_time': 'tweet_creation_time'})\
    .merge(df_users.rename(columns={'creation_time': 'user_creation_time'}),
           left_on='user_id', right_index=True, how='inner')
df_tweets_and_users.size

17134980

In [8]:
QUERY_CONVERSATIONS = f"""
SELECT c.*
FROM Conversations c
JOIN Tweets t ON c.tweet_id = t.tweet_id
WHERE t.creation_time BETWEEN '{start_date_sql}' AND '{end_date_sql}'
AND NOT EXISTS (
    SELECT 1
    FROM Tweets t2
    JOIN Conversations c2 ON t2.tweet_id = c2.tweet_id
    WHERE c2.conversation_id = c.conversation_id
    AND t2.creation_time NOT BETWEEN '{start_date_sql}' AND '{end_date_sql}'
)
"""

df_conversations = get_dataframe_from_query(QUERY_CONVERSATIONS, connection_params, local, DTYPES_CONVERSATIONS, index_col=["conversation_id", "tweet_order"])
df_conversations.size

1259531

In [9]:
# ids_string = ','.join(map(str, df_conversations.reset_index()["conversation_id"].tolist()))
# QUERY_CONVERSATIONS_CATEGORY = f"""
# SELECT *
# FROM ConversationCategory
# WHERE conversation_id IN ({ids_string})
# """
df_conversations_category = get_dataframe_from_query(QUERY_CONVERSATIONS_CATEGORY, connection_params, local, DTYPES_CONVERSATIONS_CATEGORY, "conversation_id")
df_conversations_category.size

458724

In [10]:
df_all = pd.merge(df_conversations, df_tweets_and_users, left_on='tweet_id', right_index=True, how='inner')
df_all['airline_name'] = df_all['user_id'].map(COMPANY_ID_TO_NAME)
df_all['user_type'] = np.where(df_all['airline_name'].isna(), 'User', 'Airline')
df_all.size

25190620

In [11]:
df_user_sentiment = pd.DataFrame()

df_conversations_category["conversation_length"] = df_all.groupby("conversation_id").size()
df_conversations_category["conversation_start_time"] = df_all.groupby("conversation_id").first()["tweet_creation_time"]
df_conversations_category["conversation_end_time"] = df_all.groupby("conversation_id").last()["tweet_creation_time"]

user_tweets = df_all[df_all['user_type'] == 'User'].groupby("conversation_id")
df_conversations_category["user_sentiment_evolution"] = user_tweets.first()["sentiment_score"] - user_tweets.last()["sentiment_score"]
df_conversations_category["user_sentiment_min"] = user_tweets['sentiment_score'].min()
df_conversations_category["user_sentiment_max"] = user_tweets['sentiment_score'].max()
df_conversations_category["user_sentiment_mean"] = user_tweets['sentiment_score'].mean()
airline_tweets = df_all[df_all['user_type'] == 'Airline'].groupby("conversation_id")
df_conversations_category["airline_name"] = airline_tweets.first()["airline_name"]
df_conversations_category["airline_sentiment_evolution"] = airline_tweets.first()["sentiment_score"] - airline_tweets.last()["sentiment_score"]
df_conversations_category["airline_sentiment_min"] = airline_tweets['sentiment_score'].min()
df_conversations_category["airline_sentiment_max"] = airline_tweets['sentiment_score'].max()
df_conversations_category["airline_sentiment_mean"] = airline_tweets['sentiment_score'].mean()

In [12]:
df_reset = df_all.join(df_conversations_category.drop("airline_name", axis=1), on="conversation_id")
df_reset.size

40304992

In [13]:
def calculate_first_response_time_fast(df):
    # Check if all required columns are present
    required_columns = ['user_type', 'tweet_creation_time']
    for column in required_columns:
        if column not in df.columns:
            raise KeyError(f"DataFrame is missing required column: {column}")

    # Reset index to access conversation_id and tweet_order
    df_reset = df.reset_index()

    # Sort the dataframe by 'conversation_id' and 'tweet_order' for efficiency
    df_sorted = df_reset.sort_values(by=['conversation_id', 'tweet_order'])

    # Get the first user tweet time for each conversation
    first_user_tweets = df_sorted[df_sorted['user_type'] == 'User'].drop_duplicates('conversation_id', keep='first')
    first_user_tweets = first_user_tweets[['conversation_id', 'tweet_order', 'tweet_creation_time']]
    first_user_tweets.columns = ['conversation_id', 'first_user_tweet_order', 'first_user_tweet_time']

    # Merge first user tweet info back to the original dataframe
    df_merged = pd.merge(df_sorted, first_user_tweets, on='conversation_id', how='left', suffixes=('', '_first_user'))

    # Filter for airline tweets that come after the first user tweet in each conversation
    df_airline = df_merged[(df_merged['user_type'] == 'Airline') & 
                           (df_merged['tweet_order'] > df_merged['first_user_tweet_order'])]

    # Get the first airline tweet after the first user tweet for each conversation
    first_airline_tweets = df_airline.drop_duplicates('conversation_id', keep='first')

    # Calculate response times
    first_airline_tweets['response_time'] = (
        first_airline_tweets['tweet_creation_time'] - first_airline_tweets['first_user_tweet_time']
    ).dt.total_seconds() / 60

    return first_airline_tweets.set_index('conversation_id')[
        ['response_time']
    ]

df_conversations_category["response_time"] = calculate_first_response_time_fast(df_reset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_airline_tweets['response_time'] = (


In [14]:
df_lufthansa_conversations_category = df_conversations_category.query("airline_name == 'Lufthansa'").drop("airline_name", axis=1)
double_reset = df_reset.reset_index()
df_lufthansa_reset = double_reset[double_reset['conversation_id'].isin(df_lufthansa_conversations_category.index)].set_index(["conversation_id", "tweet_order"])


In [15]:
start_date = "1990-06-01"
end_date = "2030-06-01"

# Convert the start and end dates to tz-naive datetime objects
start_date_csv = pd.to_datetime(start_date).tz_localize(None)
end_date_csv = pd.to_datetime(end_date).tz_localize(None)

# Load the no_conversation CSV, parse the dates, and ensure they are timezone-naive
df_no_conversation = pd.read_csv(
    "no_conversation.csv",
    dtype={"user_id": "object"},
    index_col=["tweet_id"],
    parse_dates=["creation_time"]
)

# Ensure the 'creation_time' column is timezone-naive
df_no_conversation['creation_time'] = df_no_conversation['creation_time'].dt.tz_localize(None)

# Filter the DataFrame based on the date range
df_no_conversation = df_no_conversation.query('creation_time >= @start_date_csv and creation_time <= @end_date_csv')

# Rename the 'creation_time' column and merge with df_users DataFrame
df_no_conversation = df_no_conversation.rename(columns={'creation_time': 'tweet_creation_time'}).merge(
    df_users.rename(columns={'creation_time': 'user_creation_time'}),
    left_on='user_id', right_index=True, how='inner'
)

# Apply the get_full_language_name and get_country_name functions
df_no_conversation["lang"] = df_no_conversation["lang"].apply(get_full_language_name)
df_no_conversation["country_code"] = df_no_conversation["country_code"].apply(get_country_name)


In [16]:
user_tweets = df_lufthansa_conversations_category[["category", "user_sentiment_evolution"]].copy()

# Categorize sentiment evolution
user_tweets['sentiment_type'] = user_tweets['user_sentiment_evolution'].apply(
    lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral')
)

# Aggregate the data
sentiment_counts = user_tweets.groupby(['category', 'sentiment_type']).size().unstack(fill_value=0)

# Normalize the counts to percentages
sentiment_percentages = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0)

# Sort the categories by the percentage of negative sentiment
sentiment_percentages = sentiment_percentages.sort_values(
    by=["Negative", "Neutral", "Positive"]
)

# Create the Plotly stacked bar chart
fig = go.Figure()

# Define colors for each sentiment type
colors = {
    'Positive': '#00CC96',
    'Neutral': '#636EFA',
    'Negative': '#FF7F09'
}

# Add a bar trace for each sentiment type
for sentiment in sentiment_percentages.columns:
    fig.add_trace(
        go.Bar(
            y=sentiment_percentages.index,
            x=sentiment_percentages[sentiment],
            name=sentiment,
            text=(sentiment_percentages[sentiment] * 100).round(2).astype(str) + '%',
            textposition='inside',
            marker_color=colors[sentiment],
            orientation='h',
            textfont={"size": 16}
        )
    )

# Update the layout
fig.update_layout(
    barmode='stack',
    title={
        "text": "Normalised Stacked Bar Chart of User Sentiment Evolution per Category",
        "font": {"size": 28}
    },
    xaxis_title={"text": 'Percentage of Conversations', "font": {"size": 18}},
    yaxis={"tickfont": {"size": 18}},
    xaxis={"tickfont": {"size": 18}},
    xaxis_tickformat=".0%",
    width=1150,
    height=700,
    margin=dict(l=20, r=20, t=80, b=20),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5,
        font={"size": 18},
    )
)

In [17]:
tweet_counts = df_no_conversation['category'].value_counts().sort_values(ascending=False)

# Calculate the number of conversations per category
conversation_counts = df_lufthansa_conversations_category['category'].value_counts().reindex(tweet_counts.index, fill_value=0)

# Combine the counts into a single DataFrame
combined_counts = pd.DataFrame({
    'Unanswered Tweets': tweet_counts,
    'Conversations': conversation_counts
})

# Normalize the counts to percentages by each column total
combined_counts_normalized = combined_counts.div(combined_counts.sum(axis=0), axis=1) * 100

# Create the subplots figure with shared x-axis
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
                    subplot_titles=('Percentage of Conversations and Non-Responded Tweets per Category', 
                                    'Sentiment Score Distribution of Non-Responded Tweets per Category'),
                    vertical_spacing=0.05)

# Add the grouped bar chart for number of tweets and conversations per category
fig.add_trace(go.Bar(
    x=combined_counts_normalized.index,
    y=combined_counts_normalized['Unanswered Tweets'],
    text=combined_counts_normalized['Unanswered Tweets'].round(2).astype(str) + '%',
    textposition='inside',
    name='Unanswered Tweets',
    marker_color='#636EFA'
), row=1, col=1)

fig.add_trace(go.Bar(
    x=combined_counts_normalized.index,
    y=combined_counts_normalized['Conversations'],
    text=combined_counts_normalized['Conversations'].round(2).astype(str) + '%',
    textposition='inside',
    name='Conversations',
    marker_color='orange'
), row=1, col=1)

# Add a box trace for each category, sorted by number of tweets in ascending order
for category in tweet_counts.index:
    fig.add_trace(go.Box(
        y=df_no_conversation[df_no_conversation['category'] == category]["sentiment_score"],
        name=category,
        boxmean=True,  # Shows the mean line in the box plot
        showlegend=False  # Remove category from legend
    ), row=2, col=1)

user_tweets = df_lufthansa_conversations_category.query("user_sentiment_evolution != 0")[["category", "user_sentiment_mean"]]

# Calculate the median sentiment score for each category and sort the categories
category_medians = user_tweets.groupby("category")["user_sentiment_mean"].median().sort_values()

# Calculate the median of medians
median_of_medians = category_medians.median()

fig.add_trace(go.Scatter(
    x=category_medians.index,
    y=[median_of_medians] * len(category_medians),
    mode='lines',
    name='Median of Sentiment Evolution for Current Conversations',
    line=dict(color='orange', dash='dash')
), row=2, col=1)

# Update the layout
fig.update_layout(
    barmode='group',
    yaxis=dict(title='Percentage of Tweets or Conversations', tickfont=dict(size=18)),
    yaxis2=dict(title='Sentiment Score', tickfont=dict(size=18)),
    width=1100,
    height=1000,
    showlegend=True, 
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.9,
        orientation="v"
    ),
    title_font_size=32,
    font=dict(size=16),  # Set the font size for subplot titles
    margin=dict(l=20, r=20, t=40, b=20)  # Tight layout with no extra whitespace
)

# Update subplot titles font size
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=24)

# Show the plot
fig.show()


In [18]:
import plotly.graph_objects as go

user_tweets = df_conversations_category.query("conversation_length > 2")[["airline_name", "user_sentiment_evolution"]].copy()

# Categorize sentiment evolution
user_tweets['sentiment_type'] = user_tweets['user_sentiment_evolution'].apply(
    lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral')
)

# Aggregate the data
sentiment_counts = user_tweets.groupby(['airline_name', 'sentiment_type']).size().unstack(fill_value=0)

# Normalize the counts to percentages
sentiment_percentages = sentiment_counts.div(sentiment_counts.sum(axis=1), axis=0)

# Sort the categories by the percentage of negative sentiment
sentiment_percentages = sentiment_percentages.sort_values(
    by=["Positive", "Neutral", "Negative"]
)

# Create the Plotly stacked bar chart
fig = go.Figure()

# Define colors for each sentiment type
colors = {
    'Positive': '#00CC96',
    'Negative': '#FF7F09',
    'Neutral': '#636EFA',
}

# Add a bar trace for each sentiment type
for sentiment in sentiment_percentages.columns:
    fig.add_trace(
        go.Bar(
            x=sentiment_percentages.index,
            y=sentiment_percentages[sentiment],
            name=sentiment,
            text=(sentiment_percentages[sentiment] * 100).round(1).astype(str) + '%',
            textposition='inside',
            marker_color=colors[sentiment],
            textfont={"size": 18}
        )
    )

# Update the layout
fig.update_layout(
    barmode='stack',
    title={
        "text": "Normalised Stacked Bar Chart of User Sentiment Evolution per Airline<br>(with conversation length > 2)",
        "font": {"size": 24},
        "x": 0.05,
        "xanchor": "left",
        "yanchor": "top"
    },
    yaxis_title={"text": 'Percentage of Conversations', "font": {"size": 18}},
    xaxis={"tickfont": {"size": 18}},
    yaxis={"tickfont": {"size": 18}},
    yaxis_tickformat=".0%",
    width=1100,
    height=530,
    legend=dict(
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        orientation="h",
        font=dict(size=18)
    ),
    margin=dict(l=20, r=20, t=80, b=20)  # Adjust the margin as needed
)

In [19]:
unique_no_conversation_users = df_no_conversation.drop_duplicates(subset='user_id')
unique_lufthansa_users = df_lufthansa_reset.query("user_type == 'User'").drop_duplicates(subset='user_id')

# Combine follower counts from both DataFrames
combined_followers = pd.concat([unique_no_conversation_users['followers_count'], unique_lufthansa_users['followers_count']])

# Calculate the percentiles for the combined follower counts
percentiles = np.linspace(0, 1, 11)
followers_percentiles = combined_followers.quantile(percentiles)

# Define bands using the combined percentiles
unique_no_conversation_users['followers_band'] = pd.cut(
    unique_no_conversation_users['followers_count'],
    bins=followers_percentiles,
    include_lowest=True,
    labels=[f'{int(followers_percentiles.iloc[i])}-{int(followers_percentiles.iloc[i+1])}' for i in range(len(followers_percentiles)-1)]
)

unique_lufthansa_users['followers_band'] = pd.cut(
    unique_lufthansa_users['followers_count'],
    bins=followers_percentiles,
    include_lowest=True,
    labels=[f'{int(followers_percentiles.iloc[i])}-{int(followers_percentiles.iloc[i+1])}' for i in range(len(followers_percentiles)-1)]
)

# Count the number of unique users in each band
user_counts_no_conversation = unique_no_conversation_users['followers_band'].value_counts().sort_index()
user_counts_lufthansa = unique_lufthansa_users['followers_band'].value_counts().sort_index()

# Normalize the counts to percentages
total_no_conversation = user_counts_no_conversation.sum()
total_lufthansa = user_counts_lufthansa.sum()

user_counts_no_conversation_normalized = (user_counts_no_conversation / total_no_conversation) * 100
user_counts_lufthansa_normalized = (user_counts_lufthansa / total_lufthansa) * 100

# Combine the normalized counts into a single DataFrame
combined_counts = pd.DataFrame({
    'Ignored Conversations': user_counts_no_conversation_normalized,
    'Conversations with Lufthansa': user_counts_lufthansa_normalized
}).fillna(0)

# Create the side-by-side bar chart
fig = go.Figure()

fig.add_trace(go.Bar(
    x=combined_counts.index,
    y=combined_counts['Ignored Conversations'],
    name='Unanswered tweets',
    marker_color='indianred',
    text=combined_counts['Ignored Conversations'].round(2).astype(str) + '%',
    textposition='inside',
    textfont={"size": 18},
))

fig.add_trace(go.Bar(
    x=combined_counts.index,
    y=combined_counts['Conversations with Lufthansa'],
    name='Conversations with Lufthansa',
    marker_color='lightsalmon',
    text=combined_counts['Conversations with Lufthansa'].round(2).astype(str) + '%',
    textposition='inside',
    textfont={"size": 18},

))

# Update the layout
fig.update_layout(
    barmode='group',
    title={
        "text": 'Unresponded Tweets and Conversation Distribution per User Followers Count',
        "font": {"size": 24}
    },
    xaxis_title={
        "text": 'User Followers Count',
        "font": {"size": 22}
    },
    yaxis_title={
        "text": 'Percentage of Conversations',
        "font": {"size": 22}
    },
    xaxis={
        "tickfont": {"size": 18}
    },
    yaxis={
        "tickfont": {"size": 18}
    },
    width=1100,
    height=550,
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="center",
        x=0.5,
        orientation="h",
        font={"size": 18}
    )
)

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [20]:
def calculate_t_score(r, n):
    return r * math.sqrt((n - 2) / (1 - r**2))

# Select relevant columns and drop rows with missing values
for_testing = df_lufthansa_conversations_category[["conversation_length",
                                         "airline_sentiment_min", "airline_sentiment_max", "airline_sentiment_mean",
                                         "response_time",
                                         "user_sentiment_mean", "user_sentiment_evolution"]].dropna()
# Convert all columns to numeric, coercing errors to NaN
for column in for_testing.columns:
    for_testing[column] = pd.to_numeric(for_testing[column], errors='coerce')

# Drop rows with NaN values after conversion
for_testing = for_testing.dropna()

variables = for_testing.columns.difference(['user_sentiment_mean', 'user_sentiment_evolution'])

# Initialize lists to store results
results = []

# Calculate Pearson correlation, t-score, and p-value for each variable
for var in variables:
    for sentiment in ['user_sentiment_mean', 'user_sentiment_evolution']:
        r, p_value = pearsonr(for_testing[sentiment], for_testing[var])
        n = len(for_testing[sentiment])
        t_score = calculate_t_score(r, n)
        results.append({
            'Sentiment': sentiment,
            'Variable': var,
            'Pearson Correlation': r,
            'P-value': p_value,
            'T-score': t_score
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)


# Create summary table
summary_table = results_df.pivot(index='Variable', columns='Sentiment', values=['Pearson Correlation', 'P-value']).round(5)

summary_table


Unnamed: 0_level_0,Pearson Correlation,Pearson Correlation,P-value,P-value
Sentiment,user_sentiment_evolution,user_sentiment_mean,user_sentiment_evolution,user_sentiment_mean
Variable,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
airline_sentiment_max,-0.01734,0.4331,0.07161,0.0
airline_sentiment_mean,0.03052,0.49681,0.00152,0.0
airline_sentiment_min,0.07358,0.49142,0.0,0.0
conversation_length,-0.18051,-0.07092,0.0,0.0
response_time,0.01125,-0.02706,0.24262,0.00494


# Statistics

In [21]:
conversation_count = df_lufthansa_conversations_category.shape[0]  # Assuming each row in df_reset represents a conversation
tweet_count = df_no_conversation.shape[0]  # Assuming each row in df_no_conversation represents a tweet

# Create a pie chart with these two values
fig = go.Figure(data=[go.Pie(
    labels=['Proper conversations', 'Not responded by Lufthansa'],
    values=[conversation_count, tweet_count],
    textinfo='label+percent',
    insidetextorientation='radial'
)])

fig.update_layout(
    title={
        "text": "Total Number of Conversations and not responded tweets",
        "font": {"size": 24}
    },
    showlegend=False,
)

In [22]:
category_counts = df_lufthansa_conversations_category["category"].value_counts()

# Get the top 10 categories
top_10_categories = category_counts.head(10)

# Sum the counts of all other categories
other_topics_count = category_counts.iloc[10:].sum()

# Create a Series for "Other topics"
other_topics_series = pd.Series([other_topics_count], index=["Other topics"])

# Concatenate the top 10 categories with the "Other topics" series
top_10_categories_with_other = pd.concat([top_10_categories, other_topics_series])

# Create the Plotly pie chart
fig = go.Figure(data=[go.Pie(
    labels=top_10_categories_with_other.index,
    values=top_10_categories_with_other.values,
    textinfo='label+percent',
    insidetextorientation='radial',
    hole=.2  # Makes it a donut chart to better visualize
)])

# Update the layout
fig.update_layout(
    title={
        "text": 'Number of Conversations per Category In Lufthansa',
        "font": {"size": 24}
    },
    width=1000,
    height=900,
    showlegend=False,
)

In [23]:
avg_conversation_length_per_category = df_lufthansa_conversations_category\
    .groupby("conversation_length").size()

truncate_at = 6
# Aggregate counts for conversation lengths 7 and more into a single category
lengths_to_group = avg_conversation_length_per_category.index[avg_conversation_length_per_category.index >= truncate_at]
grouped_data = avg_conversation_length_per_category.copy()
grouped_data.loc[truncate_at] = grouped_data.loc[lengths_to_group].sum()
grouped_data = grouped_data.drop(lengths_to_group[lengths_to_group != truncate_at])
grouped_data.index = grouped_data.index.astype(str)
grouped_data.index = grouped_data.index.str.replace(f'{truncate_at}', f'{truncate_at}+')

# Create the Plotly pie chart
fig = go.Figure(data=[go.Pie(
    labels=grouped_data.index,
    values=grouped_data.values,
    textinfo='label+percent',
    insidetextorientation='radial'
)])

# Update the layout
fig.update_layout(
    title={
        "text": "Distribution of Lufthansa's Conversations per Their Length",
        "font": {"size": 24}
    },
    width=800,
    height=800
)

In [24]:
conversations_per_airline = df_conversations_category.groupby("airline_name").size()

# Sort the conversation counts
sorted_conversations = conversations_per_airline.sort_values()

# Identify Lufthansa and its neighboring airlines
lufthansa_index = sorted_conversations.index.get_loc('Lufthansa')
neighboring_airlines = []

if lufthansa_index > 0:
    neighboring_airlines.append(sorted_conversations.index[lufthansa_index - 1])
if lufthansa_index < len(sorted_conversations) - 1:
    neighboring_airlines.append(sorted_conversations.index[lufthansa_index + 1])

# Assign colors based on whether the airline is Lufthansa or its neighboring airlines
colors=[]
for airline in sorted_conversations.index:
    if airline == 'Lufthansa':
        colors.append('#00CC96')
    elif airline in neighboring_airlines:
        colors.append('#FF7F09')
    else:
        colors.append('#636EFA')

# Create the Plotly bar chart
fig = go.Figure()

fig.add_trace(go.Bar(
    x=sorted_conversations.index,
    y=sorted_conversations.values,
    marker_color=colors,
    text=sorted_conversations.values,
    textposition='outside'
))

# Update the layout
fig.update_layout(
    title={
        "text": "Number of Conversations per Airline",
        "font": {"size": 24}
    },
    xaxis_title={"text": 'Airline', "font": {"size": 18}},
    yaxis_title={"text": 'Number of Conversations', "font": {"size": 18}},
    xaxis_tickangle=-45,
    xaxis={"tickfont": {"size": 14}},
    yaxis={"tickfont": {"size": 14}},
    width=1300,
    height=700
)

In [25]:
# End timing
end_time = time.time()
print(f"Total execution time: {(end_time - start_time)} seconds")

Total execution time: 32.487937688827515 seconds
