# Imports

In [None]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "_0_Constants_and_Utils"))


from viz_constants import (QUERY_TWEETS, QUERY_USERS, QUERY_REPLY,
                           COMPANY_NAME_TO_ID, COMPANY_ID_TO_NAME,
                           DTYPES_TWEETS, DTYPES_USERS)
from viz_helpers import get_country_name, get_full_language_name, get_size_of
from database_utils import get_dataframe_from_query, form_connection_params

# Loading

In [None]:
# Data-source switch: set local = False if you want to query the online
# MySQL database.
local = True
# NOTE(review): the meaning of the second positional argument (True) is
# defined in database_utils.form_connection_params — confirm there.
connection_params = form_connection_params(local, True)


In [None]:
# Run QUERY_USERS with the DTYPES_USERS mapping and parse `creation_time`
# as a date. The "user_id" argument is presumably the index column (the
# later merge joins on this frame's index) — confirm in database_utils.
df_users = get_dataframe_from_query(
    QUERY_USERS,
    connection_params,
    local,
    DTYPES_USERS,
    "user_id",
    parse_dates=["creation_time"],
)
df_users

In [None]:
# Run QUERY_TWEETS with the DTYPES_TWEETS mapping and parse `creation_time`
# as a date. The "tweet_id" argument is presumably the index column —
# confirm in database_utils.
df_tweets = get_dataframe_from_query(
    QUERY_TWEETS,
    connection_params,
    local,
    DTYPES_TWEETS,
    "tweet_id",
    parse_dates=["creation_time"],
)
df_tweets

In [None]:
# Attach the author's profile to every tweet. Both frames have a
# `creation_time` column, so each is renamed first to keep them distinct.
# The inner join drops tweets whose user_id is not in df_users' index.
test_data = (
    df_tweets
    .rename(columns={'creation_time': 'tweet_creation_time'})
    .merge(
        df_users.rename(columns={'creation_time': 'user_creation_time'}),
        how='inner',
        left_on='user_id',
        right_index=True,
    )
)
test_data

In [None]:
# Run QUERY_REPLY and index the result by tweet_id. Unlike the users and
# tweets loads, no dtype mapping or date parsing is passed here.
df_reply = get_dataframe_from_query(QUERY_REPLY, connection_params, local, index_col="tweet_id")
df_reply

In [None]:
# Print the column names, dtypes and non-null counts of the merged frame.
test_data.info()

In [None]:
# Total memory of the merged frame in bytes (index included, object columns
# measured deeply), passed to get_size_of — presumably a human-readable
# size formatter; confirm in viz_helpers.
get_size_of(test_data.memory_usage(index=True, deep=True).sum())

In [None]:
# Summary statistics of the merged tweet/user frame.
test_data.describe()

# Visualisations

## Database size

In [None]:
# Bar chart: raw JSON line count against the number of rows in test_data.
total_lines = 6511404  # total number of lines if you combine all raw jsons
tweets_right_now = len(test_data)

labels = ["Number of possible tweets", "Number of stored tweets"]
values = [total_lines, tweets_right_now]

plt.figure(figsize=(10, 7))
bars = plt.bar(
    labels,
    values,
    color=sns.color_palette("viridis", len(labels)),
)

# Write each bar's thousands-separated value just above its top edge.
for rect in bars:
    height = rect.get_height()
    text_x = rect.get_x() + rect.get_width()/2 - 0.1
    plt.text(text_x, height + 50000, f'{height:,}', fontsize=12, weight='bold')

# Titles and tick styling
plt.title('Comparison of tweets provided vs stored', fontsize=16, weight='bold')
plt.ylabel('Number of Tweets', fontsize=14, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold');

In [None]:
# Counts of rejected records per category; the numbers come from a
# separate script.
data = [258, 414685, 190928, 2326, 15]  # was done with another script
labels = ['Not a tweet', 'Duplicate tweet', 'Inhuman language',
          'No tweet id', "Invalid user"]

# Every count needs exactly one category label.
assert len(data) == len(labels), "Data and labels must be the same length."

# Order the (count, label) pairs from the largest count to the smallest,
# then split them back into two parallel tuples.
sorted_data_labels = sorted(zip(data, labels), reverse=True)
data = tuple(count for count, _ in sorted_data_labels)
labels = tuple(category for _, category in sorted_data_labels)

# Draw the bars
plt.figure(figsize=(20, 8))
bars = plt.bar(
    labels,
    data,
    color=sns.color_palette("viridis", len(labels)),
)

# Write each bar's thousands-separated value just above its top edge.
for rect in bars:
    height = rect.get_height()
    text_x = rect.get_x() + rect.get_width()/2 - 0.1
    plt.text(text_x, height + 5000, f'{height:,}', fontsize=12, weight='bold')

# Titles and tick styling
plt.title('Numbers of potential tweets not considered per category',
          fontsize=16, weight='bold')
plt.ylabel('Number of Tweets', fontsize=14, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold');

In [None]:
# Storage footprint of the raw data versus the MySQL database; the y-axis
# is labelled in GB. Note that `total_lines` is reused here for a storage
# size, overwriting the line count set in the tweet-count cell.
total_lines = 35.2
mysql = 2.2

labels = ["Raw data", "MySQL storage"]
values = [total_lines, mysql]

plt.figure(figsize=(10, 8))
bars = plt.bar(
    labels,
    values,
    color=sns.color_palette("viridis", len(labels)),
)

# Write each bar's value just above it; the text starts at the bar's centre.
for rect in bars:
    height = rect.get_height()
    text_x = rect.get_x() + rect.get_width() / 2
    plt.text(text_x, height+0.5, f'{height:,}', fontsize=12, weight='bold')

# Titles and tick styling
plt.title('Comparison of storage required', fontsize=16, weight='bold')
plt.ylabel('Storage, GB', fontsize=14, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold');

In [None]:
# NOTE(review): this cell repeats the preceding storage-comparison cell
# verbatim and redraws the same figure; one of the two can probably go.
total_lines = 35.2
mysql = 2.2

values = [total_lines, mysql]
labels = ["Raw data", "MySQL storage"]

plt.figure(figsize=(10, 8))
bar_colors = sns.color_palette("viridis", len(labels))
bars = plt.bar(labels, values, color=bar_colors)

# Put the numeric value above every bar.
for patch in bars:
    bar_top = patch.get_height()
    plt.text(
        patch.get_x() + patch.get_width() / 2,
        bar_top+0.5,
        f'{bar_top:,}',
        fontsize=12,
        weight='bold',
    )

# Titles and tick styling
plt.title('Comparison of storage required', fontsize=16, weight='bold')
plt.ylabel('Storage, GB', fontsize=14, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold');

## Language related

In [None]:
# Tweets per language: count the non-null user_id values in each `lang`
# group, largest first, and rename the count column for display.
df_lang_distr = (
    test_data
    .groupby('lang', observed=True)[["user_id"]]
    .count()
    .sort_values('user_id', ascending=False)
    .rename(columns={"user_id": "Tweets number"})
)
df_lang_distr

In [None]:
# Keep the five most used languages and fold every other language into a
# single "Other languages" row.
df_top_lang = df_lang_distr.nlargest(5, 'Tweets number')

in_top_lang = df_lang_distr.index.isin(df_top_lang.index)
df_other_lang = df_lang_distr.loc[~in_top_lang]

# Column-wise total of the remaining languages, named so that it becomes
# the row label after the transpose below.
df_lang_agg = df_other_lang.sum().rename('Other languages')

df_lang_agg_final = pd.concat([df_top_lang, df_lang_agg.to_frame().T])
df_lang_agg_final.index.name = 'Language'
# Map every row label through get_full_language_name (the aggregate row's
# label is passed through it as well).
df_lang_agg_final.index = df_lang_agg_final.index.map(get_full_language_name)
df_lang_agg_final

In [None]:
# Pie chart of tweet share per language (top five plus "Other languages").
plt.figure(figsize=(10, 8))
plt.pie(df_lang_agg_final["Tweets number"], labels=df_lang_agg_final.index,
        startangle=140, textprops={'fontsize': 14, 'weight': 'bold'},
        colors=sns.color_palette("viridis", len(df_lang_agg_final)))
plt.title('Distribution of tweets per language', fontsize=16, weight='bold')
# The legend entries are languages; the title previously read "Countries",
# copied over from the country charts.
plt.legend(df_lang_agg_final.index, title="Languages",
           bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12);

## Country of origin

In [None]:
# Tweets per country code: count the non-null user_id values in each
# group, largest first, then map the index through get_country_name.
df_country_dist = (
    test_data
    .groupby('country_code', observed=True)[["user_id"]]
    .count()
    .sort_values('user_id', ascending=False)
    .rename(columns={"user_id": "Tweets number"})
)
df_country_dist.index = df_country_dist.index.map(get_country_name)
df_country_dist

In [None]:
# Split the distribution into the single largest entry and everything
# else, with the rest summed into one "Other countries" row.
df_top_country = df_country_dist.nlargest(1, 'Tweets number')

in_top_country = df_country_dist.index.isin(df_top_country.index)
df_other_country = df_country_dist.loc[~in_top_country]

# Naming the summed Series gives the aggregate row its label after `.T`.
df_country_agg = df_other_country.sum().rename('Other countries')

df_country_agg_final = pd.concat([df_top_country, df_country_agg.to_frame().T])
df_country_agg_final.index.name = 'Country'
df_country_agg_final

In [None]:
# Pie chart: largest entry of the country distribution against the rest.
plt.figure(figsize=(10, 8))
plt.pie(
    df_country_agg_final['Tweets number'],
    labels=df_country_agg_final.index,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 14, 'weight': 'bold'},
)
plt.title("Number of tweets per known countries", fontsize=16, weight='bold')
# Anchor the legend outside the axes, to the right of the pie.
plt.legend(
    df_country_agg_final.index,
    title="Countries",
    loc='upper left',
    bbox_to_anchor=(1.05, 1),
    fontsize=12,
);

In [None]:
# Drop the "Unknown Country" bucket so the split below covers only tweets
# whose country was resolved.
df_country_known = df_country_dist[df_country_dist.index != "Unknown Country"].copy()

# Top five known countries, with the remainder summed into one row.
df_top_country_known = df_country_known.nlargest(5, 'Tweets number')
df_other_country_known = df_country_known.loc[
    ~df_country_known.index.isin(df_top_country_known.index)
]
df_country_known_agg = df_other_country_known.sum()
df_country_known_agg.name = 'Other countries'

# This cell used to compute the same three objects a second time under the
# names below (`top_10_countries` actually held the top 5). They are kept
# as aliases in case another cell still reads them.
top_10_countries = df_top_country_known
other_countries_df = df_other_country_known
other_countries_agg = df_country_known_agg

df_country_known_agg_final = pd.concat([df_top_country_known, df_country_known_agg.to_frame().T])
df_country_known_agg_final.index.name = 'Country'
df_country_known_agg_final

In [None]:
# Pie chart: top five known countries plus the aggregated remainder.
plt.figure(figsize=(10, 8))
plt.pie(
    df_country_known_agg_final["Tweets number"],
    labels=df_country_known_agg_final.index,
    autopct='%1.1f%%',
    startangle=140,
    textprops={'fontsize': 14, 'weight': 'bold'},
)
plt.title("Distribution of tweets per known countries", fontsize=16, weight='bold')
# Anchor the legend outside the axes, to the right of the pie.
plt.legend(
    df_country_known_agg_final.index,
    title="Countries",
    loc='upper left',
    bbox_to_anchor=(1.05, 1),
    fontsize=12,
);

## Tweets from main accounts of the airlines

In [None]:
# NOTE: despite its name, `avia_names` holds the airline account IDs (the
# values of COMPANY_NAME_TO_ID). The name is kept because a later cell
# reads it.
avia_names = set(COMPANY_NAME_TO_ID.values())

# Keep only tweets authored by one of the airline accounts. `isin` does the
# membership test in a single vectorised pass rather than calling a Python
# lambda that scans every ID for each row.
replies_to_avia_companies_df = test_data.loc[test_data['user_id'].isin(avia_names)]

# Count tweets per airline account, most active first.
replies_to_avia_companies_df = (
    replies_to_avia_companies_df.reset_index()
    .groupby("user_id").count()[['tweet_id']]
    .sort_values('tweet_id', ascending=False)
    .reset_index()
)
# Replace each account ID with the airline name, falling back to the raw
# ID when there is no mapping.
replies_to_avia_companies_df["user_id"] = replies_to_avia_companies_df["user_id"]\
    .apply(lambda user_id: COMPANY_ID_TO_NAME.get(user_id, user_id))
replies_to_avia_companies_df = replies_to_avia_companies_df.set_index("user_id")
replies_to_avia_companies_df

In [None]:
# Bar plot of tweet counts per airline account. 'user_id' is the index of
# replies_to_avia_companies_df and is referenced by name for both x and hue.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(25,10))
sns.barplot(
    data=replies_to_avia_companies_df,
    x='user_id',
    y='tweet_id',
    hue='user_id',
    palette=sns.color_palette("viridis", replies_to_avia_companies_df.index.nunique()),
    dodge=False,
    ax=ax,
)
# Titles and tick styling
plt.title('Number of tweets by airline company', fontsize=16, weight='bold')
plt.ylabel('Number of tweets', fontsize=14, weight='bold')
plt.xlabel('', fontsize=14, weight='bold')
plt.xticks(fontsize=12, weight='bold')
plt.yticks(fontsize=12, weight='bold');

## Replies to company posts

In [None]:
# Parse both timestamp columns, then derive the delay between the original
# tweet and the reply to it.
for time_col in ("tweet_creation_time", "original_tweet_creation_time"):
    df_reply[time_col] = pd.to_datetime(df_reply[time_col])

df_reply["response_time"] = (
    df_reply["tweet_creation_time"] - df_reply["original_tweet_creation_time"]
)
df_reply

In [None]:
# Translate the replying and the original author's IDs into airline names.
# IDs missing from COMPANY_ID_TO_NAME become NaN.
for name_col, id_col in (("airline", "user_id"),
                         ("original_airline", "original_user_id")):
    df_reply[name_col] = df_reply[id_col].map(COMPANY_ID_TO_NAME)
df_reply

In [None]:
# Make sure the response-time column is a timedelta.
df_reply['response_time'] = pd.to_timedelta(df_reply['response_time'])

# Mean response time of replies written by an airline, per airline.
replies_by_airline = df_reply[df_reply['airline'].notnull()]
average_response_time_airline = (
    replies_by_airline.groupby('airline')['response_time'].mean()
)

# Mean time other users took to react to an airline's tweet, per airline.
replies_to_airline = df_reply[df_reply['original_airline'].notnull()]
average_response_time_reactions = (
    replies_to_airline.groupby('original_airline')['response_time'].mean()
)

# Align both series on the union of their airline labels so that airlines
# present on only one side still get a row (with NaT on the other side).
all_airlines = average_response_time_airline.index.union(
    average_response_time_reactions.index
)
df_airline_response = pd.DataFrame({
    'Airline Response Time': average_response_time_airline.reindex(all_airlines),
    'User Reaction Time': average_response_time_reactions.reindex(all_airlines),
})
df_airline_response.index.name = "Airline"
df_airline_response

In [None]:
# Plot-ready copy: keep only airlines that have both averages, and convert
# the timedeltas to plain floats in the units named by the column labels.
SECONDS_PER_HOUR = 3600
# Was written as `86.400` (i.e. 86.4), which made the "days" column a
# thousand times too large.
SECONDS_PER_DAY = 86400

df_airline_response_plot = df_airline_response.dropna().copy()
df_airline_response_plot['Airline Response Time, hours'] = (
    df_airline_response_plot['Airline Response Time'].dt.total_seconds() / SECONDS_PER_HOUR
)
df_airline_response_plot['User Reaction Time, days'] = (
    df_airline_response_plot['User Reaction Time'].dt.total_seconds() / SECONDS_PER_DAY
)

In [None]:
# Average airline response time in hours, fastest airline first.
# 'Airline' is the index name of the plotted frame and is referenced by
# name for both x and hue.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airline_response_plot.sort_values('Airline Response Time, hours'),
    x='Airline',
    y='Airline Response Time, hours',
    hue='Airline',
    palette=sns.color_palette("viridis", df_airline_response_plot.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title('Average airline customer service response time',
             fontsize=16, weight='bold')
ax.set_ylabel('Airline Response Time, hours', fontsize=14, weight='bold');

In [None]:
# Average time users took to react to each airline's tweets, in days,
# shortest first. 'Airline' is the index name of the plotted frame.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airline_response_plot.sort_values('User Reaction Time, days'),
    x='Airline',
    y='User Reaction Time, days',
    hue='Airline',
    palette=sns.color_palette("viridis", df_airline_response_plot.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title('Average user reaction time to airline tweet', fontsize=16, weight='bold')
ax.set_ylabel('User Reaction Time, days', fontsize=14, weight='bold');

In [None]:
# Group the replies by the `airline` column and show per-group summary
# statistics.
# NOTE(review): despite the `df_` prefix, df_reply_luft is a GroupBy
# object, not a DataFrame.
df_reply_luft = df_reply.groupby("airline")
# df_reply_luft.reset_index().set_index("original_tweet_id")
df_reply_luft.describe()

## Companies' activity and popularity in social media

In [None]:
# Total engagement per airline account. Only tweets whose author is one of
# the airline IDs in `avia_names` are kept; `isin` does that membership
# test in one vectorised pass rather than a per-row Python lambda.
df_airlines_popularity = (
    test_data[test_data['user_id'].isin(avia_names)]
    .groupby("user_id")
    .agg(
        retweet_count=("retweet_count", "sum"),
        favorite_count=("favorite_count", "sum"),
        reply_count=("reply_count", "sum"),
        quote_count=("quote_count", "sum"),
    )
)
# Replace account IDs with airline names. The direct lookup raises
# KeyError if an ID has no entry in COMPANY_ID_TO_NAME.
df_airlines_popularity.index = df_airlines_popularity.index.map(
    lambda user_id: COMPANY_ID_TO_NAME[user_id]
    )
df_airlines_popularity.index.name = "Airlines"
df_airlines_popularity

In [None]:
# Summed retweet counts per airline; 'Airlines' is the frame's index name.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airlines_popularity,
    x='Airlines',
    y="retweet_count",
    hue='Airlines',
    palette=sns.color_palette("viridis", df_airlines_popularity.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title("Total retweets count of each airlines' tweets", fontsize=16, weight='bold')
ax.set_ylabel("Retweet count");

In [None]:
# Summed favourite counts per airline; 'Airlines' is the frame's index name.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airlines_popularity,
    x='Airlines',
    y="favorite_count",
    hue='Airlines',
    palette=sns.color_palette("viridis", df_airlines_popularity.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title("Total favourite count of each airlines' tweets", fontsize=16, weight='bold')
ax.set_ylabel("Favourite count");

In [None]:
# Summed reply counts per airline; 'Airlines' is the frame's index name.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airlines_popularity,
    x='Airlines',
    y="reply_count",
    hue='Airlines',
    palette=sns.color_palette("viridis", df_airlines_popularity.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title("Total reply count of each airlines' tweets", fontsize=16, weight='bold')
ax.set_ylabel("Reply count");

In [None]:
# Summed quote counts per airline; 'Airlines' is the frame's index name.
fig, ax = plt.subplots(nrows=1, figsize=(18, 8))
sns.barplot(
    df_airlines_popularity,
    x='Airlines',
    y="quote_count",
    hue='Airlines',
    palette=sns.color_palette("viridis", df_airlines_popularity.index.nunique()),
    dodge=False,
    ax=ax,
)
ax.set_title("Total quote count of each airlines' tweets", fontsize=16, weight='bold')
ax.set_ylabel("Quote count");

## Information regarding users

In [None]:
# Collapse the tweet-level frame to one row per user. This rebinds
# `df_users`, replacing the users table loaded at the top of the notebook.
# Output column -> (source column, aggregation); dict order is column order.
user_agg_spec = {
    # Profile attributes
    "user_creation_time": ("user_creation_time", "min"),
    "verified": ("verified", "min"),
    "followers_count": ("followers_count", "min"),
    "friends_count": ("friends_count", "min"),
    "statuses_count": ("statuses_count", "min"),
    "default_profile": ("default_profile", "min"),
    # Uses "max" where the other profile flags use "min".
    "default_profile_image": ("default_profile_image", "max"),
    # Activity window covered by the stored tweets
    "first_tweet": ("tweet_creation_time", "min"),
    "last_tweet": ("tweet_creation_time", "max"),
    # Totals over all of the user's stored tweets
    "possibly_sensitive": ("possibly_sensitive", "sum"),
    "favorite_count": ("favorite_count", "sum"),
    "retweet_count": ("retweet_count", "sum"),
    "reply_count": ("reply_count", "sum"),
    "quote_count": ("quote_count", "sum"),
    # Language of the first tweet in each group
    "lang": ("lang", "first"),
}
df_users = test_data.groupby("user_id").agg(**user_agg_spec)
df_users.head()

In [None]:
# Summary statistics of the per-user aggregate frame.
df_users.describe()

### Custom user "trustworthiness" classification

In [None]:
# Number of users per verification flag (counts non-null
# user_creation_time values in each group).
df_verified = df_users.groupby("verified").agg(verified=("user_creation_time", "count"))
# Readable labels for the boolean index. The direct lookup raises KeyError
# on any value other than True/False.
verified_labels = {True: "Verified", False: "Not Verified"}
df_verified.index = df_verified.index.map(lambda flag: verified_labels[flag])

plt.figure(figsize=(10, 8))
plt.pie(df_verified["verified"], labels=df_verified.index, autopct='%1.1f%%',
        startangle=140, textprops={'fontsize': 14, 'weight': 'bold'})
plt.title("Verified User Ratio", fontsize=16, weight='bold')
# The legend title previously read "Countries", copied from the country
# charts; the entries here are verification states.
plt.legend(df_verified.index, title="Verification status", bbox_to_anchor=(1.05, 1),
           loc='upper left', fontsize=12)
df_verified

In [None]:
# Number of users per default-profile flag (counts non-null
# user_creation_time values in each group).
default_profile = df_users.groupby("default_profile").agg(default_profile=("user_creation_time", "count"))
# Readable labels for the boolean index. The direct lookup raises KeyError
# on any value other than True/False.
profile_labels = {True: "Default Profile", False: "Unique Profile"}
default_profile.index = default_profile.index.map(lambda flag: profile_labels[flag])

plt.figure(figsize=(10, 8))
plt.pie(default_profile["default_profile"], labels=default_profile.index,
        autopct='%1.1f%%', startangle=140,
        textprops={'fontsize': 14, 'weight': 'bold'})
# Title and legend title were copied from other charts ("Verified User
# Ratio" / "Countries"); they now describe what this chart shows.
plt.title("Default Profile Ratio", fontsize=16, weight='bold')
plt.legend(default_profile.index, title="Profile type", bbox_to_anchor=(1.05, 1),
           loc='upper left', fontsize=12)
default_profile

## Tweets information

In [None]:
# Number of tweets per `possibly_sensitive` flag (counts non-null user_id
# values in each group).
df_sensitive = test_data.groupby("possibly_sensitive")[["user_id"]].count()
# Readable labels for the boolean index. The direct lookup raises KeyError
# on any value other than True/False.
sensitive_labels = {True: "Sensitive", False: "Not Sensitive"}
df_sensitive.index = df_sensitive.index.map(lambda flag: sensitive_labels[flag])

plt.figure(figsize=(10, 8))
plt.pie(df_sensitive["user_id"], labels=df_sensitive.index, autopct='%1.1f%%',
        startangle=140, textprops={'fontsize': 14, 'weight': 'bold'})
plt.title("Sensitive tweet ratio", fontsize=16, weight='bold')
# The legend title previously read "Countries", copied from the country
# charts; the entries here are sensitivity flags.
plt.legend(df_sensitive.index, title="Sensitivity", bbox_to_anchor=(1.05, 1),
           loc='upper left', fontsize=12)
df_sensitive