# Analyzing a Telegram Group Chat

In [None]:
# # Checking what version we have

# import sys
# import platform
# print(sys.version)

In [None]:
import tensorflow as tf

# Report whether TensorFlow can see a GPU: gpu_device_name() returns an
# empty (falsy) string when no GPU device is found.
gpu_device = tf.test.gpu_device_name()
print('GPU is available' if gpu_device else 'GPU is NOT available')

In [None]:
# !pip install germansentiment

import json
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns  # Import seaborn for easy color palettes

pd.options.mode.chained_assignment = None  # default='warn'

## Data collection

On the desktop version of Telegram, select the chat you want to analyze.

Go to the top right corner, open the drop-down "..." menu, and select "Export chat history".

---------------

For this notebook, we will only look at text messages, so selecting any other media types is optional.

Click on format and select "Machine-readable JSON"

---------------

Then determine the path and begin the export.

Once the export is complete, the messages will be stored in a result.json file inside a ChatExport_YYYY_MM_DD folder.

From here onwards we will work with this file.

## Data transformation

In [None]:
# Path to the exported Telegram chat; adjust it here if result.json lives elsewhere.
json_path = "./result.json"

# Read the export and build a DataFrame with one row per entry in "messages".
with open(json_path, 'r', encoding="utf8") as f:
    data = json.load(f)
df = pd.DataFrame(data["messages"])


In [None]:

# The following code loops through this dictionary to extract the text elements.
def get_message_text_from_json(data):
    '''
    Extract the message text from every message of a Telegram export.

    For this applied case, the input should be data["messages"], i.e. the
    part of the json data which contains the message information.

    The "text" entry of a message is either a plain string or a list of parts.
    In a list, each part is either a plain string or a dictionary (e.g. when a
    person is being referenced) that carries its own "text" entry. All parts
    are concatenated in order, no matter whether the list starts with a string
    or with a dictionary.

    Returns a list with one entry per message, in input order. Messages
    without a "text" entry yield an empty string; a "text" value that is
    neither a string nor a list is passed through unchanged.
    '''
    data_message_text = []
    for message in data:
        raw_text = message.get("text", "")
        if isinstance(raw_text, list):
            # Mixed content: flatten plain strings and nested {"text": ...} parts.
            pieces = [
                part if isinstance(part, str) else part.get("text", "")
                for part in raw_text
            ]
            data_message_text.append("".join(pieces))
        else:
            data_message_text.append(raw_text)
    return data_message_text



In [None]:
# Add a column holding each message's text flattened into a single string
df["text_cleaned"] = get_message_text_from_json(data["messages"])

# Print the new and the original column, separated by a blank line,
# to check by eye that the rows are still aligned
print(df["text_cleaned"], "", df["text"], sep="\n")

In [None]:
# shortening the names to only first and last name

# Function to shorten names to the first two words
def shorten_name(name):
    """Return the first two whitespace-separated words of *name*.

    Values that are not strings (e.g. missing names) are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    first_two_words = name.split()[:2]
    return ' '.join(first_two_words)


# Replace every sender name in the 'from' column by its shortened form
shortened_senders = df['from'].apply(shorten_name)
df['from'] = shortened_senders

# Show the distinct sender names that remain after shortening
print(df["from"].unique())

In [None]:
# Inspect which columns the message DataFrame has
df.columns

In [None]:
# Number of messages per sender, most active first
name_counts = df['from'].value_counts()

# Draw the counts as a bar chart and decorate the returned axes
count_ax = name_counts.plot.bar(figsize=(14, 6), color="pink")
count_ax.set_title('Number of Messages for Each Name')
count_ax.set_xlabel('Name')
count_ax.set_ylabel('Message Count')
count_ax.grid(axis='y', linestyle='--', linewidth=1, color='lightgrey')
plt.show()

In [None]:
# Convert 'date' to datetime format
df['date'] = pd.to_datetime(df['date'])

# Order the names by the date of their first message and make 'from' an
# ordered categorical so that sorting and plotting follow that order
sorted_names = df.groupby('from', observed=True)['date'].min().sort_values().index
df['from'] = pd.Categorical(df['from'], categories=sorted_names, ordered=True)
# NOTE: the in-place sort reorders the rows but keeps the original index
# labels, so from here on df.index is no longer in positional order.
df.sort_values(['from', 'date'], inplace=True)

# Create a figure and axis
fig, ax = plt.subplots(figsize=(14, 6))

# Build the colour palette once instead of on every loop iteration
timeline_palette = sns.color_palette("husl", n_colors=len(df['from'].unique()))

# Plot one vertical line per name, spanning its first to its last message
for color, (name, group) in zip(timeline_palette, df.groupby('from', observed=True)):
    # rows are sorted by date within each name, so the ends are first/last
    first_date = group['date'].iloc[0]
    last_date = group['date'].iloc[-1]
    ax.plot([name, name], [first_date, last_date], marker='o', color=color)

# Set axis labels and title
ax.set_xlabel('Name')
ax.set_ylabel('Timeline')
ax.set_title('Timeline of First and Last Occurrences for Each Name')

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)

# Add very light x gridlines in between the names
ax.grid(axis="x", linestyle='--', linewidth=1, color='lightgrey')

# Show the plot
plt.show()

In [None]:
# Count the messages sent in each hour of the day
subset_cols = ["from", "date"]
# work on a copy so that adding the 'hour' column does not write into a slice of df
subset_df = df[subset_cols].copy()
subset_df['hour'] = subset_df['date'].dt.hour

# groupby().size() returns the hours in ascending order, and
# reset_index(name=...) names the count column explicitly, so no
# pandas-version-specific renaming is needed
hour_counts = subset_df.groupby("hour").size().reset_index(name="count")

# Plotting the value count by hour in a barplot which will make it look like a histogram
hour_counts.plot(kind="bar", x="hour", y="count", legend=None, color="pink", figsize=(10, 6))

plt.title("Distribution of Group Chat Activity by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Message Count")
# Add very light horizontal gridlines
plt.grid(axis="y", linestyle='--', linewidth=1, color='lightgrey')
plt.show()



In [None]:
# Who is the most active contributor at which time (only people with more than min_messages messages)
min_messages = 500

# Messages per person, most active first
from_counts = (
    df.groupby("from", observed=True).size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)
# NOTE: the "_200" in the names below is historical; the threshold is min_messages
names_200 = from_counts.loc[from_counts["count"] > min_messages, "from"].to_list()

subset_cols = ["from", "date"]
# copies avoid assigning new columns onto slices of df
subset_df = df[subset_cols].copy()
subset_df['hour'] = subset_df['date'].dt.hour
subset_200_df = subset_df[subset_df["from"].isin(names_200)].copy()
# this is important because otherwise filtered out people still get a spot in the legend
subset_200_df["from"] = subset_200_df["from"].cat.remove_unused_categories()

# Messages per (person, hour); groupby returns the rows sorted by 'from' and 'hour'
only_name_hour_df = (
    subset_200_df.groupby(["from", "hour"], observed=True).size()
    .reset_index(name="count")
)

# Calculate the total counts per person
total_counts = only_name_hour_df.groupby('from', observed=True)['count'].sum()
# Share of each person's messages that falls into each hour (a fraction between 0 and 1)
person_totals = only_name_hour_df.groupby('from', observed=True)['count'].transform('sum')
only_name_hour_df['percentage'] = round(only_name_hour_df['count'] / person_totals, 3)

# Set up the plot
fig, ax = plt.subplots(figsize=(14, 10))

# Define a color palette with one colour per person
color_palette = sns.color_palette("viridis", n_colors=only_name_hour_df['from'].nunique())

# Plot a grouped bar plot for each person
sns.barplot(x='hour', y='percentage', hue='from', data=only_name_hour_df, palette=color_palette, ax=ax)

# Add a smooth line (kernel density estimate) over the grouped bar plot.
# The table holds one row per (person, hour), so the estimate has to be
# weighted by the message count; unweighted it would only describe which
# hours occur at all, not how active the person is in them.
# NOTE(review): the bars sit on a categorical axis; the curves only line up
# with them if every hour 0-23 occurs in the data.
for color, (name, group) in zip(color_palette, only_name_hour_df.groupby('from', observed=True)):
    sns.kdeplot(data=group, x='hour', weights='count', fill=True, alpha=0.02, linewidth=0.5, color=color, ax=ax)

# Set labels and title
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Percentage of Person\'s Messages')
ax.set_title(f'Group Chat Activity per Person for Persons with more than {min_messages} Messages')
# Move the legend to the right
ax.legend(title='Name', bbox_to_anchor=(1.05, 0.5), loc='center left')
plt.show()

In [None]:
# now as a heatmap

# Convert 'date' to datetime format
df['date'] = pd.to_datetime(df['date'])

# Create a new column for the hour of the day
df['hour'] = df['date'].dt.hour

# Filter data to include only people with at least 100 name occurrences
name_counts = df['from'].value_counts()
valid_names = name_counts[name_counts >= 100].index

# Filter the DataFrame to include only rows with valid names
# (copy so that the column assignment below does not write into a slice of df)
df_filtered = df[df['from'].isin(valid_names)].copy()
df_filtered["from"] = df_filtered["from"].cat.remove_unused_categories()

# Count messages per (name, hour) and turn every row into percentages of
# that person's total, so each row sums to 100
messages_per_hour = df_filtered.groupby(['from', 'hour'], observed=True).size().unstack(fill_value=0)
heatmap_data = messages_per_hour.div(messages_per_hour.sum(axis=1), axis=0) * 100

# Create a figure and axis
fig, ax = plt.subplots(figsize=(16, 12))

# Create a heatmap using seaborn with the 'rocket' color palette.
# The row labels come from heatmap_data.index. They must not be replaced by
# valid_names, which is ordered by message count and therefore does not
# match the row order of the heatmap.
sns.heatmap(heatmap_data, cmap='rocket', annot=True, fmt='.1f', linewidths=.5,
            cbar_kws={'label': 'Percentage'}, yticklabels=True, ax=ax)

# Set axis labels and title
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Name')
ax.set_title('Heatmap of Activity Percentage by Hour of the Day and by Person (Minimum 100 Messages)')

# Keep the name labels horizontal
ax.tick_params(axis='y', rotation=0)

# Show the plot
plt.show()

## Sentiment Analysis

In [None]:
from germansentiment import SentimentModel
# NumPy 2.0 removed the `NaN` alias; import `nan` under the old name so the
# name stays available on both old and new NumPy versions
from numpy import nan as NaN
import math

model = SentimentModel()

text = ["so ein Idiot", "ja", "Wünsche euch einen herrlichen tag", "dies ist neutral"]

# With output_probabilities=True the result has two parts:
# predictions[0] is read as the labels, predictions[1] as the per-text probabilities
predictions = model.predict_sentiment(texts=text, output_probabilities=True)

# Each entry of predictions[1] is indexed as [class][1] to get a probability.
# assumes the classes come in the order positive, negative, neutral — TODO confirm
positive_scores = [class_probs[0][1] for class_probs in predictions[1]]
negative_scores = [class_probs[1][1] for class_probs in predictions[1]]
neutral_scores = [class_probs[2][1] for class_probs in predictions[1]]

positive_scores = pd.DataFrame(positive_scores)
negative_scores = pd.DataFrame(negative_scores)
neutral_scores = pd.DataFrame(neutral_scores)
print(positive_scores)
# print(negative_scores)
# print(neutral_scores)

# Check that predicted labels can be appended to an existing list with extend()
test_list = ['negaaaative', 'neutral', 'negative', 'negative', 'negative', 'positive', 'neutraaaaal']
test_list.extend(predictions[0])
test_list_df = pd.DataFrame(test_list)
print(test_list_df)


In [None]:
from IPython.display import Javascript

from germansentiment import SentimentModel
# NumPy 2.0 removed the `NaN` alias; import `nan` under the old name so the
# name stays available on both old and new NumPy versions
from numpy import nan as NaN
import math

model = SentimentModel()

# Increase the data rate limit to 1e9 bytes/sec
# (must stay the last expression of the cell so that the notebook displays it)
Javascript("Jupyter.notebook.iopub_data_rate_limit = 1e9;")

Let us run some example texts to see if the model is working

In [None]:
# Try the model on a few hand-written example sentences
text = [
    "so ein Idiot",
    "ja",
    "Wünsche euch einen herrlichen tag",
    "dies ist neutral",
]

# Ask for the per-class probabilities as well and print the raw result
predictions = model.predict_sentiment(texts=text, output_probabilities=True)
print(predictions)

To predict the sentiment of all texts, we need to split the input into batches. The function below runs the prediction batch by batch and collects the results.

In [None]:
# we take the predict function from the germansentiment model `predict_sentiment` that takes a list of texts and returns predictions
# this function splits the input texts into batches of batch_size (50 by default) and predicts one batch at a time

def predict_sentiment_in_batches(texts, batch_size=50, sentiment_model=None):
    """Predict the sentiment of many texts, one batch at a time.

    Parameters
    ----------
    texts : pandas.Series or any iterable of texts
        The messages to rate. Every entry is converted with ``str()``;
        empty strings are replaced by the neutral placeholder sentence
        "dies ist neutral" before prediction.
    batch_size : int, default 50
        Number of texts handed to the model per call.
    sentiment_model : object, optional
        Object with a ``predict_sentiment(texts=..., output_probabilities=True)``
        method. Defaults to the notebook-level ``model``.

    Returns
    -------
    pandas.DataFrame
        One row per input text, in input order, with the columns
        ``message_sentiment_rating``, ``positive_scores``,
        ``negative_scores`` and ``neutral_scores``. If ``texts`` is a
        Series, the result carries the same index, so it can be joined back
        to the source data by label; otherwise it has a default RangeIndex.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    predictor = model if sentiment_model is None else sentiment_model

    # Keep the caller's index so that results stay attached to the right rows
    result_index = texts.index if isinstance(texts, pd.Series) else None

    # Normalise once up front: everything becomes a string, empty strings
    # become the neutral placeholder
    prepared_texts = [str(entry) or "dies ist neutral" for entry in texts]

    message_sentiment_rating = []
    positive_scores = []
    negative_scores = []
    neutral_scores = []

    num_batches = -(-len(prepared_texts) // batch_size)  # ceiling division

    for batch_number in range(num_batches):
        # progress message every 50 batches
        if batch_number % 50 == 0:
            print(f"running batch {batch_number}/{num_batches}")
        start_idx = batch_number * batch_size
        batch_texts = prepared_texts[start_idx:start_idx + batch_size]
        batch_results = predictor.predict_sentiment(texts=batch_texts, output_probabilities=True)

        # batch_results[0] is a list of labels, so extend() keeps the result flat
        message_sentiment_rating.extend(batch_results[0])
        # batch_results[1] holds the class probabilities of each text
        # assumes the classes come in the order positive, negative, neutral — TODO confirm
        for class_probabilities in batch_results[1]:
            positive_scores.append(class_probabilities[0][1])
            negative_scores.append(class_probabilities[1][1])
            neutral_scores.append(class_probabilities[2][1])

    return pd.DataFrame(
        {
            "message_sentiment_rating": message_sentiment_rating,
            "positive_scores": positive_scores,
            "negative_scores": negative_scores,
            "neutral_scores": neutral_scores,
        },
        index=result_index,
    )

Now let us run the function on all messages.

In [None]:
# Take the cleaned message texts as strings and give empty messages a
# neutral placeholder sentence
test_texte = df["text_cleaned"].astype(str).replace("", "dies ist neutral")

# Process texts in batches
batch_size = 50
text_message_sentiment = predict_sentiment_in_batches(test_texte, batch_size=batch_size)

# Display the first ten results
text_message_sentiment.head(10)

In [None]:
# merging the predictions with the original data
# The predictions are in the same row order as df, but pd.concat(axis=1)
# aligns rows by index label. df was sorted in place in an earlier cell, so
# its labels are no longer in positional order. Give the predictions df's
# index first; otherwise they would be attached to the wrong messages.
aligned_sentiment = text_message_sentiment.copy()
aligned_sentiment.index = df.index  # raises if the lengths differ
df_with_sentiment = pd.concat((aligned_sentiment, df), axis=1)
df_with_sentiment

In [None]:
# Group by name ('from') and calculate the mean of each score per group
columns_for_visual = ["from", "positive_scores", "negative_scores", "neutral_scores"]
subset_df_for_visual = df_with_sentiment[columns_for_visual]
# observed=True: only names that actually occur, as in the other groupby calls on 'from'
grouped_df = subset_df_for_visual.groupby('from', observed=True).mean()

# Plot the grouped bar plot
grouped_df.plot(kind='bar', figsize=(10, 6))
plt.title('Average Sentiment Scores for Each Name')
plt.xlabel('Name')
plt.ylabel('Average Score')
plt.legend(title='Score Type')
plt.grid(axis='y')
plt.show()


In [None]:
# Filter data to include only people with at least 50 name occurrences
name_counts = df_with_sentiment['from'].value_counts()
valid_names = name_counts[name_counts >= 50].index

# Filter the DataFrame to include only rows with valid names
# (copy so that the column assignment below does not write into a slice)
df_filtered = df_with_sentiment[df_with_sentiment['from'].isin(valid_names)].copy()
df_filtered["from"] = df_filtered["from"].cat.remove_unused_categories()

columns_for_visual = ["from", "message_sentiment_rating"]
subset_df_for_visual = df_filtered[columns_for_visual]

# Define custom colors for each sentiment rating
colors = {'positive': 'green', 'neutral': 'grey', 'negative': 'red'}

# Count messages per name and sentiment rating. Reindexing the columns
# guarantees that all three ratings exist (filled with 0), even if one of
# them never occurs, and keeps them in alphabetical order.
grouped_df = (
    subset_df_for_visual
    .groupby(['from', 'message_sentiment_rating'], observed=True)
    .size()
    .unstack(fill_value=0)
    .reindex(columns=sorted(colors), fill_value=0)
)
grouped_df = grouped_df.div(grouped_df.sum(axis=1), axis=0) * 100  # Normalize to get percentages

# Sort the names (rows) from the highest to the lowest share of "negative"
grouped_df = grouped_df.sort_values(by='negative', ascending=False)

# Plot the 100% stacked bar plot with custom colors
ax = grouped_df.plot(kind='bar', stacked=True, figsize=(14, 7), color=[colors[col] for col in grouped_df.columns])
plt.title('Sentiment Ratings of Message Text for Each Name (min 50 messages)')
plt.xlabel('Name')
plt.ylabel('Percentage')
plt.legend(title='Sentiment Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(axis='y', zorder=0, color='black', linestyle='dotted', linewidth=0.3)

# Add percent labels to each part of the bar plot
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    if height <= 0:
        # an empty segment has no area to label
        continue
    x, y = p.get_xy()
    label_text = f'{height:.0f}%'
    ax.text(x + width/2, y + height/2, label_text, ha='center', va='center', fontsize=8, color='white', rotation=90)

plt.show()