In [None]:
import json
import os

import pandas as pd
import plotly.express as px

# Load the chat export (result.json) from a user-supplied directory into a dict.
# os.path.join inserts the path separator when the directory is typed without a
# trailing slash; an empty answer still resolves to "result.json" in the
# current working directory, as before.
export_dir = input("Input directory for result.json: ")
# Decode explicitly as UTF-8 so reading does not depend on the platform's default encoding.
with open(os.path.join(export_dir, "result.json"), encoding="utf-8") as json_file:
    data = json.load(json_file)

In [None]:
# Pull the list of chat records out of the loaded export.
chats_list = data["chats"]["list"]

# Annotate every chat record with the number of messages it holds.
for chat_entry in chats_list:
    chat_entry.update(message_count=len(chat_entry["messages"]))

In [None]:
# Flatten the chat records into a table, then order it from the chat with the
# most messages to the one with the fewest (index renumbered from 0).
chats_df = pd.json_normalize(chats_list)
chats_df = chats_df.sort_values(by="message_count", ascending=False, ignore_index=True)
display(chats_df)

## GENERAL CHAT ANALYSIS

In [None]:
# Bar chart of message_count for every chat; the figure is also written to a
# standalone HTML file (opened automatically) before being shown inline.
fig = px.bar(chats_df, x="name", y="message_count").update_layout(
    title="Message count per chat (all chats)",
    xaxis_title="chat",
    yaxis_title="message_count",
)
fig.write_html('messages_per_chat_all.html', auto_open=True)
fig.show()

In [None]:
# Bar chart of message_count for personal chats with at least 500 messages.
is_busy_personal_chat = (chats_df["type"] == "personal_chat") & (chats_df["message_count"] >= 500)
fig = px.bar(chats_df[is_busy_personal_chat], x="name", y="message_count").update_layout(
    title="Message count per chat (personal chats)",
    xaxis_title="chat",
    yaxis_title="message_count",
)
fig.write_html('messages_per_chat_personal.html', auto_open=True)
fig.show()

In [None]:
# Bar chart of message_count for every non-personal chat (groups, supergroups,
# ...) with at least 500 messages.
is_busy_group_chat = (chats_df["type"] != "personal_chat") & (chats_df["message_count"] >= 500)
fig = px.bar(chats_df[is_busy_group_chat], x="name", y="message_count").update_layout(
    title="Message count per chat (group chats)",
    xaxis_title="chat",
    yaxis_title="message_count",
)
fig.write_html('messages_per_chat_groups.html', auto_open=True)
fig.show()

## INDIVIDUAL CHAT ANALYSIS

### Initialising and cleaning data

In [None]:
# Look up the row index (in chats_df) of the chat with the given name.
name = input("Input name to get chat index: ")
matching_indices = chats_df[chats_df["name"] == name].index
# Test for "no match" explicitly instead of using a bare `except:`, which would
# also hide unrelated failures (e.g. a missing column or a keyboard interrupt).
if len(matching_indices) > 0:
    print(matching_indices[0])
else:
    print("No such name found")

In [None]:
# Ask which row of chats_df to analyse, then flatten that chat's messages into
# a table with one row per message.
chat_index = int(input("Input chat index: "))
chat_df = pd.json_normalize(chats_df.iloc[chat_index]["messages"])

In [None]:
# Summary of the selected chat's message table: columns, dtypes, non-null counts.
chat_df.info()
# Uncomment to dump the message table to CSV for inspection.
# chat_df.to_csv("chat.csv")

In [None]:
# data cleaning: adding date_only, monthyear, duration_minutes columns

# Parse the timestamps once and derive both calendar columns from the result.
message_timestamps = pd.to_datetime(chat_df["date"])
chat_df["date_only"] = message_timestamps.dt.date
chat_df["monthyear"] = message_timestamps.dt.strftime("%Y-%m")

# json_normalize only creates "duration_seconds" when at least one message
# carries it, so guard the lookup and fall back to an all-NaN column.
if "duration_seconds" in chat_df.columns:
    chat_df["duration_minutes"] = chat_df["duration_seconds"] / 60
else:
    chat_df["duration_minutes"] = float("nan")

In [None]:
# data cleaning: cleanup media types to include photos, files, pins, calls, locations, polls, regular texts, etc.
# Every source column is optional: json_normalize only creates a column when at
# least one message in the chat carries that field, so each lookup is guarded.

if "media_type" not in chat_df.columns:
    chat_df["media_type"] = None

# Photos are labelled unconditionally, overriding any existing media_type.
if "photo" in chat_df.columns:
    chat_df.loc[chat_df["photo"].notna(), "media_type"] = "photo"

# The remaining rules only fill rows that are still unlabelled, in this order.
if "file" in chat_df.columns:
    chat_df.loc[chat_df["media_type"].isna() & chat_df["file"].notna(), "media_type"] = "file"
if "action" in chat_df.columns:
    # Rows with an action: the action name becomes the media type and the
    # actor is recorded as the sender.
    chat_df.loc[chat_df["media_type"].isna() & chat_df["action"].notna(), "media_type"] = chat_df["action"]
    chat_df.loc[chat_df["action"].notna(), "from"] = chat_df["actor"]
if "live_location_period_seconds" in chat_df.columns:
    chat_df.loc[chat_df["media_type"].isna() & chat_df["live_location_period_seconds"].notna(), "media_type"] = "live_location"
if "poll.question" in chat_df.columns:
    chat_df.loc[chat_df["media_type"].isna() & chat_df["poll.question"].notna(), "media_type"] = "poll"

# Whatever is still unlabelled is treated as a regular text message.
chat_df.loc[chat_df["media_type"].isna(), "media_type"] = "text"

In [None]:
# data cleaning: get word counts for text and photo media types
def get_word_count_str(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text (str): A plain string; may be empty or span several lines.

    Returns:
        int: Number of non-empty tokens in ``text``.
    """
    # str.split() without arguments splits on any run of whitespace (spaces,
    # newlines, tabs, ...) and never yields empty tokens, so tab-separated
    # words are no longer counted as a single word.
    return len(text.split())

def get_word_count(message):
    """Count the words in a message's text.

    Args:
        message (str | iterable): Either a plain string, or an iterable whose
            items are strings or mappings that carry their content under the
            "text" key.

    Returns:
        int: Total word count over all parts of the message.
    """
    if isinstance(message, str):
        return get_word_count_str(message)
    # Mixed content: count each part, reading non-string parts through "text".
    return sum(
        get_word_count_str(part if isinstance(part, str) else part["text"])
        for part in message
    )

# Word counts are computed for every row but stored only on text and photo
# messages; all other rows are left without a word_count value.
is_text_or_photo = chat_df["media_type"].isin(["text", "photo"])
chat_df.loc[is_text_or_photo, "word_count"] = chat_df["text"].apply(get_word_count)

In [None]:
# data cleaning: drop irrelevant columns

columns_to_drop = [
    # general message fields
    "id", "type", "date_unixtime", "from_id", "photo", "sticker_emoji",
    "width", "height", "via_bot", "file", "thumbnail", "edited_unixtime",
    # location fields
    "live_location_period_seconds",
    "location_information.latitude", "location_information.longitude",
    # poll fields
    "poll.question", "poll.closed", "poll.total_voters", "poll.answers",
    # action-related fields
    "actor", "actor_id", "action", "discard_reason", "message_id",
]

# errors="ignore" skips listed names that are not columns of this chat's table.
chat_df.drop(columns=columns_to_drop, inplace=True, errors="ignore")
display(chat_df)

### Analysis: Messages

In [None]:
# Messages per day.
# groupby(...).size() is used instead of value_counts(): the column names that
# value_counts().reset_index() produces differ between pandas 1.x and 2.x, so
# renaming an "index" column is not portable. Here the key column is always
# called "date_only" before it is renamed.
date_count_df = (
    chat_df.groupby("date_only")
    .size()
    .reset_index(name="count")
    .rename(columns={"date_only": "date"})
    .sort_values(by="date")
)

fig = px.line(date_count_df, x="date", y="count")
fig.update_layout(title="Messages per day",
                  xaxis_title="date",
                  yaxis_title="messages")
fig.write_html('messages_per_day.html', auto_open=True)
fig.show()

In [None]:
# Messages per month.
# groupby(...).size() is used instead of value_counts(): the column names that
# value_counts().reset_index() produces differ between pandas 1.x and 2.x, so
# renaming an "index" column is not portable. Here the key column is always
# called "monthyear" before it is renamed.
month_count_df = (
    chat_df.groupby("monthyear")
    .size()
    .reset_index(name="count")
    .rename(columns={"monthyear": "month"})
    .sort_values(by="month")
)

fig = px.line(month_count_df, x="month", y="count")
fig.update_layout(title="Messages per month",
                  xaxis_title="date",
                  yaxis_title="messages")
fig.write_html('messages_per_month.html', auto_open=True)
fig.show()

In [None]:
# who texts more? -> number of messages per sender, largest first
messages_per_sender = chat_df["from"].value_counts()
display(messages_per_sender)

In [None]:
# Messages per sender and month, pivoted to one column per sender.
month_count_per_person_df = chat_df.groupby(["monthyear", "from"]).size().reset_index(name="message_count")
pivot_df = month_count_per_person_df.pivot(index="monthyear", columns="from", values="message_count").reset_index().rename_axis(None, axis=1)

# groupby drops rows whose sender is missing, so a null sender never becomes a
# pivot column; drop it from the plotted series too, otherwise px.line is asked
# for a column that does not exist.
senders = chat_df["from"].dropna().unique()

fig = px.line(pivot_df, x="monthyear", y=senders)
fig.update_layout(title="Messages per person by month",
                  xaxis_title="month",
                  yaxis_title="messages",
                  legend=dict(title=""))
fig.write_html('messages_per_person_by_month.html', auto_open=True)
fig.show()

In [None]:
# Number of messages per media type, most common first.
media_type_counts = chat_df["media_type"].value_counts()
display(media_type_counts)

In [None]:
# Number of messages for every (sender, media type) combination.
media_type_counts_per_sender = chat_df.groupby(["from", "media_type"]).size()
display(media_type_counts_per_sender)

In [None]:
# Word statistics per sender, restricted to text and photo messages.
text_message_df = chat_df[chat_df["media_type"].isin(["text", "photo"])]
text_counts = text_message_df["from"].value_counts() # only messages with text content

# Sum only word_count. A frame-wide groupby().sum() also tries to add up the
# non-numeric columns, which newer pandas versions no longer skip silently.
word_count_df = (
    text_message_df.groupby("from")["word_count"]
    .sum()
    .reset_index()
    .sort_values(by="word_count", ascending=False)
)
word_count_df["text_count"] = word_count_df["from"].map(text_counts)
word_count_df["words_per_text"] = word_count_df["word_count"] / word_count_df["text_count"]
display(word_count_df)

### Analysis: Calls

In [None]:
# time spent on calls per day (minutes)
# Sum only duration_minutes. A frame-wide groupby().sum() also tries to add up
# the non-numeric columns, which newer pandas versions no longer skip silently.
chat_call_df = (
    chat_df[chat_df["media_type"] == "phone_call"]
    .groupby("date_only")["duration_minutes"]
    .sum()
    .reset_index()
)

fig = px.line(chat_call_df, x="date_only", y="duration_minutes")
fig.update_layout(title="Call minutes per day",
                  xaxis_title="date",
                  yaxis_title="call_duration (minutes)")
fig.write_html('call_minutes_per_day.html', auto_open=True)
fig.show()

In [None]:
# time spent on calls per month (minutes)
# Sum only duration_minutes. A frame-wide groupby().sum() also tries to add up
# the non-numeric columns, which newer pandas versions no longer skip silently.
chat_call_df = (
    chat_df[chat_df["media_type"] == "phone_call"]
    .groupby("monthyear")["duration_minutes"]
    .sum()
    .reset_index()
)

fig = px.line(chat_call_df, x="monthyear", y="duration_minutes")
fig.update_layout(title="Call minutes per month",
                  xaxis_title="date",
                  yaxis_title="call_duration (minutes)")
fig.write_html('call_minutes_per_month.html', auto_open=True)
fig.show()

### Analysis: Words

In [None]:
# sanitise text objects to strings for word cloud and word analysis
def sanitise_text(message):
    """Flatten a message's text into a single plain string.

    Args:
        message (str | iterable): Either a plain string, or an iterable whose
            items are strings or mappings that carry their content under the
            "text" key.

    Returns:
        str: ``message`` unchanged when it already is a string; otherwise the
        individually stripped parts joined by single spaces, with leading and
        trailing whitespace removed from the result.
    """
    if isinstance(message, str):
        return message
    # Read non-string parts through their "text" key, then join once instead of
    # growing a string inside a loop.
    parts = (part if isinstance(part, str) else part["text"] for part in message)
    return " ".join(part.strip() for part in parts).strip()
    
# Store the flattened, plain-string version of every message's text.
sanitised_text_column = chat_df["text"].apply(sanitise_text)
chat_df["sanitised_text"] = sanitised_text_column

In [None]:
# word cloud and word analysis prep
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt
import string

def generate_wordcloud(text, title):
    """Render a word cloud for ``text``, save it as a PNG and show it.

    Args:
        text (str): The words to visualise, as one string.
        title: Label used in the printed caption and in the output file name
            ("wordcloud_<title>.png"). Converted with str() so a non-string
            label does not break the string concatenation.

    Returns:
        None. If WordCloud raises ValueError for ``text``, a notice is printed
        and nothing is drawn or saved.
    """
    title = str(title)
    stopwords = set(STOPWORDS)
    try:
        wordcloud = WordCloud(stopwords=stopwords, background_color="white", width=1200, height=600).generate(text)
    except ValueError:
        # NOTE(review): WordCloud is expected to raise ValueError when no words
        # are left to plot (empty or stopword-only text) - confirm against the
        # wordcloud documentation. Skipping keeps a calling loop running.
        print("word cloud ("+title+"): no words to plot, skipped")
        return
    plt.figure(figsize=(15,10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig("wordcloud_"+title+".png", bbox_inches="tight", pad_inches=0.4)
    print("word cloud ("+title+")")
    plt.show()

def generate_word_barchart(text, title):
    """Show a bar chart of the 100 most frequent words in ``text``.

    Args:
        text (str): Words separated by single spaces; empty tokens produced by
            consecutive spaces are ignored.
        title (str): Label appended to the chart title.
    """
    tokens = [token for token in text.split(" ") if token != ""]
    # value_counts() orders by frequency, so the first 100 entries are the top 100.
    top_words = pd.Series(tokens).value_counts()[:100]
    fig = px.bar(top_words)
    fig.update_layout(
        title="top 100 words ("+title+")",
        xaxis=dict(title="word", dtick=1),
        yaxis_title="count",
        showlegend=False,
    )
    fig.show()

In [None]:
# word cloud and word analysis
# Rows with a missing sender are left out: comparing "from" against a null
# value matches no rows, so such an entry would only add an empty text.
names = chat_df['from'].dropna().unique()
# The punctuation-stripping table is the same for every sender, so build it once.
punctuation_remover = str.maketrans('', '', string.punctuation)
words = dict()
for name in names:
    # remove empty strings, whitespace, convert to lowercase, remove punctuation
    sender_messages = chat_df.loc[chat_df["from"] == name, "sanitised_text"]
    words[name] = " ".join(i for i in sender_messages if i).replace("\n"," ").lower().translate(punctuation_remover)
    if not words[name].strip():
        # Nothing to chart for a sender without any text; skip so that one
        # such sender does not abort the charts for everyone else.
        continue
    generate_wordcloud(words[name], name)
    generate_word_barchart(words[name], name)

all_words = " ".join(words.values())
generate_wordcloud(all_words, "combined")
generate_word_barchart(all_words, "combined")