# Imports

In [None]:
pip install -r requirements.txt

# Preprocessing

In [None]:
import re
import pandas as pd

def preprocess(data):
    """Parse the raw text of a chat export into a per-message DataFrame.

    Every message is expected to start with a header of the form
    ``d/m/yyyy, hh:mm - `` (for example ``12/05/2023, 10:15 - ``); the text
    up to the next header is the message payload.

    Args:
        data: Full text of the exported chat.

    Returns:
        A DataFrame with one row per message and the columns ``date``,
        ``user``, ``message``, ``only_date``, ``year``, ``month_num``,
        ``month``, ``day``, ``day_name``, ``hour``, ``minute`` and ``period``.
        Payloads without a ``name: `` prefix are attributed to the pseudo
        user ``'group_notification'``.
    """
    # Raw string so the regex escapes are not treated as (invalid) string escapes.
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s'

    # Element 0 of the split is whatever precedes the first header; drop it so
    # payloads and headers line up one-to-one.
    messages = re.split(pattern, data)[1:]
    dates = re.findall(pattern, data)

    df = pd.DataFrame({'user_message': messages, 'message_date': dates})
    # The matched header includes the trailing " - ", hence the format string.
    df['message_date'] = pd.to_datetime(df['message_date'], format='%d/%m/%Y, %H:%M - ')
    df.rename(columns={'message_date': 'date'}, inplace=True)

    users = []
    messages = []
    for message in df['user_message']:
        # maxsplit=1: only the first "name: " is the sender prefix. Without it
        # every later ": " inside the text was also split away, which dropped
        # colons from the message and injected extra spaces.
        entry = re.split(r'([\w\W]+?):\s', message, maxsplit=1)
        if entry[1:]:
            # entry == ['', sender, text]
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # No "name: " prefix at all.
            users.append('group_notification')
            messages.append(entry[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # Hour bucket label "start-end". The labels for 23 and 0 keep their
    # historical spelling ("23-00", "00-1") because they are used as column
    # labels downstream.
    period = []
    for hour in df['hour']:
        if hour == 23:
            period.append("23-00")
        elif hour == 0:
            period.append("00-1")
        else:
            period.append(f"{hour}-{hour + 1}")

    df['period'] = period

    return df

# Helper Functions

In [None]:
from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
import emoji

extract = URLExtract()

def print_line():
    """Print a horizontal rule of dashes to separate sections of the report."""
    separator = "-------------------------------------------------------------------"
    print(separator)

def emoji_helper(selected_user, df):
    """Count how often each emoji character occurs in the messages.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user`` and ``message``.

    Returns:
        A DataFrame sorted by descending frequency with column ``0`` holding
        the emoji character and column ``1`` its count. Both columns are
        present even when no emoji was found, so ``result[1].sum()`` is ``0``
        instead of raising ``KeyError``.
    """
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    # Messages are scanned one character at a time and a character counts as
    # an emoji when it is a key of emoji.EMOJI_DATA.
    emoji_counts = Counter(
        char
        for message in df['message']
        for char in message
        if char in emoji.EMOJI_DATA
    )

    # Explicit columns keep the 0/1 layout for the empty case as well.
    return pd.DataFrame(emoji_counts.most_common(), columns=pd.RangeIndex(2))

def fetch_stats(selected_user,df):
    """Compute headline counts for one user or for the whole chat.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user`` and ``message``.

    Returns:
        A tuple ``(messages, words, media_messages, links)`` where words are
        whitespace-separated tokens, media messages are rows whose text is
        exactly ``'<Media omitted>\\n'`` and links are the URLs reported by
        the module-level ``extract`` object.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    # number of messages
    message_count = scoped.shape[0]

    # total number of whitespace-separated words
    word_count = sum(len(text.split()) for text in scoped['message'])

    # rows that are media placeholders
    media_rows = scoped[scoped['message'] == '<Media omitted>\n']
    media_count = media_rows.shape[0]

    # total number of URLs found across all messages
    link_count = sum(len(extract.find_urls(text)) for text in scoped['message'])

    return message_count, word_count, media_count, link_count

def time_to_reply(selected_user,df):
    """Return the mean delay before ``selected_user`` starts a new turn.

    Messages are ordered by ``date``. For each message the gap to the
    immediately preceding message is computed (group notifications included).
    Notifications are then dropped and, within each run of consecutive
    messages by the same user, only the first message is kept; its gap is
    treated as that user's reply time.

    Args:
        selected_user: A value of the ``user`` column. ``"Overall"`` is not
            supported and yields ``0``.
        df: DataFrame with at least the columns ``date`` and ``user``. It is
            not modified.

    Returns:
        ``0`` for ``"Overall"``, otherwise the mean gap for ``selected_user``
        (missing gaps, such as the very first message, are skipped by the mean).

    Raises:
        KeyError: If ``selected_user`` has no messages in ``df``.
    """
    if selected_user == "Overall":
        return 0

    # assign() works on a copy, so the caller's 'date' column is left alone.
    # A stable sort keeps the original order of messages that share a
    # timestamp; otherwise who "replied" to whom could change between runs.
    ordered = (
        df.assign(date=pd.to_datetime(df['date']))
        .sort_values(by='date', kind='mergesort')
        .assign(time_diff=lambda frame: frame['date'].diff())
    )

    # Drop notifications only after the gaps have been computed.
    is_chat_message = ordered['user'] != 'group_notification'
    reply_time = ordered.loc[is_chat_message, ['user', 'time_diff']].reset_index(drop=True)

    # Keep only the first message of every run of consecutive messages by the
    # same user.
    starts_turn = reply_time['user'] != reply_time['user'].shift()
    turns = reply_time[starts_turn]

    mean_time_diff = turns.groupby('user')['time_diff'].mean()
    return mean_time_diff[selected_user]

def format_time_diff(mean_time_diff):
    """Format a time delta as ``"MM Minutes : SS Seconds"``.

    The minutes field carries the whole duration (hours and days included),
    so a delta of 2 h 5 min is rendered as ``"125 Minutes : 00 Seconds"``
    rather than silently losing the hours.

    Args:
        mean_time_diff: A ``datetime.timedelta``-like object (anything with a
            ``total_seconds()`` method, e.g. ``pandas.Timedelta``).

    Returns:
        The formatted string; fractions of a second are truncated.
    """
    whole_seconds = int(mean_time_diff.total_seconds())
    minutes, seconds = divmod(whole_seconds, 60)
    return f"{minutes:02d} Minutes : {seconds:02d} Seconds"

def most_busy_users(df):
    """Rank users by number of messages.

    Args:
        df: DataFrame with at least the column ``user``.

    Returns:
        A tuple ``(top, share)``: ``top`` is the ``value_counts()`` Series of
        the five most frequent users, ``share`` is a DataFrame with the
        columns ``name`` and ``percent`` (share of all rows, rounded to two
        decimals), ordered by descending share.
    """
    counts = df['user'].value_counts()
    percent = (counts / df.shape[0] * 100).round(2)

    # Build the frame explicitly: the column names produced by
    # value_counts().reset_index() differ between pandas 1.x and 2.x, so
    # renaming them afterwards mislabels the columns on one of the two.
    share = pd.DataFrame({'name': percent.index, 'percent': percent.to_numpy()})
    return counts.head(), share

def create_wordcloud(selected_user,df):
    """Build a word cloud from the messages of one user or the whole chat.

    Group notifications and media placeholders (``'<Media omitted>\\n'``) are
    excluded, messages are lower-cased and words listed in ``stop_words.txt``
    (read from the current working directory) are removed.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user`` and ``message``.
            It is not modified.

    Returns:
        The object returned by ``WordCloud.generate`` for the cleaned text.
    """
    # Close the file deterministically and keep the stop words as a set of
    # whole words: testing membership against the raw file text was a
    # substring test and also removed words such as "he" (inside "the").
    with open('stop_words.txt', 'r', encoding='utf-8') as f:
        stop_words = set(f.read().split())

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    def remove_stop_words(message):
        return " ".join(
            word for word in message.lower().split() if word not in stop_words
        )

    wc = WordCloud(width=1000,height=1000,min_font_size=10,background_color='white')
    # Work on a separate Series instead of assigning into the filtered slice.
    cleaned = temp['message'].apply(remove_stop_words)
    df_wc = wc.generate(cleaned.str.cat(sep=" "))
    return df_wc

def most_common_words(selected_user,df):

    f = open('stop_words.txt','r')
    stop_words = f.read()

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[temp['message'] != '<Media omitted>\n']

    words = []

    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df

def monthly_timeline(selected_user,df):
    """Count messages per calendar month.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user``, ``year``,
            ``month_num``, ``month`` and ``message``.

    Returns:
        A DataFrame with one row per (year, month_num, month) group, the
        message count in ``message`` and a ``time`` label of the form
        ``"<month>-<year>"``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    per_month = scoped.groupby(['year', 'month_num', 'month'])['message'].count()
    timeline = per_month.reset_index()

    # Label every row as "<month name>-<year>".
    timeline['time'] = [
        f"{month}-{year}"
        for month, year in zip(timeline['month'], timeline['year'])
    ]

    return timeline

def daily_timeline(selected_user,df):
    """Count messages per day.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user``, ``only_date`` and
            ``message``.

    Returns:
        A DataFrame with the columns ``only_date`` and ``message`` (the
        number of messages on that date).
    """
    if selected_user == 'Overall':
        scoped = df
    else:
        scoped = df[df['user'] == selected_user]

    per_day = scoped.groupby('only_date')['message'].count()
    return per_day.reset_index()

def week_activity_map(selected_user,df):
    """Return the number of messages per weekday name.

    ``selected_user`` is a value of the ``user`` column, or ``'Overall'`` to
    use every row of ``df``; the result is ``value_counts()`` of ``day_name``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    weekday_counts = scoped['day_name'].value_counts()
    return weekday_counts

def month_activity_map(selected_user,df):
    """Return the number of messages per month name.

    ``selected_user`` is a value of the ``user`` column, or ``'Overall'`` to
    use every row of ``df``; the result is ``value_counts()`` of ``month``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]
    month_counts = scoped['month'].value_counts()
    return month_counts

def activity_heatmap(selected_user,df):
    """Build a weekday-by-period table of message counts.

    Args:
        selected_user: A value of the ``user`` column, or ``'Overall'`` to
            use every row.
        df: DataFrame with at least the columns ``user``, ``day_name``,
            ``period`` and ``message``.

    Returns:
        A pivot table indexed by ``day_name`` with one column per ``period``;
        combinations without any message hold ``0``.
    """
    scoped = df if selected_user == 'Overall' else df[df['user'] == selected_user]

    counts = scoped.pivot_table(
        index='day_name',
        columns='period',
        values='message',
        aggfunc='count',
    )
    return counts.fillna(0)

# Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence library warnings so they do not clutter the cell output.
warnings.filterwarnings('ignore')

# Paths of the two exported chat files (one-to-one chat and group chat).
chat_file_path = "./chat.txt"
group_chat_file_path = "./group_chat.txt"

# Load both exports as UTF-8 text.
with open(chat_file_path, 'r', encoding='utf-8') as file:
    chat_data = file.read()

with open(group_chat_file_path, 'r', encoding='utf-8') as file:
    group_chat_data = file.read()

# Parse each export into a per-message DataFrame.
df_chat = preprocess(chat_data)
df_group_chat = preprocess(group_chat_data)

# Normalise user names so the same person carries the same label in both
# chats, and mask the placeholder name inside the group message text.
df_chat['user'] = df_chat['user'].replace("<My Crush's Name>", "Crush")
df_group_chat['user'] = (
    df_group_chat['user']
    .replace("Manas Bsnl", "Manas Pratim Biswas")
    .replace("<My Crush's Name>", "Crush")
)
df_group_chat['message'] = df_group_chat['message'].str.replace(
    "<My Crush's Name>", "RP", regex=False
)

# print(df_chat.head(10))
# print(df_group_chat.head(10))

# Stack both chats into the single frame used by every analysis below.
df = pd.concat([df_chat, df_group_chat])
# print(df.head(20))

# selected_user = input("Enter the user : Manas Pratim Biswas, Crush or Overall... ")

# Users to report on; "Overall" stands for the whole combined chat.
selected_user = ["Manas Pratim Biswas", "Crush", "Overall"]

# Overall Statistics 
for current_user in selected_user:
    num_messages, words, num_media_messages, num_links = fetch_stats(current_user,df)
    # Guard against a user without any message.
    avg_word_per_msg = words / num_messages if num_messages else 0.0

    # Compute the emoji table once and derive all three figures from it.
    emoji_counts = emoji_helper(current_user,df)
    if emoji_counts.empty:
        total_emojis, unique_emojis, most_used_emoji = 0, 0, None
    else:
        total_emojis = emoji_counts[1].sum()
        unique_emojis = len(emoji_counts)
        most_used_emoji = emoji_counts[0][0]

    # time_to_reply() has no "Overall" mode, so average the two participants.
    if current_user == "Overall":
        reply_time = (time_to_reply("Manas Pratim Biswas",df) + time_to_reply("Crush",df))/2
    else:
        reply_time = time_to_reply(current_user,df)

    print(f"Statistics for {current_user}: ")
    print(f"Total Messages : {num_messages}")
    print(f"Total Words : {words}")
    print(f"Average words per message : {avg_word_per_msg:.3f}")
    print(f"Mean time taken to reply : {format_time_diff(reply_time)}")
    print(f"Total emojis : {total_emojis}")
    print(f"Total unique emojis : {unique_emojis}")
    print(f"Most used emoji : {most_used_emoji}")
    print(f"Total Media : {num_media_messages}")
    print(f"Total Links : {num_links}")
    print_line()

# Statistical Plots
def _save_and_show(fig, current_user, suffix, dpi=2400):
    """Save ``fig`` as ./plots/<user>_<suffix>.png, display it and release it."""
    import os  # local import so the cell's import list stays untouched

    # savefig does not create missing directories.
    os.makedirs("./plots", exist_ok=True)
    fig.savefig(f"./plots/{current_user}_{suffix}.png", dpi=dpi, bbox_inches='tight')
    plt.show()
    # Seven plot types per user add up quickly; close each figure once it has
    # been saved and shown so they do not accumulate in memory.
    plt.close(fig)


for current_user in selected_user:

    # monthly timeline
    timeline = monthly_timeline(current_user,df)
    fig,ax = plt.subplots()
    ax.plot(timeline['time'], timeline['message'],color='green', label=current_user)
    plt.xticks(rotation=45)
    plt.title(f"Monthly Timeline {current_user}")
    ax.legend()
    _save_and_show(fig, current_user, "monthly")

for current_user in selected_user:

    # daily timeline
    daily_timeline_df = daily_timeline(current_user, df)
    fig, ax = plt.subplots()
    ax.plot(daily_timeline_df['only_date'], daily_timeline_df['message'], color='black', label=current_user)
    plt.xticks(rotation=45)
    plt.title(f"Daily Timeline {current_user}")
    _save_and_show(fig, current_user, "daily")

for current_user in selected_user:

    # activity map: messages per weekday
    busy_day = week_activity_map(current_user,df)
    fig,ax = plt.subplots()
    ax.bar(busy_day.index,busy_day.values,color='purple',label=current_user)
    plt.xticks(rotation=45)
    plt.title(f"Most busy day {current_user}")
    _save_and_show(fig, current_user, "week_activity")

for current_user in selected_user:

    # activity map: messages per month
    busy_month = month_activity_map(current_user, df)
    fig, ax = plt.subplots()
    ax.bar(busy_month.index, busy_month.values,color='orange',label=current_user)
    plt.xticks(rotation=45)
    plt.title(f"Most busy month {current_user}")
    _save_and_show(fig, current_user, "month_activity")

for current_user in selected_user:

    # weekday x hour-period heatmap
    user_heatmap = activity_heatmap(current_user,df)
    fig,ax = plt.subplots()
    # Draw on the axes created above (previously implicit via the current axes).
    ax = sns.heatmap(user_heatmap,label=current_user,ax=ax)
    plt.title(f"Weekly Activity Map {current_user}")
    _save_and_show(fig, current_user, "heatmap")

for current_user in selected_user:

    # WordCloud
    df_wc = create_wordcloud(current_user,df)
    fig,ax = plt.subplots()
    ax.imshow(df_wc)
    plt.title(f"Wordcloud {current_user}")
    _save_and_show(fig, current_user, "wordcloud")

for current_user in selected_user:

    # most common words
    most_common_df = most_common_words(current_user,df)

    fig,ax = plt.subplots()

    ax.barh(most_common_df[0],most_common_df[1])
    plt.xticks(rotation=45)
    plt.title(f"Most common words {current_user}")
    # NOTE(review): 6400 dpi yields an extremely large image; kept as-is,
    # confirm it is intentional.
    _save_and_show(fig, current_user, "frequent_words", dpi=6400)

for current_user in selected_user:
    # emoji analysis: print the 25 most frequent emojis with readable headers
    emoji_df = emoji_helper(current_user,df).rename(
        columns={0: 'Emoji', 1: 'Frequency'}
    )
    print(f"Top 25 Emojis {current_user}")
    print(emoji_df.head(25))
    print_line()
    
    