In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import local

# Path of the exported chat transcript, taken from the `local` module.
chat_file_path = local.local_path

# Read the whole transcript into memory; each element keeps its trailing newline.
with open(chat_file_path, encoding='utf-8') as chat_file:
    lines = list(chat_file)

# Mention handle as written in message text -> display name.
phone_to_name = dict([
    ('@543534534534', 'Salman'),
    ('@999999999999', 'Alexander'),
])

# Display name -> nicknames that also count as a mention of that person.
aliases = dict(
    Salman=['Sal', 'Salboy'],
    Alexander=['Alex', 'Sasha'],
)

# One [datetime, sender, message] entry is appended per parsed chat line.
messages = list()

# "<date>, <time> - <sender>: <message>", split into its four captures.
pattern = (
    r'^(\d{1,2}/\d{1,2}/\d{2,4}), '      # date, e.g. 11/19/24
    r'(\d{1,2}:\d{2}\s?[APap][Mm]) - '    # 12-hour time with AM/PM
    r'([^:]+): '                           # sender (anything up to the first colon)
    r'(.*)$'                               # message text
)

# Total mentions per display name, and one {'date', 'name'} record per mention.
mention_counts = dict()
mentions_over_time = list()

# Text fragments that identify system notices rather than messages typed by a person.
system_notice_markers = (
    'Messages and calls are end-to-end encrypted',
    'joined from the community',
    'created group',
    'added',
    'left',
    'now an admin',
)

# Parse every transcript line into [datetime, sender, message] and tally mentions.
for line in lines:
    match = re.match(pattern, line)
    if not match:
        # Not of the form "<date>, <time> - <sender>: <text>"; nothing to record.
        continue
    date_str, time_str, sender, message = match.groups()

    # Check only the sender capture: a system notice that happens to contain a
    # colon ends up there. Testing the whole line would also discard genuine
    # messages whose text merely contains words such as "added" or "left".
    if any(marker in sender for marker in system_notice_markers):
        continue

    # Force exactly one plain space before AM/PM. The pattern allows the
    # separator to be missing or to be any whitespace character, while the
    # strptime formats below need whitespace at that position.
    time_str = re.sub(r'\s*([APap][Mm])$', r' \1', time_str)
    date_time_str = f"{date_str} {time_str}"
    try:
        date_time = datetime.strptime(date_time_str, '%m/%d/%y %I:%M %p')
    except ValueError:
        # Fall back to four-digit years; a second failure propagates.
        date_time = datetime.strptime(date_time_str, '%m/%d/%Y %I:%M %p')

    sender = phone_to_name.get(sender, sender)

    for phone, name in phone_to_name.items():
        if phone in message:
            # The keys of phone_to_name may already carry the leading '@';
            # build the handle with exactly one so the replacement matches.
            handle = '@' + phone.lstrip('@')
            message = message.replace(handle, f'@{name}')
            hits = 1
        elif name in message:
            hits = 1
        else:
            # Aliases are consulted only when neither handle nor full name
            # occurs; each alias found counts as one mention.
            hits = sum(alias in message for alias in aliases.get(name, []))

        if hits:
            mention_counts[name] = mention_counts.get(name, 0) + hits
            mention_day = date_time.date()
            mentions_over_time.extend(
                {'date': mention_day, 'name': name} for _ in range(hits)
            )

    messages.append([date_time, sender, message])

# Tabulate the parsed messages: one row per message.
chat_df = pd.DataFrame(messages, columns=['datetime', 'sender', 'message'])

# Analysis window (both ends inclusive).
start_time = datetime.strptime('11/19/24 3:22 PM', '%m/%d/%y %I:%M %p')
end_time = datetime.strptime('11/24/24 8:01 PM', '%m/%d/%y %I:%M %p')
in_window = (chat_df['datetime'] >= start_time) & (chat_df['datetime'] <= end_time)

# Rows whose "sender" is the group welcome banner are not real messages.
from_real_sender = chat_df['sender'] != 'Welcome to the group'

# Take an explicit copy so the column assignments below (and in later cells)
# write to an independent frame instead of a slice of the unfiltered one.
chat_df = chat_df[in_window & from_real_sender].copy()

# Guarantees a datetime dtype so the .dt accessor below is available.
chat_df['datetime'] = pd.to_datetime(chat_df['datetime'])

# Calendar day of each message, then the number of messages per day.
chat_df['date'] = chat_df['datetime'].dt.date
messages_per_day = chat_df.groupby('date').size().reset_index(name='message_count')

# Convert the day back to datetime so it plots on a proper time axis.
messages_per_day['date'] = pd.to_datetime(messages_per_day['date'])

# Line chart of the daily message volume; skipped when there is nothing to draw.
if messages_per_day.empty:
    print("No messages found to plot for 'Messages Over Time'.")
else:
    days = messages_per_day['date']
    daily_totals = messages_per_day['message_count']

    plt.figure(figsize=(10, 6))
    plt.plot(days, daily_totals, marker='o')
    plt.xlabel('Date')
    plt.ylabel('Number of Messages')
    plt.title('Messages Over Time')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.grid(True)
    plt.show()

# One row per recorded mention, with 'date' and 'name' columns.
mentions_df = pd.DataFrame(mentions_over_time)

if mentions_df.empty:
    print("No mentions found to plot for 'Mentions Over Time by Day for Each Person'.")
else:
    # Count mentions per (day, person) ...
    mentions_per_day = (
        mentions_df
        .groupby(['date', 'name'])
        .size()
        .reset_index(name='mention_count')
    )
    # ... then spread people across columns; days without a mention become 0.
    mentions_per_day_pivot = (
        mentions_per_day
        .pivot(index='date', columns='name', values='mention_count')
        .fillna(0)
    )

    # One line per person.
    plt.figure(figsize=(12, 8))
    for person, daily_mentions in mentions_per_day_pivot.items():
        plt.plot(mentions_per_day_pivot.index, daily_mentions, marker='o', label=person)
    plt.xlabel('Date')
    plt.ylabel('Number of Mentions')
    plt.title('Mentions Over Time by Day for Each Person')
    plt.xticks(rotation=45)
    # Legend is anchored outside the axes, at the top-right corner.
    plt.legend(title='Name', loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.grid(True)
    plt.show()


In [None]:

# Messages per (day, sender), reshaped to one column per sender with 0 for quiet days.
per_day_and_sender = chat_df.groupby(['date', 'sender']).size()
messages_over_time_by_person = per_day_and_sender.unstack(fill_value=0)

if messages_over_time_by_person.empty:
    print("No messages found to plot for 'Messages Over Time by Person'.")
else:
    # One line per sender.
    plt.figure(figsize=(12, 8))
    for sender, daily_messages in messages_over_time_by_person.items():
        plt.plot(messages_over_time_by_person.index, daily_messages, marker='o', label=sender)
    plt.xlabel('Date')
    plt.ylabel('Number of Messages')
    plt.title('Messages Over Time by Person')
    plt.xticks(rotation=45)
    plt.legend(title='Sender')
    plt.tight_layout()
    plt.grid(True)
    plt.show()



In [None]:
import nltk
from collections import Counter
from nltk.corpus import stopwords
import string

# Fetch the stop-word corpus that nltk.corpus.stopwords reads from.
nltk.download('stopwords')

# English stop words, held in a set so membership checks while tokenizing are cheap.
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

# Translation table that deletes ASCII punctuation; built once rather than per message.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_and_tokenize(message):
    """Split a chat message into lower-case, punctuation-free, non-stop-word tokens.

    Args:
        message: The message text.

    Returns:
        list[str]: The remaining words in their original order. Empty when the
        message consists only of stop words, punctuation or whitespace.

    A token is tested against ``stop_words`` twice: first with its inner
    punctuation intact, then again after punctuation has been removed. The first
    test lets contractions written with an apostrophe ("don't") match a stop-word
    entry spelled the same way; stripping punctuation first would turn them into
    "dont" and let them through.
    """
    # Treat the typographic apostrophe (U+2019) like the ASCII one so that it is
    # both matched by the stop-word check and removed with the other punctuation.
    normalized = message.lower().replace('\u2019', "'")

    words = []
    for raw_token in normalized.split():
        # Trim surrounding punctuation only, keeping inner apostrophes for the lookup.
        if raw_token.strip(string.punctuation) in stop_words:
            continue
        word = raw_token.translate(_PUNCTUATION_TABLE)
        # Pure-punctuation tokens collapse to '' and are dropped.
        if word and word not in stop_words:
            words.append(word)
    return words

# Flatten the tokens of every message into a single list.
all_words = [
    word
    for message in chat_df['message']
    for word in clean_and_tokenize(message)
]

# Frequency of each word across the chat, and the 20 most frequent ones.
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(20)

print(most_common_words)

# Unpacking zip(*...) fails on an empty list, so guard like the other plots do.
if most_common_words:
    words, counts = zip(*most_common_words)
    plt.figure(figsize=(12, 8))
    plt.bar(words, counts, color='skyblue')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Top 20 Most Common Words (Excluding Stop Words)')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("No words found to plot for 'Top 20 Most Common Words (Excluding Stop Words)'.")

In [None]:
from wordcloud import WordCloud

# Word cloud sized by word frequency. WordCloud cannot lay out an empty
# frequency table, so only build it when at least one word was counted.
if word_counts:
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # the image has no meaningful axes
    plt.title('Word Cloud of Most Common Words')
    plt.show()
else:
    print("No words found to plot for 'Word Cloud of Most Common Words'.")

In [None]:
from textblob import TextBlob
import matplotlib.pyplot as plt

def get_sentiment(message):
    """Return the polarity score that TextBlob assigns to ``message``."""
    blob = TextBlob(message)
    polarity = blob.sentiment.polarity
    return polarity

# Score every message and keep the result as a new column.
chat_df['sentiment'] = chat_df['message'].apply(get_sentiment)

# Mean score per (day, sender), one column per sender.
# Days on which a sender has no messages are filled with 0.
mean_by_day_and_sender = chat_df.groupby(['date', 'sender'])['sentiment'].mean()
average_sentiment_per_person = mean_by_day_and_sender.unstack(fill_value=0)

if average_sentiment_per_person.empty:
    print("No sentiment data found to plot for 'Average Sentiment Over Time by Person'.")
else:
    # One line per sender.
    plt.figure(figsize=(12, 8))
    for sender, daily_sentiment in average_sentiment_per_person.items():
        plt.plot(average_sentiment_per_person.index, daily_sentiment, marker='o', label=sender)
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment')
    plt.title('Average Sentiment Over Time by Person')
    plt.xticks(rotation=45)
    # Legend is anchored outside the axes, at the top-right corner.
    plt.legend(title='Sender', loc='upper left', bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.grid(True)
    plt.show()


# The 20 rows with the lowest sentiment score, lowest first.
most_negative_messages = chat_df.nsmallest(20, 'sentiment')

print("Most Negative Sentiment Messages:")
for _, row in most_negative_messages.iterrows():
    header = f"Sender: {row['sender']}, Date: {row['datetime']}, Sentiment: {row['sentiment']}"
    body = f"Message: {row['message']}"
    # Header line, message line, then one blank separator line.
    print(header, body, '', sep='\n')



In [None]:
# Total number of messages per sender as a two-column frame.
messages_per_sender = chat_df['sender'].value_counts().reset_index()
messages_per_sender.columns = ['sender', 'message_count']

# Most active sender first.
messages_per_sender_sorted = messages_per_sender.sort_values(by='message_count', ascending=False)

plt.figure(figsize=(12, 8))
plt.bar(
    messages_per_sender_sorted['sender'],
    messages_per_sender_sorted['message_count'],
    color='skyblue',
)
plt.xlabel('Sender')
plt.ylabel('Number of Messages')
plt.title('Distribution of Messages Sent by Each Person')
plt.xticks(rotation=90)  # vertical labels
plt.tight_layout()
plt.show()

In [None]:
def is_all_caps(message):
    """Tell whether ``message`` is written entirely in upper case.

    Follows ``str.isupper``: True only when the text has at least one cased
    character and every cased character is upper case, so digits-only or
    empty text yields False.
    """
    shouted = message.isupper()
    return shouted

# Flag the messages written entirely in capitals, then count them per sender.
shouted_mask = chat_df['message'].apply(is_all_caps)
all_caps_counts = (
    chat_df[shouted_mask]
    .groupby('sender')
    .size()
    .reset_index(name='all_caps_count')
)

# Fewest all-caps messages first.
all_caps_counts_sorted = all_caps_counts.sort_values(by='all_caps_count', ascending=True)

print(all_caps_counts_sorted)

plt.figure(figsize=(12, 8))
plt.bar(
    all_caps_counts_sorted['sender'],
    all_caps_counts_sorted['all_caps_count'],
    color='skyblue',
)
plt.xlabel('Sender')
plt.ylabel('Number of All Caps Messages')
plt.title('Number of All Caps Messages per Person (Sorted)')
plt.xticks(rotation=90)  # vertical labels
plt.tight_layout()
plt.show()

# List every all-caps message, grouped under the person who sent it.
all_caps_messages = chat_df.loc[chat_df['message'].apply(is_all_caps)]
for sender, shouted_rows in all_caps_messages.groupby('sender'):
    bullet_lines = [f" - {text}" for text in shouted_rows['message']]
    # Heading, one bullet per message, then a blank separator line.
    print(f"All caps messages from {sender}:", *bullet_lines, '', sep='\n')

In [None]:
import emoji
from collections import Counter
import matplotlib.pyplot as plt

def extract_emojis(message):
    """Return every emoji found in ``message``, in order of appearance.

    Args:
        message: The message text.

    Returns:
        list[str]: One string per emoji; an emoji used several times appears
        several times. Empty when the message contains no emoji.

    Uses ``emoji.emoji_list`` instead of a per-character lookup so that an
    emoji made of several code points (skin-tone variants, ZWJ sequences,
    flags) comes back as one item rather than as separate fragments.
    """
    return [found['emoji'] for found in emoji.emoji_list(message)]

# Collect the emoji of every message into one list.
all_emojis = [
    emoji_char
    for message in chat_df['message']
    for emoji_char in extract_emojis(message)
]

# Frequency of each emoji, and the 20 most frequent ones.
emoji_counts = Counter(all_emojis)
most_common_emojis = emoji_counts.most_common(20)

print(most_common_emojis)

# Unpacking zip(*...) fails on an empty list, so guard like the other plots do.
if most_common_emojis:
    emojis, counts = zip(*most_common_emojis)

    # Select the emoji font before any text is created, and only for this
    # figure, so that figures drawn afterwards keep their normal font.
    # NOTE(review): 'Segoe UI Emoji' is presumably only installed on Windows;
    # pick another emoji-capable font on other systems.
    with plt.rc_context({'font.family': 'Segoe UI Emoji'}):
        plt.figure(figsize=(12, 8))
        bars = plt.bar(emojis, counts, color='skyblue')

        # Repeat each emoji in a larger size just above its bar.
        for bar, emoji_char in zip(bars, emojis):
            plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), emoji_char, ha='center', va='bottom', fontsize=20)

        plt.xlabel('Emojis')
        plt.ylabel('Frequency')
        plt.title('Top 20 Most Common Emojis')
        plt.tight_layout()
        plt.show()
else:
    print("No emojis found to plot for 'Top 20 Most Common Emojis'.")
