In [47]:
import re
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

In [48]:
# Path to the uploaded WhatsApp chat file
file_path = 'WhatsApp Chat with ⛳गुरु गोविंद सिंह शाखा⛳.txt'

def parse_messages(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into messages using the date and time as separator
    messages = re.split(r'\n(?=\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[AP]M\s-\s)', content)

    # Remove any leading/trailing whitespace from messages
    messages = [msg.strip() for msg in messages if msg.strip()]

    return messages



In [49]:
def format_message(message):
    # Extract the date, time, and content from the message
    match = re.match(r'(\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s*[AP]M)\s-\s(.*)', message, re.DOTALL)
    if match:
        datetime_str, content = match.groups()
        # Replace non-ASCII characters with a standard space
        datetime_str = re.sub(r'[^\x00-\x7F]+', ' ', datetime_str)
        # Try parsing with both 2-digit and 4-digit year formats
        try:
            dt = datetime.strptime(datetime_str, '%m/%d/%y, %I:%M %p')
        except ValueError:
            try:
                dt = datetime.strptime(datetime_str, '%m/%d/%Y, %I:%M %p')
            except ValueError:
                return {'date_time': None, 'author': None, 'message': f"Unable to parse date: {datetime_str}\n{content}"}
        formatted_dt = dt.strftime('%Y-%m-%d %H:%M')
        # Check if it's a system message or a user message
        if ': ' in content:
            author, text = content.split(': ', 1)
            return {'date_time': formatted_dt, 'author': author, 'message': text}
        else:
            return {'date_time': formatted_dt, 'author': 'System', 'message': content}
    return {'date_time': None, 'author': None, 'message': message}




In [62]:
# Parse the chat
distinct_messages = parse_messages(file_path)

# Format messages and collect them in a list
formatted_messages = [format_message(msg) for msg in distinct_messages]

# Create a DataFrame
df = pd.DataFrame(formatted_messages, columns=['date_time', 'author', 'message'])

# Display the DataFrame
df

Unnamed: 0,date_time,author,message
0,2022-02-12 00:05,System,Messages and calls are end-to-end encrypted. N...
1,2022-06-30 22:23,Devidasji Sowane Mumbai,*जुलै मास का सांघिक गीत*\n\nनवचैतन्य हिलोरे ले...
2,2022-06-30 22:23,Devidasji Sowane Mumbai,<Media omitted>
3,2022-09-18 20:27,Ram Ji Umate,<Media omitted>
4,2022-10-05 19:11,Devidasji Sowane Mumbai,*यतो धर्म: स्ततो जय:*\n\n*जहाँ धर्म (अपने कर्त...
5,2022-10-05 23:13,Aage Vilas,<Media omitted>
6,2022-10-09 08:45,C.A.Rawat,<Media omitted>
7,2022-10-24 17:46,Ram Ji Umate,<Media omitted>
8,2022-10-24 18:26,Aage Vilas,<Media omitted>
9,2022-10-24 18:26,Aage Vilas,<Media omitted>


In [51]:
filtered_df = df[df['message'].str.contains('Aaj ki|आज की संख्या', case=False, na=False)]
# Permanently changes the pandas settings
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)
filtered_df.shape

(0, 3)

In [52]:
 print(df[df['date_time'].str.contains('2024-06-27 19:53', case=False, na=False)]['message'])

Series([], Name: message, dtype: object)


In [58]:
# Function to extract counts
def extract_counts(message):
    counts = {
        'shishu': 0,
        'bal': 0,
        'tarun': 0,
        'abhayagat': 0,
        'total': 0
    }
    patterns = {
        'shishu': r'शिशु\s*[-\s]*\s*(\d+)|Shishu\s*[-\s]*\s*(\d+)',
        'bal': r'बाल\s*[-\s]*\s*(\d+)|Bal\s*[-\s]*\s*(\d+)',
        'tarun': r'तरुण\s*[-\s]*\s*(\d+)|Tarun\s*[-\s]*\s*(\d+)',
        'abhayagat': r'अभ्यागत\s*[-\s]*\s*(\d+)|Abhyagat\s*[-\s]*\s*(\d+)',
        'total': r'कुल\s*[-\s]*\s*(\d+)|Total\s*[-\s]*\s*(\d+)'
    }
    for key, pattern in patterns.items():
        matches = re.findall(pattern, message)
        for match in matches:
            counts[key] += int(match[0] or match[1])
    return counts

# Apply the function to the 'message' column
filtered_df['counts'] = filtered_df['message'].apply(extract_counts)

# Expand the counts dictionary into separate columns
counts_df = filtered_df['counts'].apply(pd.Series)
counts_df.shape
# # Merge the counts_df with the filtered_df
# result_df = filtered_df[['date_time']].join(counts_df)

# print(result_df)


(0,)

In [56]:
# counts_df['total'].value_counts()

In [57]:
counts_df[counts_df['total']==0].shape


KeyError: 'total'

In [59]:
# Assuming you have already executed the previous code and have 'filtered_df' and 'counts_df' available

# Filter counts_df for rows where 'total' is 0
zero_total_counts = counts_df[counts_df['total'] == 0]

# Get indices of these rows
zero_total_indices = zero_total_counts.index

# Filter filtered_df based on these indices
zero_total_filtered_df = filtered_df.iloc[zero_total_indices]

print(zero_total_filtered_df)


KeyError: 'total'

In [60]:
final = pd.DataFrame()
final = pd.concat([filtered_df, counts_df], axis=1, join="inner")
final

Unnamed: 0,date_time,author,message,counts,counts.1


In [61]:
final.columns


Index(['date_time', 'author', 'message', 'counts', 'counts'], dtype='object')