In [None]:
import regex
import pandas as pd
import numpy as np
import emoji
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px

In [None]:
def date_time(s):
    """Return True when ``s`` begins with a chat timestamp header.

    The accepted prefix is ``<digits>/<digits>/<digits>, <digits>:<digits>``,
    optionally followed by one whitespace character and an AM/PM marker, and
    then a space and a dash.

    Args:
        s: One line of text from the exported chat.

    Returns:
        bool: True if the line starts with such a header, otherwise False.
    """
    # Raw string: the previous non-raw literal contained the escape "\/",
    # which is not a valid Python string escape and triggers a warning when
    # the cell is compiled.
    # "\s?" replaces a literal optional space so that a no-break space
    # between the time and AM/PM is accepted as well.
    # NOTE(review): some exports appear to use U+202F there - confirm against
    # the actual chat file.
    pattern = r'^[0-9]+/[0-9]+/[0-9]+, [0-9]+:[0-9]+\s?(?:AM|PM|am|pm)? -'
    return regex.match(pattern, s) is not None

def find_author(s):
    """Return True when ``s`` has the form ``<sender>: <text>``.

    Only the first ``': '`` is treated as the sender separator, so any further
    colons in the text (times such as ``10:30``, URLs, ...) no longer cause an
    authored message to be rejected.

    Args:
        s: The part of a chat line that follows the timestamp header.

    Returns:
        bool: True if a non-empty sender precedes a ``': '`` separator,
        otherwise False.

    Note:
        A sender-less line that itself contains ``': '`` is still reported as
        authored; this check cannot tell the two apart.
    """
    sender, separator, _ = s.partition(': ')
    return bool(separator) and bool(sender)
    
def getDataPoint(line):
    """Split one timestamped chat line into its components.

    The line is cut at the first ``' - '`` into a timestamp and a remainder.
    The timestamp is split on ``', '`` into date and time. If ``find_author``
    accepts the remainder, it is cut at the first ``': '`` into author and
    message. Splitting only once keeps any later ``' - '`` or ``': '`` inside
    the message text intact.

    Args:
        line: A chat line that starts with a timestamp header.

    Returns:
        tuple: ``(date, time, author, message)``; ``author`` is None when no
        sender could be extracted.

    Raises:
        ValueError: If the timestamp part does not split into exactly two
            pieces on ``', '``.
    """
    dateTime, _, message = line.partition(' - ')
    date, time = dateTime.split(', ')

    author = None
    if find_author(message):
        sender, separator, body = message.partition(': ')
        # Only accept the split when a real "sender: " prefix exists;
        # otherwise keep the whole remainder as the message text.
        if separator:
            author, message = sender, body
    return date, time, author, message

In [None]:
# Parse the exported chat into rows of [date, time, author, message].
# A line that starts with a timestamp header opens a new message; every other
# line is a continuation of the message currently being buffered.
data = []
conversation = 'WhatsApp Chat - Kashish Vit.txt'
with open(conversation, encoding="utf-8") as fp:
    # The first line is discarded without being parsed.
    # NOTE(review): presumably the export's introductory notice - confirm.
    fp.readline()
    messageBuffer = []
    date, time, author = None, None, None
    for line in fp:
        line = line.strip()
        if date_time(line):
            # A new header closes the message collected so far.
            if messageBuffer:
                data.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line)
            messageBuffer.append(message)
        else:
            messageBuffer.append(line)
    # Flush the final message: no further header follows it, so without this
    # the last message in the file would be silently dropped.
    if messageBuffer:
        data.append([date, time, author, ' '.join(messageBuffer)])

In [None]:
# Build the message table from the parsed rows and convert the date column.
df = pd.DataFrame(data, columns=['Date', 'Time', 'Author', 'Message'])
# NOTE(review): the day/month order is left to pandas' inference here; confirm
# it matches the export's locale (e.g. by passing dayfirst= explicitly).
df["Date"] = pd.to_datetime(df["Date"])
print(df.tail(20))
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print(df.Author.unique())

In [None]:
# One row per parsed message, so the row count is the message total.
total_messages = len(df)
print(total_messages)

In [None]:
# Count the rows whose text is exactly the media placeholder.
media_messages = len(df[df['Message'].eq('<Media omitted>')])
print(media_messages)

In [None]:
def split_count(text):
    """Return the grapheme clusters of ``text`` that contain an emoji.

    The text is split into extended grapheme clusters (``\\X``), and a cluster
    is kept when at least one of its characters is recognised as an emoji.

    Args:
        text: Message text to scan.

    Returns:
        list: The matching clusters in order of appearance, duplicates kept.
    """
    # emoji.UNICODE_EMOJI was removed in emoji 2.0, so prefer the supported
    # emoji.is_emoji() predicate when the installed release provides it.
    is_emoji = getattr(emoji, 'is_emoji', None)
    if is_emoji is None:
        # Older releases: fall back to the legacy lookup table.
        # NOTE(review): some 1.x releases appear to key this table by language
        # code, hence the 'en' lookup - confirm for the pinned version.
        legacy_table = emoji.UNICODE_EMOJI
        legacy_table = legacy_table.get('en', legacy_table)
        is_emoji = legacy_table.__contains__

    return [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(is_emoji(char) for char in cluster)
    ]

# Store each message's emoji list, then add up the list lengths.
df['emoji'] = df["Message"].apply(split_count)
total_emojis = df['emoji'].map(len).sum()
print(total_emojis)

In [None]:
# A link is "http://" or "https://" followed by a run of non-whitespace.
URLPATTERN = r'(https?://\S+)'
# Per-message link count, then the total over all messages.
df['urlcount'] = df['Message'].map(lambda message: len(regex.findall(URLPATTERN, message)))
links = df['urlcount'].sum()

In [None]:
# Print the headline totals computed in the cells above, one per line.
summary_rows = (
    ("Total Messages:", total_messages),
    ("Total Media Shared:", media_messages),
    ("Total Emojis Used:", total_emojis),
    ("Total Links Shared:", links),
)
for label, value in summary_rows:
    print(label, value)

In [None]:
# Separate media placeholders from text messages and add per-message counts.
media_messages_df = df[df['Message'] == '<Media omitted>']
messages_df = df.drop(media_messages_df.index)
messages_df['Letter_Count'] = messages_df['Message'].apply(len)
# split() without an argument splits on runs of whitespace and yields no empty
# strings, so an empty message counts as 0 words and repeated spaces no longer
# inflate the count (split(' ') counted the empty pieces as words).
messages_df['Word_Count'] = messages_df['Message'].apply(lambda s: len(s.split()))
messages_df["MessageCount"] = 1

# Per-author summary. Rows without an author are not part of the groupby
# result, so they are not reported here.
messages_per_author = messages_df.groupby("Author")["MessageCount"].sum()
for author, sent in messages_per_author.items():
    req_df = messages_df[messages_df["Author"] == author]
    print(f'Stats of {author} -')
    print('Messages Sent', sent)
    print('Media Messages Sent', media_messages_df[media_messages_df['Author'] == author].shape[0])
    print('Emojis Sent', sum(req_df['emoji'].str.len()))
    print('Links Sent', sum(req_df["urlcount"]))

In [None]:
# Flatten the per-message emoji lists into one list (duplicates kept).
total_emojis_list = [symbol for per_message in messages_df.emoji for symbol in per_message]

# NOTE(review): this rebinds total_emojis to the number of *distinct* emojis,
# replacing the overall emoji total computed in an earlier cell.
total_emojis = len(set(total_emojis_list))

# (emoji, count) pairs, most frequent first. Despite its name, emoji_dict is a
# list of tuples after sorting.
emoji_dict = sorted(
    Counter(total_emojis_list).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in emoji_dict:
    print(pair)

# Pie chart of emoji usage, labelled inside each slice.
emojis_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
fig = px.pie(emojis_df, names='emoji', values='count')
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

In [None]:
# Word cloud over every non-media message in the chat.
text = " ".join(review for review in messages_df.Message)
# Count whitespace-separated words; len(text) reported the number of
# characters while the message claims to report words.
print ("There are {} words in all the messages.".format(len(text.split())))

stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
plt.figure(figsize=(10,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [None]:
# One word cloud per author.
# The stopword set does not change between authors, so build it once.
stopwords = set(STOPWORDS)

# dropna(): rows without a sender carry a missing author. Comparing the
# Author column against that missing value selects no rows, which left an
# empty text and made the word cloud generation fail.
for author in messages_df.Author.dropna().unique():
    text = " ".join(messages_df.loc[messages_df['Author'] == author, 'Message'])
    print(f'Author: {author}')
    if not text.strip():
        # WordCloud.generate raises ValueError when it is given no words.
        print('No text available for a word cloud.')
        continue
    wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
    plt.figure(figsize=(10,10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()