In [None]:
import re
from collections import Counter
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
import arabic_reshaper
from bidi.algorithm import get_display

## How to obtain WhatsApp chat data

* Open WhatsApp
* Open a Group/Inbox
* Click on the 3 dotted options button
* Click on more
* Click on export chat
* Click on without media 
* Export via Email/other IM's/....
* Download the file to your system, rename it to chat-data.txt, and put it in a folder

![](https://i.imgur.com/KldS1n5.png)


```
Without media: exports 40k messages 
With media: exports 10k messages along with pictures/videos 
As I am doing chat data analysis, I went with the `without media` option
```

## Data Preprocessing

* Regex cheatsheet
    * https://www.rexegg.com/regex-quickstart.html
* Regex test - live
    * https://regexr.com/
* Datetime format
    * http://strftime.org/
    
Use a custom regex and datetime format, referring to the links above, if you run into an empty df or format errors, as the exports from WhatsApp are not standardized.

In [None]:
def raw_to_df(file, key):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Parameters
    ----------
    file : str
        Path to the exported chat ``.txt`` file. It is decoded as UTF-8.
    key : str
        Name of the timestamp layout used by the export: ``"12hr"``,
        ``"24hr"`` or ``"custom"``. The ``"custom"`` entries are empty
        placeholders to be filled in by hand.

    Returns
    -------
    pandas.DataFrame
        One row per message with the columns ``date_time`` (parsed
        timestamps), ``user`` and ``msg``. Entries without a ``name: ``
        prefix are attributed to the pseudo-user ``"grp_notif"``.

    Raises
    ------
    KeyError
        If ``key`` is not one of the known layout names.
    ValueError
        If a matched timestamp cannot be parsed with the layout's
        datetime format (raised by ``pd.to_datetime``).
    """
    # Raw strings keep the regex escapes (\d, \s) from being read as
    # (invalid) Python string escapes.
    split_formats = {
        "12hr": r"\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s",
        "24hr": r"\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s",
        "custom": "",
    }
    datetime_formats = {
        "12hr": "%m/%d/%Y, %I:%M %p - ",
        "24hr": "%m/%d/%y, %H:%M - ",
        "custom": "",
    }
    types_of_encoding = [
        "utf-8"
    ]  # "utf-8"#,"cp1252","utf8","cp850","utf-16-le","utf-32-le"]
    for encoding_type in types_of_encoding:
        with codecs.open(file, encoding=encoding_type, errors="strict") as raw_data:
            # Multi-line messages are flattened onto one line; every message
            # therefore ends with a single trailing space.
            raw_string = " ".join(raw_data.read().split("\n"))
            # The text before the first timestamp is discarded ([1:]), so the
            # two lists line up one-to-one.
            user_msg = re.split(split_formats[key], raw_string)[1:]
            date_time = re.findall(split_formats[key], raw_string)
            df_ = pd.DataFrame({"date_time": date_time, "user_msg": user_msg})
    # Convert the timestamp strings to datetimes; the format must describe the
    # whole matched string, including the trailing " - " separator.
    df_["date_time"] = pd.to_datetime(df_["date_time"], format=datetime_formats[key])
    # Split each entry into the sender and the message body.
    usernames = []
    msgs = []
    for element in df_["user_msg"]:
        # Split only on the FIRST "name: " prefix. Without maxsplit=1 a message
        # that itself contains ": " is split again and its text is lost.
        matched_pattern_list = re.split(r"([\w\W]+?):\s", element, maxsplit=1)
        if matched_pattern_list[1:]:  # user typed messages
            usernames.append(matched_pattern_list[1])
            msgs.append(matched_pattern_list[2])
        else:  # other notifications in the group (someone was added, left, ...)
            usernames.append("grp_notif")
            msgs.append(matched_pattern_list[0])
    df_["user"] = usernames
    df_["msg"] = msgs
    # Drop the combined column from the local frame (not a global ``df``).
    df_ = df_.drop(columns="user_msg")
    return df_

### Import data

In [None]:
# Ask for the path of the exported chat file and parse it with the "24hr"
# timestamp layout.
file_ = input("Enter txt file name: ")
df = raw_to_df(file_, "24hr")

In [None]:
# Preview the last rows of the parsed frame.
print(df.tail())

In [None]:
print(df.shape)  # (rows, columns); each row is one parsed chat entry

In [None]:
# User whose messages the single-user cells below filter on; it is compared
# against the "user" column with ==.
usr_name = input("Enter user name to run analytics: ")

### Data Cleaning

In [None]:
# Rows whose whole message is the "<Media omitted> " placeholder (note the
# trailing space) stand for media attachments.
images = df.loc[df["msg"].eq("<Media omitted> ")]
print(images.shape)

In [None]:
# List every distinct value in the "user" column.
print(df["user"].unique())

In [None]:
# Rows attributed to the "grp_notif" pseudo-user, i.e. group notifications.
grp_notif = df.loc[df["user"].eq("grp_notif")]
print(grp_notif.shape)

In [None]:
# Remove media placeholders first, then group notifications, in place.
for unwanted_rows in (images, grp_notif):
    df.drop(unwanted_rows.index, inplace=True)

In [None]:
# First rows of the frame.
print(df.head())

In [None]:
# Last rows of the frame.
print(df.tail())

In [None]:
# Renumber the rows 0..n-1 in place and discard the old index labels.
df.reset_index(inplace=True, drop=True)
print(df.shape)

Most active / least active members of the group.

In [None]:
# Message count per user, most active first. The Series repr is interpolated
# directly after the "total messages per " prefix.
print(
    f"total messages per {df.groupby('user')['msg'].count().sort_values(ascending=False)}"
)

Emoji count

In [None]:
# Count which emojis ``usr_name`` uses most often and print the top ten.
#
# The emoji package changed its lookup table between releases: 2.x exposes
# ``EMOJI_DATA``, 1.x nests ``UNICODE_EMOJI`` by language code, and older
# releases keep a flat ``UNICODE_EMOJI`` dict. Taking ``.keys()`` of the nested
# form would yield language codes ("en", "es", ...) instead of emojis.
emoji_table = getattr(emoji, "EMOJI_DATA", None)
if emoji_table is None:
    emoji_table = emoji.UNICODE_EMOJI
    if "en" in emoji_table:
        emoji_table = emoji_table["en"]

# Longest alternatives first, so a multi-codepoint emoji (skin tones, ZWJ
# sequences) is matched whole rather than as its shorter prefix.
emojis_list = sorted(
    {"".join(x.split()) for x in emoji_table},
    key=len,
    reverse=True,
)
r = re.compile("|".join(re.escape(p) for p in emojis_list))

emoji_ctr = Counter()
for msg_text in df.loc[df["user"] == usr_name, "msg"]:
    emoji_ctr.update(r.findall(msg_text))
for item in emoji_ctr.most_common(10):
    print(item[0] + " - " + str(item[1]))

Single user time series analysis

In [None]:
# Hour of day (0-23) of every message, taken with the vectorised datetime
# accessor instead of a per-row Python lambda.
df["hour"] = df["date_time"].dt.hour
# Bar chart of the selected user's message count per hour.
df[df["user"] == usr_name].groupby(["hour"]).size().sort_index().plot(
    x="hour", kind="bar", title=usr_name
)

User message count

In [None]:
# English weekday name of every message, taken with the vectorised datetime
# accessor instead of a per-row Python lambda.
df["weekday"] = df["date_time"].dt.day_name()
df["is_weekend"] = df.weekday.isin(["Sunday", "Saturday"])
# Messages per user, most active first.
msgs_per_user = df["user"].value_counts(sort=True)
print(msgs_per_user)

Top n Users

In [None]:
# Number of most-active users to keep for the per-user comparisons.
n_users = int(input("Enter how many top n users for analysis?: "))
# msgs_per_user is sorted by count, so its leading index labels are the top users.
top_n_users = msgs_per_user.index[:n_users].tolist()
print(top_n_users)

In [None]:
# Keep only the top-n users' rows. Copying AFTER the filter makes df_top_n an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
df_top_n = df[df.user.isin(top_n_users)].copy()
# Messages per user, split by weekday.
plt.figure(figsize=(30, 10))
sns.countplot(x="user", hue="weekday", data=df_top_n)

Weekend vs Weekday Analysis

In [None]:
# Flag Saturday/Sunday messages in the top-n subset, then plot each user's
# message count split by that flag.
df_top_n["is_weekend"] = df_top_n.weekday.isin(["Sunday", "Saturday"])
plt.figure(figsize=(20, 10))
sns.countplot(x="user", hue="is_weekend", data=df_top_n)

In [None]:
def word_count(val_):
    """Return the number of whitespace-separated tokens in ``val_``."""
    tokens = val_.split()
    return len(tokens)

In [None]:
# Word count of every message.
df["no_of_words"] = df["msg"].apply(word_count)

In [None]:
# Word count of every message in the top-n subset.
df_top_n["no_of_words"] = df_top_n["msg"].apply(word_count)

In [None]:
# Total words sent on weekdays. The mask must be inverted element-wise with ~;
# ``df["is_weekend"] is False`` is an identity test that yields plain False,
# and indexing the frame with it raises KeyError.
total_words_weekday = df.loc[~df["is_weekend"], "no_of_words"].sum()
print(total_words_weekday)

In [None]:
# Total words in messages flagged as weekend.
total_words_weekend = df[df["is_weekend"]]["no_of_words"].sum()
print(total_words_weekend)

In [None]:
# NOTE(review): dividing by 5 spreads the all-time weekday total over the five
# weekday names, not over the number of calendar days in the chat — confirm
# that this is the intended average.
print(
    f"average words on a weekday: {total_words_weekday / 5}"
)  # average words on a weekday

In [None]:
# NOTE(review): dividing by 2 spreads the all-time weekend total over Saturday
# and Sunday, not over the number of calendar days in the chat — confirm that
# this is the intended average.
print(
    f"average words on a weekend: {total_words_weekend / 2}"
)  # average words on a weekend

In [None]:
# Total words written by each user, largest first.
df.groupby("user")["no_of_words"].sum().sort_values(ascending=False)

In [None]:
# Average words per message for each top-n user (total words divided by
# message count), largest first.
(
    df_top_n.groupby("user")["no_of_words"].sum() / df_top_n.groupby("user").size()
).sort_values(ascending=False)

In [None]:
# Average words per message for each (user, is_weekend) pair in the top-n
# subset: total words divided by message count within each group.
wordPerMsg_weekday_vs_weekend = (
    df_top_n.groupby(["user", "is_weekend"])["no_of_words"].sum()
    / df_top_n.groupby(["user", "is_weekend"]).size()
)
print(wordPerMsg_weekday_vs_weekend)

In [None]:
# Horizontal bar chart with one bar per (user, is_weekend) pair.
wordPerMsg_weekday_vs_weekend.plot(kind="barh")

## Q 5)
## Most Usage - Time of Day

In [None]:
# Message count for every (hour, weekday) combination that occurs in the chat.
msg_per_time = df.groupby(["hour", "weekday"])["msg"].size().reset_index()
# Reshape to hours x weekdays. The arguments are passed by keyword because
# pandas 2.x no longer accepts them positionally.
pivot_msg_per_time = msg_per_time.pivot(index="hour", columns="weekday", values="msg")
print(pivot_msg_per_time.head())

In [None]:
# Calendar order for the heatmap columns.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# reindex() orders the columns and adds an empty column for any weekday with no
# messages (filled with 0 below); plain [days] would raise KeyError.
sns.heatmap(
    pivot_msg_per_time.reindex(columns=days).fillna(0), cmap="YlGnBu", robust=True
)

## Q 6)
## In any group, do I have any inclination towards responding to someone? 

In [None]:
# Index labels of every message written by the selected user.
my_msgs_index = np.array(df.index[df["user"] == usr_name])
print(my_msgs_index, my_msgs_index.shape)

In [None]:
# The message being "replied to" is approximated by the row directly before
# each of the user's own messages.
prev_msgs_index = my_msgs_index - 1
# A message at position 0 has no predecessor; keeping -1 would make positional
# indexing wrap around to the last row of the frame.
prev_msgs_index = prev_msgs_index[prev_msgs_index >= 0]
print(prev_msgs_index, prev_msgs_index.shape)

In [None]:
# Positionally select the rows listed in prev_msgs_index into an independent copy.
df_replies = df.iloc[prev_msgs_index].copy()
print(df_replies.shape)

In [None]:
# Number of rows in df_replies per user, smallest first, as horizontal bars.
df_replies.groupby(["user"])["msg"].size().sort_values().plot(kind="barh")

## Q 7)
## Which are the most common words?

In [None]:
# Build a NEW stop-word set. ``STOPWORDS.update(...)`` returns None and mutates
# the set shared by the whole wordcloud package, so ``stopwords`` used to be
# None.
stopwords = STOPWORDS.union(
    [
        "lo",
        "hai",
        "ge",
        "Lo",
        "illa",
        "yea",
        "ella",
        "en",
        "na",
        "En",
        "yeah",
        "alli",
        "ide",
        "okay",
        "ok",
        "will",
    ]
)

# Collect the lower-cased tokens of every message, then join once; repeated
# string concatenation in the loop is quadratic in the corpus size.
all_tokens = []
for val in df.msg.values:
    # Reshape and reorder the text via arabic_reshaper / python-bidi before
    # tokenising.
    val = arabic_reshaper.reshape(str(val))
    val = get_display(val)
    all_tokens.extend(token.lower() for token in val.split())
COMMENT_WORDS = " ".join(all_tokens)

wordcloud = WordCloud(
    font_path="arial",
    width=800,
    height=800,
    background_color="black",
    stopwords=stopwords,
    min_font_size=10,
).generate(COMMENT_WORDS)

In [None]:
# Convert the word cloud to an image so it is displayed as the cell output.
wordcloud.to_image()