# Getting Started

### Libraries Required

* numpy
* pandas
* matplotlib
* seaborn
* wordcloud
* emoji
* jovian (optional)

Install all of the above libraries with the following command:
```
pip install numpy pandas matplotlib seaborn wordcloud emoji jovian --upgrade
```

In [None]:
import re
import jovian
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import emoji
from collections import Counter

### WhatsApp Chat Data

* Open WhatsApp
* Open a Group/Inbox
* Click on the 3 dotted options button
* Click on More
* Click on Export Chat
* Choose "Without Media" (we are analyzing only text messages here)
* Export via email, another messaging app, or any other sharing option.
* Download it to your system, rename it to chat-data.txt (the file name loaded in the "Import Data" step) and put it in the same folder as this notebook.

### Data Processing

In [None]:
def rawToDf(file, key):
    """Parse an exported WhatsApp chat text file into a DataFrame.

    Parameters
    ----------
    file : str or path-like
        Path of the exported chat; it is read as UTF-8.
    key : str
        Which timestamp layout the export uses: '12hr', '24hr' or 'custom'.
        The 'custom' entries below are empty placeholders that must be
        filled in before that key can be used.

    Returns
    -------
    pandas.DataFrame
        One row per timestamped entry with the columns ``date_time``
        (parsed timestamps), ``user`` (sender name, or "grp_notif" for
        entries without a "<name>: " prefix) and ``msg`` (message text).

    Raises
    ------
    KeyError
        If ``key`` is not one of the configured formats.
    ValueError
        If the format selected by ``key`` is still an empty placeholder.
    """
    # Raw strings keep the backslashes literal without invalid-escape warnings.
    split_formats = {
        '12hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s[APap][mM]\s-\s',
        '24hr' : r'\d{1,2}/\d{1,2}/\d{2,4},\s\d{1,2}:\d{2}\s-\s',
        'custom' : ''
    }
    # NOTE(review): the patterns above accept 2-4 digit years while these
    # formats use %y; confirm what the exports to be analysed actually contain.
    datetime_formats = {
        '12hr' : '%m/%d/%y, %I:%M %p - ',
        '24hr' : '%m/%d/%y, %H:%M - ',
        'custom': ''
    }

    split_pattern = split_formats[key]
    datetime_format = datetime_formats[key]
    if not split_pattern or not datetime_format:
        # An empty pattern would split the chat between every character.
        raise ValueError(
            "The '%s' format is an empty placeholder; fill in its split "
            "pattern and datetime format before using it." % key
        )

    with open(file, 'r', encoding="utf8") as raw_data:
        # Messages can span several lines, so newlines become spaces and the
        # whole chat is handled as one string.
        raw_string = raw_data.read().replace('\n', ' ')

    timestamp_re = re.compile(split_pattern)
    # Text between timestamps ("<user>: <message>"); the chunk before the
    # first timestamp is dropped.
    user_msg = timestamp_re.split(raw_string)[1:]
    # The timestamps themselves, in the same order as the chunks above.
    date_time = timestamp_re.findall(raw_string)

    df = pd.DataFrame({'date_time': date_time, 'user_msg': user_msg})

    # The format describes the whole matched prefix, including the " - " tail.
    df['date_time'] = pd.to_datetime(df['date_time'], format=datetime_format)

    # Split each chunk into sender and text at the FIRST "<name>: " only, so a
    # message that itself contains ": " is kept whole.
    sender_re = re.compile(r'([\w\W]+?):\s')
    usernames = []
    msgs = []
    for entry in df['user_msg']:
        parts = sender_re.split(entry, maxsplit=1)
        if len(parts) > 1:  # user typed message
            usernames.append(parts[1])
            msgs.append(parts[2])
        else:  # no sender prefix (e.g. someone was added, someone left ...)
            usernames.append("grp_notif")
            msgs.append(parts[0])

    df['user'] = usernames
    df['msg'] = msgs

    # The combined column is no longer needed once it has been split.
    df.drop('user_msg', axis=1, inplace=True)

    return df

### Import Data

In [None]:
# Parse the exported chat; '12hr' selects the AM/PM timestamp layout in rawToDf.
df = rawToDf('chat-data.txt', '12hr')

In [None]:
# Peek at the last parsed rows to check that the split into columns worked.
df.tail()

In [None]:
# (rows, columns): the row count is the number of parsed entries, including
# media placeholders and group notifications at this point.
df.shape

In [None]:
# Name compared against the `user` column to pick out "my" messages in the
# questions below; it must match the exported sender name exactly.
me = "Ashutosh Krishna"

### Data Cleaning

Let's delete the messages that contained media. Because the chat was exported without media, each of them appears above only as a `<Media omitted>` placeholder.

In [None]:
# Rows whose whole message is the media placeholder. The trailing space in the
# literal matches the space rawToDf leaves where the line break used to be.
media = df[df['msg']=="<Media omitted> "]
# (rows, columns) of the placeholder rows, i.e. how many media messages there are.
media.shape

In [None]:
# Distinct sender labels, including the synthetic "grp_notif" label.
df["user"].unique()

In [None]:
# Rows that rawToDf labelled as group notifications (no "<name>: " prefix).
grp_notif = df[df['user']=="grp_notif"]
# (rows, columns) of the notification rows.
grp_notif.shape

In [None]:
# Remove both groups of rows in place, by index label, so that only
# user-typed text messages remain.
df.drop(media.index, inplace=True)  # media placeholders
df.drop(grp_notif.index, inplace=True)  # group notifications

In [None]:
# Last rows after cleaning; the index labels still have gaps at this point.
df.tail()

In [None]:
# Renumber the rows 0..n-1 (discarding the old labels) so that index labels and
# row positions coincide; the reply analysis further down relies on that.
df.reset_index(inplace=True, drop=True)
# (rows, columns) after cleaning.
df.shape

# Let's Answer Some Questions

## Q. Who are the least active and most active persons in the group?

In [None]:
# Messages per user, most active first; the least active users are at the bottom.
df.groupby("user")["msg"].count().sort_values(ascending=False)

## Q. How many emojis have I used?

In [None]:
# Count every emoji that appears in the messages sent by `me`.
emoji_ctr = Counter()

# The emoji table has moved between releases of the `emoji` package: recent
# releases expose it as EMOJI_DATA, older ones as UNICODE_EMOJI (either a plain
# emoji -> name mapping or one such mapping per language code).
emoji_table = getattr(emoji, "EMOJI_DATA", None)
if emoji_table is None:
    legacy_table = emoji.UNICODE_EMOJI
    emoji_table = legacy_table.get("en", legacy_table)

# Strip stray whitespace from the keys, drop empties, and try longer sequences
# first: regex alternation takes the first alternative that matches, so a short
# emoji listed early would otherwise split a multi-codepoint one.
emojis_list = sorted(
    {''.join(symbol.split()) for symbol in emoji_table} - {''},
    key=len,
    reverse=True,
)
r = re.compile('|'.join(re.escape(p) for p in emojis_list))

# Only my own messages are scanned; Counter.update tallies each match.
for my_msg in df.loc[df["user"] == me, "msg"]:
    emoji_ctr.update(r.findall(my_msg))

In [None]:
# Show my ten most used emojis together with how often each one occurs.
for symbol, times_used in emoji_ctr.most_common(10):
    print(f"{symbol} - {times_used}")

## Q. What does my WhatsApp activity tell about my sleep cycle?

In [None]:
# Hour of day of every message, taken with the vectorised datetime accessor
# instead of calling a Python lambda once per row.
df['hour'] = df['date_time'].dt.hour
# Bar chart of how many messages I sent in each hour; the quiet hours hint at
# when I sleep. The bars are labelled by the Series index (the hour).
df[df['user'] == me].groupby('hour').size().sort_index().plot(kind='bar')

## Let's take Week Days and Weekends into consideration

## Q. How many words do I type on average on weekday vs weekend?

In [None]:
# Name of the weekday ("Monday" ... "Sunday") on which each message was sent.
df['weekday'] = df['date_time'].dt.day_name()

In [None]:
# True for messages sent on a Saturday or Sunday, False otherwise.
weekend_days = ['Sunday', 'Saturday']
df['is_weekend'] = df['weekday'].isin(weekend_days)

In [None]:
# Number of messages per user, sorted from the most to the least active.
msgs_per_user = df['user'].value_counts(sort=True)
msgs_per_user

Who are the top 5 message senders?

In [None]:
# Names of the five users with the most messages (fewer if the chat has fewer users).
top5_users = list(msgs_per_user.index[:5])
top5_users

In [None]:
# Keep only the rows written by the top-5 users. The copy is taken AFTER
# filtering so df_top5 is an independent frame: columns are assigned to it
# further down, which on a filtered slice can raise SettingWithCopyWarning.
df_top5 = df[df['user'].isin(top5_users)].copy()
df_top5.head()

In [None]:
# Messages per user, with one bar per day of the week.
# NOTE(review): this plots the full `df` (every user), not `df_top5` - confirm
# that this is intended given the top-5 frame built just above.
plt.figure(figsize=(30,10))
sns.countplot(x="user", hue="weekday", data=df)

In [None]:
# Flag Saturday/Sunday rows of the top-5 frame, using the same rule as df['is_weekend'].
df_top5['is_weekend'] = df_top5.weekday.isin(['Sunday', 'Saturday'])

In [None]:
# Weekday vs. weekend message counts for each of the top-5 users.
plt.figure(figsize=(20,10))
sns.countplot(x="user", hue="is_weekend", data=df_top5)

In [None]:
def word_count(val):
    """Return how many whitespace-separated tokens the string ``val`` contains."""
    tokens = val.split()
    return len(tokens)

In [None]:
# Words per message (whitespace-separated tokens) for every message.
df['no_of_words'] = df['msg'].apply(word_count)

In [None]:
# Same word count, stored on the top-5 frame (a separate frame from df).
df_top5['no_of_words'] = df_top5['msg'].apply(word_count)

Total words used in Weekdays

In [None]:
# Sum of the word counts of every message sent Monday to Friday (all users).
sent_on_weekday = ~df['is_weekend']
total_words_weekday = df.loc[sent_on_weekday, 'no_of_words'].sum()
total_words_weekday

Total words used in Weekends

In [None]:
# Sum of the word counts of every message sent on Saturday or Sunday (all users).
sent_on_weekend = df['is_weekend']
total_words_weekend = df.loc[sent_on_weekend, 'no_of_words'].sum()
total_words_weekend

In [None]:
# Weekday total split evenly over the five weekday names (Mon-Fri). This is a
# per-day-of-week share of the whole chat history, not a mean per calendar day.
total_words_weekday/5 

In [None]:
# Weekend total split evenly over the two weekend day names (Sat, Sun). As
# above, this is a per-day-of-week share, not a mean per calendar day.
total_words_weekend/2 

Number of words used by users in descending order

In [None]:
# Total words typed by each user, largest total first.
df.groupby('user')['no_of_words'].sum().sort_values(ascending=False)

In [None]:
# Average words per message for each top-5 user, wordiest first.
top5_by_user = df_top5.groupby('user')
words_typed = top5_by_user['no_of_words'].sum()
msgs_sent = top5_by_user.size()
(words_typed / msgs_sent).sort_values(ascending=False)

In [None]:
# Average words per message for each (user, is_weekend) pair among the top-5 users.
top5_by_user_and_weekend = df_top5.groupby(['user', 'is_weekend'])
wordPerMsg_weekday_vs_weekend = (
    top5_by_user_and_weekend['no_of_words'].sum()
    / top5_by_user_and_weekend.size()
)
wordPerMsg_weekday_vs_weekend

In [None]:
# Horizontal bars, one per (user, is_weekend) pair, of the averages computed above.
wordPerMsg_weekday_vs_weekend.plot(kind='barh')

## Q. At what time of day do I use WhatsApp most?

In [None]:
# Message count for every (hour, weekday) combination that occurs in the chat;
# after reset_index the counts sit in the column named 'msg'.
x = df.groupby(['hour', 'weekday'])['msg'].size().reset_index()
# Reshape to an hour x weekday grid. The arguments are passed by keyword
# because recent pandas releases no longer accept them positionally.
x2 = x.pivot(index="hour", columns='weekday', values='msg')
x2.head()

In [None]:
# Calendar order for the heatmap columns (the pivot sorts them alphabetically).
days = ["Monday", 'Tuesday', "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
# reindex, unlike x2[days], does not fail when a weekday has no messages at all:
# the missing column is created empty and then filled with 0 like other gaps.
sns.heatmap(x2.reindex(columns=days).fillna(0), robust=True)

## Q. Whom did I respond to the most in the group?

In [None]:
# Index labels of all my messages as a NumPy array. Labels equal row positions
# provided the index was reset to 0..n-1 after cleaning.
my_msgs_index = np.array(df[df['user']==me].index)
print(my_msgs_index, my_msgs_index.shape)

In [None]:
# Position of the message that came right before each of my messages. A message
# of mine at position 0 has no predecessor, so it is skipped: otherwise its
# "previous" position would be -1, which positional indexing reads as the LAST row.
prev_msgs_index = my_msgs_index[my_msgs_index > 0] - 1
print(prev_msgs_index, prev_msgs_index.shape)

In [None]:
# The messages I (presumably) replied to: the rows directly before mine, picked
# by position. Runs of my own consecutive messages show up here as "me".
df_replies = df.iloc[prev_msgs_index].copy()
df_replies.shape

In [None]:
# How often each user wrote the message just before one of mine, as horizontal
# bars sorted in ascending order of count.
df_replies.groupby(["user"])["msg"].size().sort_values().plot(kind='barh')

In [None]:
# Build one lower-cased, space-separated string from every word of every message.
# A single join replaces repeated string concatenation, which re-copies the
# growing text on each step; the resulting string is the same (leading space,
# and one space after every word).
comment_words = ' ' + ''.join(
    word.lower() + ' '
    for val in df.msg.values
    for word in str(val).split()
)

# No `stopwords` argument is passed, so WordCloud falls back to its own default.
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='black',
                min_font_size = 10).generate(comment_words)

In [None]:
# Render the generated cloud as an image so the notebook displays it.
wordcloud.to_image()