In [244]:
import pandas as pd
import numpy as np
import re

In [245]:
def startsWithDateAndTime(s):
    # regex pattern for date.(Works only for android. IOS Whatsapp export format is different. Will update the code soon
    #pattern = '^([0-9]+)(\/)([0-9]+)(\/)([0-9][0-9]), ([0-9]+):([0-9][0-9]) (AM|PM) -'
    pattern = '^([0-9]+)(\/)([0-9]+)(\/)([0-9][0-9]+), ([0-9]+):([0-9][0-9]) -'
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [246]:
# Finds username of any given format.
def FindAuthor(s):
    patterns = [
        '([\w]+):',                        # First Name
        '([\w]+[\s]+[\w]+):',              # First Name + Last Name
        '([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        '([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
        '([+]\d{2} \d{3} \d{3} \d{4}):',   # Mobile Number (US)
        '([\w]+)[\u263a-\U0001f999]+:',    # Name and Emoji              
    ]
    pattern = '^' + '|'.join(patterns)
    result = re.match(pattern, s)
    if result:
        return True
    return False

In [247]:
def getDataPoint(line):   
    splitLine = line.split(' - ') 
    dateTime = splitLine[0]
    date, time = dateTime.split(', ') 
    message = ' '.join(splitLine[1:])
    if FindAuthor(message): 
        splitMessage = message.split(': ') 
        author = splitMessage[0] 
        message = ' '.join(splitMessage[1:])
    else:
        author = None
    return date, time, author, message

In [248]:
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
# Upload your file here
conversationPath = 'WhatsApp_Busy_School_friends_full.txt' # chat file
f= open(conversationPath, encoding="utf-8")
messageBuffer = [] 
date, time, author = None, None, None

for line in f:
    line = line.strip()
    
    if startsWithDateAndTime(line): 
        if len(messageBuffer) > 0: 
            parsedData.append([date, time, author, ' '.join(messageBuffer)]) 
        messageBuffer.clear() 
        date, time, author, message = getDataPoint(line) 
        messageBuffer.append(message) 
    else:
        messageBuffer.append(line)
    
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe.
df["Date"] = pd.to_datetime(df["Date"])

In [249]:
df.head(100)

Unnamed: 0,Date,Time,Author,Message
0,2018-11-07,15:17,Rakesh Nagarkar,कॅंन्सर (आजिबात घाबरू नका ) हे नाव जरी ऐकलं त...
1,2018-11-07,19:06,Samita,"स्त्रिने एकदा,शिवजिला प्रसन्न केले शिवजी—मी प..."
2,2018-12-07,03:41,Samita,पैसै का दिखावा कितना मेंहगा पडा. दुल्हन 👰 और द...
3,2018-12-07,03:41,Samita,<Media omitted>
4,2018-12-07,05:31,Nutan Vbhs,*‼ॐ श्रीगुरुदेव दत्त‼* सर्व दरवाजे बंद झाले तर...
...,...,...,...,...
95,2018-01-09,03:23,Priti Vbhs,<Media omitted>
96,2018-01-09,03:39,Rakesh Nagarkar,<Media omitted>
97,2018-02-09,05:54,Rakesh Nagarkar,<Media omitted>
98,2018-02-09,16:32,Samita,identify the husband


In [250]:
df = df.dropna()

In [251]:
df.head(50)

Unnamed: 0,Date,Time,Author,Message
0,2018-11-07,15:17,Rakesh Nagarkar,कॅंन्सर (आजिबात घाबरू नका ) हे नाव जरी ऐकलं त...
1,2018-11-07,19:06,Samita,"स्त्रिने एकदा,शिवजिला प्रसन्न केले शिवजी—मी प..."
2,2018-12-07,03:41,Samita,पैसै का दिखावा कितना मेंहगा पडा. दुल्हन 👰 और द...
3,2018-12-07,03:41,Samita,<Media omitted>
4,2018-12-07,05:31,Nutan Vbhs,*‼ॐ श्रीगुरुदेव दत्त‼* सर्व दरवाजे बंद झाले तर...
5,2018-07-13,03:50,Priti Vbhs,<Media omitted>
6,2018-07-13,04:58,Rakesh Nagarkar,💐💐💐🌹🌹🌹💐💐💐 *एखाद्या अडचणीत देवाने तुमचे लगेच ऐ...
7,2018-07-13,05:57,Rakesh Nagarkar,"नातू आजीला म्हणतो --""आजी,आजी तू नाटकात काम करत..."
8,2018-07-13,15:15,Rakesh Nagarkar,"मला एक कळत नाही, संतूर साबणाच्या जाहिरातीत ल..."
9,2018-07-13,16:25,Milan Vbhs,*एक पिढी* *1971/1972/1973/1974/1975/1976/1977/...


In [252]:
df.Author.unique()

array(['Rakesh Nagarkar', 'Samita', 'Nutan Vbhs', 'Priti Vbhs',
       'Milan Vbhs', 'Varsha', 'Ajit Vanjare', 'Datta Kharade',
       'Prafull Shedge'], dtype=object)

In [253]:
df.groupby('Author').Message.count()

Author
Ajit Vanjare       103
Datta Kharade       36
Milan Vbhs          60
Nutan Vbhs          46
Prafull Shedge      37
Priti Vbhs         127
Rakesh Nagarkar    399
Samita             331
Varsha              92
Name: Message, dtype: int64

In [254]:
p_df.head(50)

Unnamed: 0_level_0,Time,Message
Author,Unnamed: 1_level_1,Unnamed: 2_level_1
Ajit Vanjare,05:3905:39,Happy birthday samitaMay god bless you
Datta Kharade,10:23,Happy Birthday samita 💐🎂🍫
Milan Vbhs,05:30,Happy Birthday dear Samita🥰💐🥳🥳
Nutan Vbhs,07:48,Happy birthday Samita 💐🎂
Prafull Shedge,14:3413:54,"Happy birthday Samita, Njoy!!💐🍨🥳🎂👏🏻👏🏻"
Priti Vbhs,13:1013:5408:57,Happy birthday dear sami🎂💄👗👜MastHappy birthday...
Rakesh Nagarkar,05:3713:3913:3904:59,Happy Birthday Dear Sami<Media omitted>Please ...
Samita,05:3805:4306:1309:0413:1914:3519:0419:0510:010...,Thank you RakeshThank you AjitThank you Varsha...
Varsha,06:0505:36,Happy birthday Samita🥳🍫💐🎂Happy birthday dear N...


In [255]:
import regex
import emoji

def split_count(text):

    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

In [256]:
total_messages = df.shape[0]
media_messages = df[df['Message'] == '<Media omitted>'].shape[0]
df["emoji"] = df["Message"].apply(split_count)
emojis = sum(df['emoji'].str.len())
URLPATTERN = r'(https?://\S+)'
df['urlcount'] = df.Message.apply(lambda x: re.findall(URLPATTERN, x)).str.len()
links = np.sum(df.urlcount)

In [257]:
df.head(50)

Unnamed: 0,Date,Time,Author,Message,emoji,urlcount
0,2018-11-07,15:17,Rakesh Nagarkar,कॅंन्सर (आजिबात घाबरू नका ) हे नाव जरी ऐकलं त...,[],4
1,2018-11-07,19:06,Samita,"स्त्रिने एकदा,शिवजिला प्रसन्न केले शिवजी—मी प...",[],0
2,2018-12-07,03:41,Samita,पैसै का दिखावा कितना मेंहगा पडा. दुल्हन 👰 और द...,[👰],0
3,2018-12-07,03:41,Samita,<Media omitted>,[],0
4,2018-12-07,05:31,Nutan Vbhs,*‼ॐ श्रीगुरुदेव दत्त‼* सर्व दरवाजे बंद झाले तर...,"[‼, ‼, ‼, ‼, 🙏🏻, 🌼, 🌼, 🙏🏻, 🌷, 🌷]",0
5,2018-07-13,03:50,Priti Vbhs,<Media omitted>,[],0
6,2018-07-13,04:58,Rakesh Nagarkar,💐💐💐🌹🌹🌹💐💐💐 *एखाद्या अडचणीत देवाने तुमचे लगेच ऐ...,"[💐, 💐, 💐, 🌹, 🌹, 🌹, 💐, 💐, 💐, 😴, 😴, 💐, 💐, 💐, 🌹, ...",0
7,2018-07-13,05:57,Rakesh Nagarkar,"नातू आजीला म्हणतो --""आजी,आजी तू नाटकात काम करत...","[😁, 😀, 😂, 🤣, 🤣, 🤣, 😃, 😃]",0
8,2018-07-13,15:15,Rakesh Nagarkar,"मला एक कळत नाही, संतूर साबणाच्या जाहिरातीत ल...","[😂, 😂, 😂, 😂, 😂, 😂, 😂]",0
9,2018-07-13,16:25,Milan Vbhs,*एक पिढी* *1971/1972/1973/1974/1975/1976/1977/...,"[🙏, 🙏]",0


In [258]:
media_messages_df = df[df['Message'] == '<Media omitted>']
messages_df = df.drop(media_messages_df.index)

In [259]:
messages_df['Letter_Count'] = messages_df['Message'].apply(lambda s : len(s))
messages_df['Word_Count'] = messages_df['Message'].apply(lambda s : len(s.split(' ')))

In [260]:
messages_df.head(50)

Unnamed: 0,Date,Time,Author,Message,emoji,urlcount,Letter_Count,Word_Count
0,2018-11-07,15:17,Rakesh Nagarkar,कॅंन्सर (आजिबात घाबरू नका ) हे नाव जरी ऐकलं त...,[],4,1800,280
1,2018-11-07,19:06,Samita,"स्त्रिने एकदा,शिवजिला प्रसन्न केले शिवजी—मी प...",[],0,509,91
2,2018-12-07,03:41,Samita,पैसै का दिखावा कितना मेंहगा पडा. दुल्हन 👰 और द...,[👰],0,257,49
4,2018-12-07,05:31,Nutan Vbhs,*‼ॐ श्रीगुरुदेव दत्त‼* सर्व दरवाजे बंद झाले तर...,"[‼, ‼, ‼, ‼, 🙏🏻, 🌼, 🌼, 🙏🏻, 🌷, 🌷]",0,399,61
6,2018-07-13,04:58,Rakesh Nagarkar,💐💐💐🌹🌹🌹💐💐💐 *एखाद्या अडचणीत देवाने तुमचे लगेच ऐ...,"[💐, 💐, 💐, 🌹, 🌹, 🌹, 💐, 💐, 💐, 😴, 😴, 💐, 💐, 💐, 🌹, ...",0,283,47
7,2018-07-13,05:57,Rakesh Nagarkar,"नातू आजीला म्हणतो --""आजी,आजी तू नाटकात काम करत...","[😁, 😀, 😂, 🤣, 🤣, 🤣, 😃, 😃]",0,251,45
8,2018-07-13,15:15,Rakesh Nagarkar,"मला एक कळत नाही, संतूर साबणाच्या जाहिरातीत ल...","[😂, 😂, 😂, 😂, 😂, 😂, 😂]",0,165,31
9,2018-07-13,16:25,Milan Vbhs,*एक पिढी* *1971/1972/1973/1974/1975/1976/1977/...,"[🙏, 🙏]",0,2964,486
13,2018-07-16,18:14,Samita,🤵🏻 *मैनेजमेंट लेसन:* एक दिन एक कुत्ता 🐕 जंगल ...,"[🤵🏻, 🐕, 🦁, ☠, 🐒, 😃]",0,1797,422
14,2018-07-16,18:26,Milan Vbhs,👌🏼,[👌🏼],0,2,1


In [261]:
# Creates a list of unique Authors - ['Manikanta', 'Teja Kura', .........]
l = messages_df.Author.unique()

for i in range(len(l)):
  # Filtering out messages of particular user
  req_df= messages_df[messages_df["Author"] == l[i]]
  # req_df will contain messages of only one particular user
  print(f'Stats of {l[i]} -')
  # shape will print number of rows which indirectly means the number of messages
  print('Messages Sent', req_df.shape[0])
  #Word_Count contains of total words in one message. Sum of all words/ Total Messages will yield words per message
  words_per_message = (np.sum(req_df['Word_Count']))/req_df.shape[0]
  print('Words per message', words_per_message)
  #media conists of media messages
  media = media_messages_df[media_messages_df['Author'] == l[i]].shape[0]
  print('Media Messages Sent', media)
  # emojis conists of total emojis
  emojis = sum(req_df['emoji'].str.len())
  print('Emojis Sent', emojis)
  #links consist of total links
  links = sum(req_df["urlcount"])   
  print('Links Sent', links)   
  print()

Stats of Rakesh Nagarkar -
Messages Sent 232
Words per message 54.060344827586206
Media Messages Sent 167
Emojis Sent 813
Links Sent 10

Stats of Samita -
Messages Sent 180
Words per message 32.916666666666664
Media Messages Sent 151
Emojis Sent 357
Links Sent 7

Stats of Nutan Vbhs -
Messages Sent 35
Words per message 55.17142857142857
Media Messages Sent 11
Emojis Sent 75
Links Sent 0

Stats of Milan Vbhs -
Messages Sent 40
Words per message 123.625
Media Messages Sent 20
Emojis Sent 206
Links Sent 1

Stats of Priti Vbhs -
Messages Sent 92
Words per message 5.923913043478261
Media Messages Sent 35
Emojis Sent 109
Links Sent 0

Stats of Varsha -
Messages Sent 75
Words per message 4.8133333333333335
Media Messages Sent 17
Emojis Sent 81
Links Sent 1

Stats of Ajit Vanjare -
Messages Sent 95
Words per message 9.305263157894737
Media Messages Sent 8
Emojis Sent 36
Links Sent 1

Stats of Datta Kharade -
Messages Sent 32
Words per message 24.75
Media Messages Sent 4
Emojis Sent 103
Links S

In [262]:
from collections import Counter
from typing import Dict

In [263]:
total_emojis_list = list([a for b in messages_df.emoji for a in b])

In [264]:
total_emojis_list

['👰',
 '‼',
 '‼',
 '‼',
 '‼',
 '🙏🏻',
 '🌼',
 '🌼',
 '🙏🏻',
 '🌷',
 '🌷',
 '💐',
 '💐',
 '💐',
 '🌹',
 '🌹',
 '🌹',
 '💐',
 '💐',
 '💐',
 '😴',
 '😴',
 '💐',
 '💐',
 '💐',
 '🌹',
 '🌹',
 '🌹',
 '💐',
 '💐',
 '💐',
 '😁',
 '😀',
 '😂',
 '🤣',
 '🤣',
 '🤣',
 '😃',
 '😃',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '🙏',
 '🙏',
 '🤵🏻',
 '🐕',
 '🦁',
 '☠',
 '🐒',
 '😃',
 '👌🏼',
 '😘',
 '❣',
 '❣',
 '😘',
 '🙏',
 '👏',
 '👏',
 '👌',
 '🍂',
 '🌺',
 '🌺',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😂',
 '😄',
 '😅',
 '😄',
 '🌺',
 '☘',
 '🌺',
 '🙏',
 '🌺',
 '☘',
 '🌺',
 '🌷',
 '☘',
 '🌷',
 '☘',
 '🌷',
 '☘',
 '🌷',
 '🌹',
 '☘',
 '🌹',
 '☘',
 '🌹',
 '☘',
 '🌹',
 '🙏',
 '🌹',
 '🌹',
 '🙏',
 '🙏',
 '🙏',
 '💐',
 '💐',
 '💐',
 '💐',
 '🙏',
 '🙏',
 '🙏🏽',
 '🙏🏽',
 '🙏',
 '🙏',
 '🙏',
 '🙏',
 '🙏',
 '🙏',
 '🌹',
 '🔰',
 '🔰',
 '🔰',
 '🔰',
 '🔰',
 '😄',
 '😄',
 '😄',
 '🤔',
 '🤔',
 '😇',
 '😇',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '🔛',
 '👣',
 '👣',
 '👣',
 '👣',
 '🤣',
 '🤣',
 '🤣',
 '👍',
 '👍',
 '👆',
 '👌',
 '👌',
 '🌹',
 '🌹',
 '👇',
 '👇',
 '👇',
 '👇',
 '👇',
 '👇',
 '👇',
 '👇',
 '🙏'

In [265]:
emoji_dict = Counter(total_emojis_list)

In [266]:
emoji_dict

Counter({'👰': 1,
         '‼': 6,
         '🙏🏻': 34,
         '🌼': 2,
         '🌷': 6,
         '💐': 120,
         '🌹': 52,
         '😴': 2,
         '😁': 20,
         '😀': 22,
         '😂': 138,
         '🤣': 82,
         '😃': 59,
         '🙏': 60,
         '🤵🏻': 1,
         '🐕': 3,
         '🦁': 1,
         '☠': 1,
         '🐒': 1,
         '👌🏼': 2,
         '😘': 17,
         '❣': 2,
         '👏': 16,
         '👌': 43,
         '🍂': 1,
         '🌺': 25,
         '😄': 28,
         '😅': 11,
         '☘': 34,
         '🙏🏽': 2,
         '🔰': 5,
         '🤔': 18,
         '😇': 6,
         '🔛': 11,
         '👣': 5,
         '👍': 28,
         '👆': 6,
         '👇': 13,
         '🌧': 3,
         '🌦': 1,
         '🌸': 7,
         '©': 1,
         '😊': 37,
         '🙏🏼': 4,
         '🎂': 61,
         '🍫': 28,
         '👍🏼': 1,
         '🍇': 11,
         '🍒': 11,
         '😡': 4,
         '🙂': 2,
         '👇🏻': 3,
         '🙄': 7,
         '😉': 11,
         '😜': 57,
         '😆': 20,
         '💃

In [267]:
emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)
emoji_df = pd.DataFrame(emoji_dict, columns=['emoji', 'count'])
emoji_df

Unnamed: 0,emoji,count
0,😂,138
1,💐,120
2,🤣,82
3,🎂,61
4,🙏,60
...,...,...
305,🧜🏻‍♀,1
306,🤨,1
307,🥴,1
308,💄,1


In [277]:
df_emoji = messages_df[['Author','emoji']]

In [278]:
df_emoji.head(50)

Unnamed: 0,Author,emoji
0,Rakesh Nagarkar,[]
1,Samita,[]
2,Samita,[👰]
4,Nutan Vbhs,"[‼, ‼, ‼, ‼, 🙏🏻, 🌼, 🌼, 🙏🏻, 🌷, 🌷]"
6,Rakesh Nagarkar,"[💐, 💐, 💐, 🌹, 🌹, 🌹, 💐, 💐, 💐, 😴, 😴, 💐, 💐, 💐, 🌹, ..."
7,Rakesh Nagarkar,"[😁, 😀, 😂, 🤣, 🤣, 🤣, 😃, 😃]"
8,Rakesh Nagarkar,"[😂, 😂, 😂, 😂, 😂, 😂, 😂]"
9,Milan Vbhs,"[🙏, 🙏]"
13,Samita,"[🤵🏻, 🐕, 🦁, ☠, 🐒, 😃]"
14,Milan Vbhs,[👌🏼]


In [283]:
#explode function to convert list to rows
df_emoji_1 = df_emoji.explode('emoji')

#Alternate way to convert list to rows
#(df_emoji['emoji'].apply(lambda x: pd.Series(x))
#            .stack()
#            .reset_index(level=1, drop=True)
#            .to_frame('emoji')
#            .join(df[['Author']], how='left')
#)

In [284]:
df_emoji_1.head(80)

Unnamed: 0,Author,emoji
0,Rakesh Nagarkar,
1,Samita,
2,Samita,👰
4,Nutan Vbhs,‼
4,Nutan Vbhs,‼
...,...,...
23,Samita,😂
23,Samita,😂
23,Samita,😄
23,Samita,😅


In [286]:
df_emoji_1.groupby(['Author','emoji'])['emoji'].count()

Author        emoji
Ajit Vanjare  ☀        1
              ☺        2
              ✨        2
              ⭐        1
              🌹        1
                      ..
Varsha        🙏🏼       2
              🤪        3
              🥃        1
              🥳        5
              🧁        1
Name: emoji, Length: 516, dtype: int64