**IMPORT DATA FILE**

In [None]:
#pie chart of chat breakdown

In [None]:
from google.colab import drive
drive.mount('/content/drive')
print("Done!")

**IMPORT LIBRARIES**

In [None]:
from collections import Counter
import pandas as pd
import re
import datetime
import numpy as np
import matplotlib.pyplot as plt
from dateutil import tz


print("Done!")

In [None]:
!pip install emoji --upgrade

In [None]:
from emoji import UNICODE_EMOJI
print('Done!')

In [None]:
with open('/content/drive/My Drive/Colab Notebooks/Data/_chat.txt', 'r', encoding = 'utf-8') as file:
    data_lines = [line.strip() for line in file.read().splitlines()]
    data = pd.DataFrame(
        re.findall(r'\[(.*?),\s(.*?)]\s*([^:]+):\s*(.*)', '\n'.join(data_lines)),
        columns=['Date', 'Time', 'Author', 'Message']
    )
    
    print("Done!")

Done!


**Process documents**

In [None]:
documents_list = data['Message'].apply(lambda x: str(re.findall('.*document omitted', x)))
documents_list = documents_list.to_frame()
documents = documents_list['Message'].unique()
documents = [doc for doc in documents if len(doc) > 2]
documents = [doc.split("\\u200")[0].strip()[2:] for doc in documents]

**Big Analysis**

In [None]:
number_of_messages = data.shape[0]
number_of_pictures = data[data['Message'] == '‎image omitted'].shape[0]
number_of_videos = data[data['Message'] == '‎video omitted'].shape[0]
number_of_stickers = data[data['Message'] == '‎sticker omitted'].shape[0]
number_of_GIFs = data[data['Message'] == '‎GIF omitted'].shape[0]
number_of_documents = len(documents)
number_of_deleted_messages = data[data['Message'] == "‎This message was deleted."].shape[0] + data[data['Message'] == '‎You deleted this message.'].shape[0]

start_date = datetime.datetime.strptime(data['Date'][0], "%d/%m/%y").date()
end_date = datetime.datetime.strptime(data['Date'][number_of_messages - 1], "%d/%m/%y").date()

duration = end_date - start_date
duration = duration.days
days_chatted = data['Date'].unique().shape[0]
print('Chat data start and end dates:', start_date, ':', end_date, '(yyyy-mm-dd)')
print('Chat duration is: ', duration, 'days')
print('Active days:', days_chatted, '/', duration, 'or {:.2f}%'.format(days_chatted/duration*100) ,'days of duration')

print("\nThere are {} participants in this chat: ".format(data['Author'].unique().size))
names = [name for name in data['Author'].unique()]
for name in names:
    print(name)

df1 = data[data['Author'] == names[0]]
df2 = data[data['Author'] == names[1]]

number_text_message = number_of_messages - number_of_pictures - number_of_GIFs - number_of_videos - number_of_stickers - number_of_documents
number_text_message

print('\nThey sent a total of {:,} messages:'.format(number_of_messages))
print('\n{:,} of which were text messages, or {:.2f}% of the total {} messages.'.format(number_text_message,number_text_message/number_of_messages*100, number_of_messages))
print('{} of which were pictures, or {:.3f}% of the total {} messages.'.format(number_of_pictures,number_of_pictures/number_of_messages*100, number_of_messages))
print('{} of which were videos, or {:.3f}% of the total {} messages.'.format(number_of_videos,number_of_videos/number_of_messages*100, number_of_messages))
print('{} of which were GIFs, or {:.3f}% of the total {} messages.'.format(number_of_GIFs,number_of_GIFs/number_of_messages*100, number_of_messages))
print('{} of which were stickers, or {:.3f}% of the total {} messages.'.format(number_of_stickers,number_of_stickers/number_of_messages*100, number_of_messages))
print('{} of which were documents, or {:.3f}% of the total {} messages.'.format(number_of_documents,number_of_documents/number_of_messages*100, number_of_messages))
print('\n===The documents were:===')
for doc in documents:
    print(doc)

print('\nA total of {} messages sent were deleted, or {:.2f}% of the total {} messages.'.format(number_of_deleted_messages, number_of_deleted_messages/number_of_messages * 100, number_of_messages))

**Calculate number of words sent**

In [None]:
data['Number of Words'] = data['Message'].apply(lambda x: len(x.split()))
number_of_words = data['Number of Words'].sum()

document_words = [len(doc.split()) for doc in documents]
document_words = sum(document_words)

number_of_words = number_of_words - (number_of_pictures*2) - (number_of_GIFs*2) - (number_of_videos*2) - (number_of_stickers*2) - (number_of_documents*2) - document_words

print('They sent a total of {:,} words, for an average of {:.2f} words per (text only) message.'.format(number_of_words, number_of_words/number_text_message))


most_common_messages = pd.DataFrame(data.Message.value_counts())
most_common_messages.drop(['‎This message was deleted.','‎image omitted','‎You deleted this message.', '‎video omitted', '‎GIF omitted'], inplace=True)

print("\nThe 50 most common messages were:")
most_common_messages.head(50) #this is most common message

In [None]:
words = Counter()
data['Message'].str.lower().str.split().apply(words.update)

print("Top 100 most commonly used words:")
words.most_common(100)

**Get Emojis**

In [760]:
def get_emojis(string):
    return ''.join([ch for ch in string if ch in UNICODE_EMOJI])


emojis = Counter(data['Message'].apply(lambda x: get_emojis(x)).sum())
emojis = {k: v for k,v in sorted(emojis.items(), key=lambda item: item[1], reverse=True)}
print('There are {} unique emojis used in this chat.'.format(len(emojis)))
print('A total of {} emojis were used.'.format(sum(emojis.values())))

print('These are all the emojis used, sorted in reverse ascending order:\n')
for emoji,number in emojis.items():
    print(emoji, number)

There are 178 unique emojis used in this chat.
A total of 9220 emojis were used.
These are all the emojis used, sorted in reverse ascending order:

😂 7548
😭 234
♀ 155
🤦 122
🏼 102
👋 84
🏻 77
😒 59
🥺 53
😴 52
🤷 39
🤪 38
❤ 34
😊 33
😅 30
😢 29
😞 25
😑 21
👏 19
😓 18
☀ 17
😨 16
♂ 15
🎉 15
😤 13
☺ 13
🎊 13
😁 12
😬 12
🤣 10
😈 10
👍 10
🍪 9
😖 8
🤮 8
👌 7
😍 7
🚬 7
😏 7
😄 6
💤 6
😐 6
😔 6
🤎 5
🙋 5
💔 5
😪 4
🙄 4
😡 4
😎 4
💓 4
🙈 4
🇺 4
🇸 4
😇 3
😌 3
😚 3
😘 3
🐣 3
🥰 3
😠 3
⬆ 3
‼ 3
🥳 3
🎈 3
🍡 2
💰 2
🦠 2
🍳 2
🙃 2
🥴 2
😉 2
✨ 2
🙇 2
🕛 2
🦥 2
🍯 2
♾ 2
😶 2
😣 2
🤝 2
🤭 2
💜 2
😃 2
😱 2
🤐 2
☹ 2
👎 1
🧀 1
🕉 1
🍰 1
🇨 1
🇦 1
🦷 1
👐 1
🍹 1
🧠 1
⭐ 1
👀 1
🥨 1
🐊 1
🙁 1
😗 1
😙 1
🔥 1
🧐 1
😮 1
🌬 1
☃ 1
🙅 1
😧 1
💦 1
🔫 1
☎ 1
🍾 1
🕧 1
🕐 1
🕜 1
🕑 1
🕝 1
🕒 1
🕞 1
🕓 1
🕟 1
🕔 1
🕠 1
🕕 1
🕡 1
🕖 1
🕢 1
🕗 1
🕣 1
🕘 1
🕤 1
🕙 1
🕥 1
🕚 1
🕦 1
🤗 1
🏫 1
🛑 1
🚀 1
🇪 1
🇬 1
🗼 1
🦄 1
🐒 1
🚿 1
🌊 1
🤖 1
🧟 1
🐦 1
🎨 1
🗺 1
👩 1
🦰 1
🍂 1
☕ 1
❇ 1
🧸 1
🥶 1
🦋 1
🦚 1
❗ 1
🦅 1
😵 1
🧇 1
🥱 1
🦧 1
😫 1
🤢 1
🥜 1
😩 1
😋 1
🎩 1
📺 1
🤔 1
🤤 1


In [None]:
data_as_string = data['Message'].str

avg_char = data_as_string.len().mean()
characters = data_as_string.len().sum() - 77 #fix 

doc_lengths = [len(doc) for doc in documents]
chars_in_docs = 0
for i in doc_lengths:
    chars_in_docs += i
characters -= chars_in_docs

print('A total of {:,} characters were sent'.format(characters))
print('Each message sent had an average of {:.2f} characters'.format(avg_char))

def get_chars(string):
    return ''.join([ch for ch in string if ch not in UNICODE_EMOJI])

letters = Counter(data['Message'].apply(lambda x: get_chars(x)).sum())
letters = {k: v for k,v in sorted(letters.items(), key=lambda item: item[1], reverse=True)}
print('There are {} unique characters used in this chat.'.format(len(letters)))

print('These are all the characters used, sorted in reverse ascending order:\n')
for emoji,number in letters.items():
    print(emoji, number)

**Person 1 Analysis**

In [None]:
documents_list_df1 = df1['Message'].apply(lambda x: str(re.findall('.*document omitted', x)))
documents_list_df1 = documents_list_df1.to_frame()
documents_df1 = documents_list_df1['Message'].unique()
documents_df1 = [doc for doc in documents_df1 if len(doc) > 2]
documents_df1 = [doc.split("\\u200")[0].strip()[2:] for doc in documents_df1]

number_of_messages_df1 = df1.shape[0]
number_of_pictures_df1 = df1[df1['Message'] == '‎image omitted'].shape[0]
number_of_videos_df1 = df1[df1['Message'] == '‎video omitted'].shape[0]
number_of_stickers_df1 = df1[df1['Message'] == '‎sticker omitted'].shape[0]
number_of_GIFs_df1 = df1[df1['Message'] == '‎GIF omitted'].shape[0]
number_of_documents_df1 = len(documents_df1)
number_of_deleted_messages_df1 = df1[df1['Message'] == '‎This message was deleted.'].shape[0] +  df1[df1['Message'] == '‎You deleted this message.'].shape[0]

number_text_message_df1 = number_of_messages_df1 - number_of_pictures_df1 - number_of_GIFs_df1 - number_of_videos_df1 - number_of_stickers_df1 - number_of_documents_df1

print(names[0], 'has sent: {} messages, or {:.2f}% of the total {} messages.'.format(number_of_messages_df1, number_of_messages_df1/number_of_messages*100, data.shape[0]))
print('\n{:,} of which were text messages, or {:.2f}% of the total {} text messages.'.format(number_text_message_df1, number_of_messages_df1/number_text_message*100, number_text_message))
print('{} of which were pictures, or {:.2f}% of the total {} pictures.'.format(number_of_pictures_df1, number_of_pictures_df1/number_of_pictures*100, number_of_pictures))

print('{} of which were videos, or {:.2f}% of the total {} videos.'.format(number_of_videos_df1, number_of_videos_df1/number_of_videos*100, number_of_videos))
print('{} of which were GIFs, or {:.2f}% of the total {} GIFs.'.format(number_of_GIFs_df1, number_of_GIFs_df1/number_of_GIFs*100, number_of_GIFs))
print('{} of which were stickers, or {:.2f}% of the total {} stickers.'.format(number_of_stickers_df1, number_of_stickers_df1/number_of_stickers*100, number_of_stickers))
print('{} of which were documents, or {:.2f}% of the total {} documents.'.format(number_of_documents_df1, number_of_documents_df1/number_of_documents*100, number_of_documents))
print('\n===The documents were:===')
for doc in documents_df1:
    print(doc)
print("")
print(names[0],'deleted {} messages, or {:.2f}% of the total {} deleted messages.'.format(number_of_deleted_messages_df1, number_of_deleted_messages_df1/number_of_deleted_messages * 100, number_of_deleted_messages))

number_of_words_df1 = df1['Number of Words'].sum()

document_words_df1 = [len(doc.split()) for doc in documents_df1]
document_words_df1 = sum(document_words_df1)

number_of_words_df1 = number_of_words_df1 - (number_of_pictures_df1*2) - (number_of_GIFs_df1*2) - (number_of_videos_df1*2) - (number_of_stickers_df1*2) - (number_of_documents_df1*2) - document_words_df1

print("")
print(names[0], 'has sent: {:,} words, or {:.2f}% of the total {} words.'.format(number_of_words_df1, number_of_words_df1/number_of_words*100, number_of_words))
print(names[0], 'sends about {:.2f} words per message.'.format(number_of_words_df1/number_text_message_df1))

data_as_string_df1 = df1['Message'].str

avg_char_df1 = data_as_string_df1.len().mean()
characters_df1 = data_as_string_df1.len().sum() - 77 #fix

doc_lengths_df1 = [len(doc) for doc in documents_df1]
chars_in_docs_df1 = 0
for i in doc_lengths_df1:
    chars_in_docs_df1 += i
characters_df1 -= chars_in_docs_df1

print("")
print(names[0],'sent a total of {:,} characters, or {:.2f}% of the total {} characters.'.format(characters_df1, characters_df1/characters*100, characters))
print('Each message sent had an average of {:.2f} characters'.format(avg_char_df1))

In [None]:
most_common_messages_df1 = pd.DataFrame(df1.Message.value_counts())
most_common_messages_df1.drop(['‎This message was deleted.','‎image omitted', '‎video omitted', '‎GIF omitted', '‎sticker omitted'], inplace=True)

print("\nThe 25 most common messages for {} were:".format(names[0]))
most_common_messages_df1.head(25)

In [None]:
words_df1 = Counter()
df1['Message'].str.lower().str.split().apply(words_df1.update)

print("Top 50 most commonly used words for {}:".format(names[0]))
words_df1.most_common(50)

In [None]:
emojis_df1 = Counter(df1['Message'].apply(lambda x: get_emojis(x)).sum())
emojis_df1 = {k: v for k,v in sorted(emojis_df1.items(), key=lambda item: item[1], reverse=True)}
print(names[0],'used {} unique emojis in this chat.'.format(len(emojis_df1)))
print(names[0], 'used a total of {} emojis'.format(sum(emojis_df1.values())))

print('These are all the emojis used, sorted in reverse ascending order:\n')
for emoji,number in emojis_df1.items():
    print(emoji, number)

**Person 2 Analysis**

In [None]:
documents_list_df2 = df2['Message'].apply(lambda x: str(re.findall('.*document omitted', x)))
documents_list_df2 = documents_list_df2.to_frame()
documents_df2 = documents_list_df2['Message'].unique()
documents_df2 = [doc for doc in documents_df2 if len(doc) > 2]
documents_df2 = [doc.split("\\u200")[0].strip()[2:] for doc in documents_df2]

number_of_messages_df2 = df2.shape[0]
number_of_pictures_df2 = df2[df2['Message'] == '‎image omitted'].shape[0]
number_of_videos_df2 = df2[df2['Message'] == '‎video omitted'].shape[0]
number_of_stickers_df2 = df2[df2['Message'] == '‎sticker omitted'].shape[0]
number_of_GIFs_df2 = df2[df2['Message'] == '‎GIF omitted'].shape[0]
number_of_documents_df2 = len(documents_df2)
number_of_deleted_messages_df2 = df2[df2['Message'] == '‎This message was deleted.'].shape[0] + df2[df2['Message'] == '‎You deleted this message.'].shape[0]


number_text_message_df2 = number_of_messages_df2 - number_of_pictures_df2 - number_of_GIFs_df2 - number_of_videos_df2 - number_of_stickers_df2 - number_of_documents_df2

print(names[1], 'has sent: {} messages, or {:.2f}% of the total {} messages.'.format(number_of_messages_df2, number_of_messages_df2/number_of_messages*100, data.shape[0]))
print('\n{:,} of which were text messages, or {:.2f}% of the total {} text messages.'.format(number_text_message_df2, number_of_messages_df2/number_text_message*100, number_text_message))
print('{} of which were pictures, or {:.2f}% of the total {} pictures.'.format(number_of_pictures_df2, number_of_pictures_df2/number_of_pictures*100, number_of_pictures))

print('{} of which were videos, or {:.2f}% of the total {} videos.'.format(number_of_videos_df2, number_of_videos_df2/number_of_videos*100, number_of_videos))
print('{} of which were GIFs, or {:.2f}% of the total {} GIFs.'.format(number_of_GIFs_df2, number_of_GIFs_df2/number_of_GIFs*100, number_of_GIFs))
print('{} of which were stickers, or {:.2f}% of the total {} stickers.'.format(number_of_stickers_df2, number_of_stickers_df2/number_of_stickers*100, number_of_stickers))
print('{} of which were documents, or {:.2f}% of the total {} documents.'.format(number_of_documents_df2, number_of_documents_df2/number_of_documents*100, number_of_documents))
print('\n===The documents were:===')
for doc in documents_df2:
    print(doc)
print("")
print(names[1],'deleted {} messages, or {:.2f}% of the total {} deleted messages.'.format(number_of_deleted_messages_df2, number_of_deleted_messages_df2/number_of_deleted_messages * 100, number_of_deleted_messages))

number_of_words_df2 = df2['Number of Words'].sum()

document_words_df2 = [len(doc.split()) for doc in documents_df2]
document_words_df2 = sum(document_words_df2)

number_of_words_df2 = number_of_words_df2 - (number_of_pictures_df2*2) - (number_of_GIFs_df2*2) - (number_of_videos_df2*2) - (number_of_stickers_df2*2) - (number_of_documents_df2*2) - document_words_df2

print("")
print(names[1], 'has sent: {:,} words, or {:.2f}% of the total {} words.'.format(number_of_words_df2, number_of_words_df2/number_of_words*100, number_of_words))
print(names[1], 'sends about {:.2f} words per message.'.format(number_of_words_df2/number_text_message_df2))

data_as_string_df2 = df2['Message'].str

avg_char_df2 = data_as_string_df2.len().mean()
characters_df2 = data_as_string_df2.len().sum() - 77 #fix

doc_lengths_df2 = [len(doc) for doc in documents_df2]
chars_in_docs_df2 = 0
for i in doc_lengths_df2:
    chars_in_docs_df2 += i
characters_df2 -= chars_in_docs_df2

print("")
print(names[1],'sent a total of {:,} characters, or {:.2f}% of the total {} characters.'.format(characters_df2, characters_df2/characters*100, characters))
print('Each message sent had an average of {:.2f} characters'.format(avg_char_df2))

In [None]:
most_common_messages_df2 = pd.DataFrame(df2.Message.value_counts())
most_common_messages_df2.drop(['‎You deleted this message.','‎image omitted', '‎video omitted', '‎GIF omitted', '‎sticker omitted'], inplace=True)

print("\nThe 25 most common messages for {} were:".format(names[1]))
most_common_messages_df2.head(25)

In [None]:
words_df2 = Counter()
df2['Message'].str.lower().str.split().apply(words_df2.update)

print("Top 50 most commonly used words for {}:".format(names[1]))
words_df2.most_common(50)

In [None]:
emojis_df2 = Counter(df2['Message'].apply(lambda x: get_emojis(x)).sum())
emojis_df2 = {k: v for k,v in sorted(emojis_df2.items(), key=lambda item: item[1], reverse=True)}
print(names[1],'used {} unique emojis in this chat.'.format(len(emojis_df2)))
print(names[1], 'used a total of {} emojis'.format(sum(emojis_df2.values())))

print('These are all the emojis used, sorted in reverse ascending order:\n')
for emoji,number in emojis_df2.items():
    print(emoji, number)

**VISUALIZATIONS**

In [None]:
data['Message Length'] = data['Message'].str.len()

In [None]:
data['Date'] = [datetime.datetime.strptime(date, "%d/%m/%y").date() for date in data['Date']]

In [None]:
data['Time'] = [datetime.datetime.strptime(time, "%I:%M:%S %p").time() for time in data['Time']]

In [None]:
data.groupby(['Date']).size().plot(figsize=(25,6), title='Number Of Messages Each Day')

In [None]:
min_messages_date = data.groupby(['Date']).size().idxmin()
min_messages = data.groupby(['Date']).size().min()
print('On',min_messages_date,'was the lowest number of messages sent, which was:',min_messages)

max_messages_date = data.groupby(['Date']).size().idxmax()
max_messages = data.groupby(['Date']).size().max()
print('On',max_messages_date,'was the highest number of messages sent, which was:',max_messages)

In [None]:
data.groupby('Date')['Number of Words'].sum().plot(figsize=(25,6), title='Total Number Of Words per Day')

In [None]:
data.groupby('Date')['Number of Words'].mean().plot(figsize=(25,6), title='Average Number Of Words per Message per Day')

In [None]:
data.groupby('Date')['Message Length'].sum().plot(figsize=(25,6), title='Total Number Of Characters per Day')

In [None]:
data.groupby('Date')['Message Length'].mean().plot(figsize=(25,6), title="Average Number of Characters per Message per Day")

In [None]:
data['Reply Time'] = 0
for i in range(len(data)-2):
    data['Reply Time'][i+1] = datetime.datetime.combine(data['Date'][i+1],data['Time'][i+1]) - datetime.datetime.combine(data['Date'][i],data['Time'][i])


In [None]:
data.groupby(['Date','Author']).count()['Message'].unstack().plot(figsize=(25,6), title=names[0]+' vs '+names[1]+' in number of messages sent per Day')

In [None]:
data.groupby(['Date','Author'])['Number of Words'].sum().unstack().plot(figsize=(25,6), title=names[0]+' vs '+names[1]+' in number of words sent per Day')

In [None]:
data.groupby(['Date','Author'])['Message Length'].mean().unstack().plot(figsize=(25,6), title=names[0]+' vs '+names[1]+' in average message length per Day')

In [None]:
data['Reply Time'][0] = data['Reply Time'][1]
data['Reply Time'][len(data)-1] = data['Reply Time'][1]
data['Reply Time'] = data['Reply Time'].apply(lambda x: x.total_seconds())

In [None]:
data['Convo ID'] = (data['Reply Time'] > 1200).cumsum().fillna(0).astype(int) + 1

print('There are '+str(data['Convo ID'].max())+' unique conversations.')

In [None]:
data.groupby(['Convo ID']).count()['Message'].plot(figsize=(25,6),title="Number of messages per conversation")

In [None]:
data.groupby(['Convo ID'])['Number of Words'].mean().plot(figsize=(25,6),title="Average Number of words per message per conversation")

In [None]:
data.groupby(['Convo ID'])['Message Length'].mean().plot(figsize=(25,6),title="Average Length of Message per conversation")

In [571]:
xc = data[data['Convo ID'].between(200,250)]
record = xc[xc['Message Length'] == 2887]

In [None]:
print('This is that super long ass message:')
record

In [None]:
print("Your reply to all that effort :(")
xc[xc['Convo ID'] == 230].iloc[0].Message

In [584]:
print('Average reply time for '+names[0]+' is {:.2f}'.format(data[data['Author'] == names[0]]['Reply Time'].mean()))
print('Average reply time for '+names[1]+' is {:.2f}'.format(data[data['Author'] == names[1]]['Reply Time'].mean()))

print('\n^^ This is not representative of the TRUE reply time, as it depends on how quickly replies are sent after a convo is over')

Average reply time for Shaw 🐣 is 579.51
Average reply time for Preetika Sastry is 493.79

^^ This is not representative of the TRUE reply time, as it depends on how quickly replies are sent after a convo is over


In [None]:
data.groupby('Convo ID')['Reply Time'].mean().div(1000).plot(figsize=(25,6), title='Average Reply Time in Seconds per Converation')

In [None]:
data.groupby(['Convo ID','Author'])['Reply Time'].mean().div(1000).unstack().plot(figsize=(25,6), title='Average Reply Time in Seconds per Converation')

In [587]:
data['Day of Week'] = data.Date.apply(lambda x: x.weekday())


In [None]:
data.sort_values(by=['Day of Week']).groupby('Day of Week').count()['Message'].plot(figsize=(25,6), ylabel='Number of Messages', title='Day of Week vs Number of Messages').set_xticklabels(['','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])

In [589]:
data['Time2'] = data.Time.apply(lambda x: x.replace(second=0))
data['Time3'] = data.Time.apply(lambda x: x.replace(minute=0, second=0))

In [None]:
data.groupby(['Time2']).count()['Message'].plot(figsize=(25,6), ylabel='Number of Messages', title='Time of Day (Minutes) vs Number of Messages')

In [None]:
data.groupby(['Time3']).count()['Message'].plot(figsize=(25,6), ylabel='Number of Messages', title='Time of Day (Hours) vs Number of Messages')

In [None]:
print(names[0]+' has started {} conversations'.format(data.groupby('Convo ID').first()['Author'].value_counts()[0]))
print(names[1]+' has started {} conversations'.format(data.groupby('Convo ID').first()['Author'].value_counts()[1]))
print("")
print(names[0]+' said "bye" last in {} conversations'.format(data.groupby('Convo ID').last()['Author'].value_counts()[0]))
print(names[1]+' said "bye" last in {} conversations'.format(data.groupby('Convo ID').last()['Author'].value_counts()[1]))

In [None]:
convo_starts = data.groupby('Convo ID').first()
convo_ends = data.groupby('Convo ID').last()
convo_dur = pd.concat([convo_starts,convo_ends]).drop_duplicates(keep=False).sort_values(['Date','Time']).reset_index()


convo_dur['Convo Time'] = 0
for i in range(len(convo_dur)-2):
    convo_dur['Convo Time'][i+1] = datetime.datetime.combine(convo_dur['Date'][i+1],convo_dur['Time'][i+1]) - datetime.datetime.combine(convo_dur['Date'][i],convo_dur['Time'][i])
convo_dur.drop([len(convo_dur)-1,len(convo_dur)-2], inplace=True)

convo_dur = convo_dur.iloc[1::2]

In [726]:
max_time = convo_dur['Convo Time'].max().seconds
print('Longest time chatted is: {}h {}m {}s'.format(max_time//3600,(max_time%3600)//60, max_time%60))

Longest time chatted is: 4h 19m 57s


In [None]:
avg_chat_length = sum(convo_dur['Convo Time'], datetime.timedelta(0)) / len(convo_dur['Convo Time'])
print('Average chat length is: {}h {}m {}s'.format(avg_chat_length.seconds//3600,(avg_chat_length.seconds%3600)//60, avg_chat_length.seconds%60))

print("\n^^ This is a naive calculation, and not truly representative of the true mean")

In [None]:
data

In [None]:
drive.flush_and_unmount()