In [19]:
from pathlib import Path
import csv
import pandas as pd
from datetime import datetime

In [20]:
# loading the data

notebook_dir = Path.cwd()
# the data is located in data/raw folder
data_dir = notebook_dir.parent.absolute() / 'data_raw' 
csv_dir = notebook_dir.parent.absolute() / 'clean_csvs' 

#convert txt of instagram texts to strings
path_raw = data_dir / 'aarush_messenger.txt'
path_csv_grouped = csv_dir / 'messages_grouped_by_user' / 'aarush-messenger-grouped.csv'
path_csv_separate = csv_dir / 'messages_separate' / 'aarush-messenger-separate.csv'

In [21]:
with open(path_raw) as fp:
    messenger_data = fp.read()


In [22]:
# remove the beginning part of the txt file
beginIndex = messenger_data.find('\n\n\n')
messenger_data = messenger_data[beginIndex + 3:]
messenger_data = messenger_data.strip()



In [23]:
# mark the usernames with a special key
special_key = 'Nnọọ' #when would we ever say hello in Igbo in our chat? it's more likely than you think
messenger_data = messenger_data.replace('\nAarush Agte', '\n' + special_key + 'Aarush Agte')
messenger_data = messenger_data.replace('\nSamyukta Athreya', '\n' + special_key + 'Samyukta Athreya')

In [24]:
while messenger_data.find('\n\n') != -1:
    messenger_data = messenger_data.replace('\n\n', '\n')

In [25]:
filename = "output.txt"

# Open a file in write mode ('w') and write the string to it
with open(filename, 'w') as file:
    file.write(messenger_data)

print(f"String has been written to {filename}")

String has been written to output.txt


In [26]:
def getWordCount(messageLine):
    messageLine = messageLine.strip()
    return(len(messageLine.split(' ')))

In [27]:
# figuring out how to read the messenger data
#columns for csv:
#'date', 'user', 'text', 'word count', 'number of separated texts', 'reactions', 'platform'
#type of reply options: story reply, reel, normal message

users = {'Samyukta Athreya', 'Aarush Agte'}
months = {'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'}
user, date, text, word_count, num_sep_texts, reactions = '', '', '', '', '', ''

message_dict = []

for message_block in messenger_data.split(special_key): #split data by user
    for line in message_block.splitlines():

        line = line.strip()

        #check if it's a user line
        if line == 'Samyukta Athreya':
            user = 'Samyukta'
        elif line == 'Aarush Agte':
            user = 'Aarush'

        #check if it's a date line
        elif line[:3] in months or line[1:4] in months:
            date = line
        
        #check if it's a reaction line
        elif line.find('Samyukta Athreya') != -1 or line.find('Aarush Agte') != -1:
            nameIndex = line.find('Samyukta Athreya')
            if nameIndex == -1:
                nameIndex = line.find('Aarush Agte')
            reactions = line[:nameIndex]
        
        else:
            text = line
            word_count = getWordCount(line)
    res = {
        'date' : date, 
        'user' : user, 
        'text' : text, 
        'word count' : word_count, 
        'number of separated texts' : num_sep_texts, 
        'reactions' : reactions,  
        'platform' : 'Messenger'
        }
    user, date, text, word_count, num_sep_texts, reactions = '', '', '', '', '', ''
    message_dict.append(res)
print(message_dict)



In [28]:
#save dictionary as csv
aarushMsngrCSV = open(path_csv_grouped, 'w')
writer = csv.writer(aarushMsngrCSV)
writer.writerow(['date', 'user', 'text', 'word count', 'number of separated texts', 'reactions', 'platform'])
for dictionary in message_dict:
    writer.writerow(dictionary.values())
aarushMsngrCSV.close()
aarushMsngrCSV = open(path_csv_grouped, 'r')
print("The content of the csv file is:")
print(aarushMsngrCSV.read())
aarushMsngrCSV.close()

The content of the csv file is:
date,user,text,word count,number of separated texts,reactions,platform

"Jan 12, 2023 2:18:34pm",Aarush,,,,,Messenger

"Jan 12, 2023 1:58:31pm",Samyukta,SH i connected the dots,5,,,Messenger

"Jan 12, 2023 1:52:42pm",Aarush,THATS WHY IT WAS FUNNY,5,,,Messenger

"Jan 12, 2023 1:52:37pm",Aarush,like string theory,3,,,Messenger

"Jan 12, 2023 1:52:34pm",Aarush,OMG UNRAVEL,2,,,Messenger

"Jan 12, 2023 1:51:21pm",Samyukta,ALSO HE IS RISEN IS JUST EASTER,7,,,Messenger

"Jan 12, 2023 1:34:59pm",Samyukta,This is killing me -John H,6,,,Messenger

"Jan 12, 2023 1:34:51pm",Samyukta,,,,,Messenger

"Jan 12, 2023 1:34:44pm",Samyukta,My discord isn’t working,4,,,Messenger

"Mar 03, 2022 7:48:55am",Aarush,,,,❤,Messenger

"Mar 03, 2022 7:47:50am",Aarush,,1,,❤,Messenger

"Feb 18, 2022 10:26:51am",Samyukta,https://colonist.io/#Yhp2,1,,,Messenger

"Feb 17, 2022 4:27:05pm",Aarush,ur gay,2,,,Messenger


"Feb 17, 2022 2:34:46pm",Aarush,ur so gay,3,,,Messenger

"Feb 17, 2022 2:

In [29]:
msngr_df = pd.read_csv(path_csv_grouped)
msngr_df.drop(index=msngr_df.index[0], axis=0, inplace=True) 

print(msngr_df.duplicated().sum())
#drop duplicates
msngr_df.drop_duplicates(inplace = True)

#make sure there are no duplicates left
print(msngr_df.duplicated().sum())

5
0


In [30]:
# Function to convert date format
def convert_date_format(date_str):
    try:
        # Convert the date string to a datetime object
        dt = pd.to_datetime(date_str, format='%b %d, %Y %I:%M:%S%p')
        # Convert datetime object to the desired format
        return dt.strftime('%Y-%m-%d %H:%M:%S')
    except ValueError:
        # If the date format is incorrect or not parseable, return the original string
        return date_str

# Apply the conversion function to the 'date' column
msngr_df['date'] = msngr_df['date'].apply(convert_date_format)


In [31]:
print(msngr_df.columns)
#date,user,text,word count,reactions,platform
df = msngr_df[['date', 'user', 'text', 'word count', 'reactions', 'platform']]
print(df.duplicated().sum())
df.to_csv(path_csv_separate)

Index(['date', 'user', 'text', 'word count', 'number of separated texts',
       'reactions', 'platform'],
      dtype='object')
0


In [32]:
# get number of separated texts

#'date', 'user', 'text', 'word count', 'number of separated texts', 'reactions', 'game pigeon', 'platform'
#group all the texts from one user that are next to each other in the csv. 
#the text column should have all the texts from that user combined into one string and separated by \n
#the number of separated texts column should have the number of rows that were merged

# Function to combine consecutive messages from the same user
def combine_consecutive_messages(df):
    # Create a column to mark groups of consecutive rows with the same user
    df['group'] = (df['user'] != df['user'].shift()).cumsum()
    
    # Group by 'user' and 'group', and aggregate the texts and count of messages
    result_df = df.groupby(['user', 'group']).agg({
        'text': lambda x: '\n'.join(x),  # Combine texts with newline separator
        'user': 'size'  # Count number of rows merged
    }).rename(columns={'user': 'number of separated texts'})
    
    # Reset index to convert 'user' and 'group' to columns
    result_df = result_df.reset_index()
    
    # Drop the 'group' column as it is no longer needed
    result_df = result_df.drop(columns=['group'])
    
    return result_df
msngr_df['text'] = msngr_df['text'].fillna('')
msngr_df['group'] = (msngr_df['user'] != msngr_df['user'].shift()).cumsum()
msngr_df['number of separated texts'] = msngr_df['group']
result_df = msngr_df.groupby(['user', 'group']).agg({
    'date' : 'first',
    'text': lambda x: '\n'.join(x),  # Combine texts with newline separator
    'word count' : 'sum',
    'user': 'size',  # Count number of rows merged,
    'reactions' : 'first',
    'platform' : 'first'
}).rename(columns={'user': 'number of separated texts'})

#['date', 'user', 'text', 'word count', 'number of separated texts', 'reactions', 'type of reply', 'platform']

# Reset index to convert group columns back to regular columns
result_df = result_df.reset_index()
result_df = result_df.sort_values(by='date')
result_df = result_df.drop('group', axis = 1)

In [33]:
print(result_df.head())

         user                 date  \
420  Samyukta  2019-09-19 18:39:34   
209    Aarush  2021-01-27 20:32:47   
419  Samyukta  2021-01-27 21:23:26   
208    Aarush  2021-01-27 21:31:40   
418  Samyukta  2021-01-28 17:16:26   

                                                  text  word count  \
420                 You are now connected on Messenger         6.0   
209       hey,, i don't know u lol,, let's be friends?         9.0   
419  omg are you like worldstar famous animator aar...         9.0   
208                               r u the shupershammy         4.0   
418                               omg how did you know         5.0   

     number of separated texts reactions   platform  
420                          1      None  Messenger  
209                          1      None  Messenger  
419                          1      None  Messenger  
208                          1      None  Messenger  
418                          1      None  Messenger  


In [34]:
#swap the order of columns so that date comes first
new_column_order = ['date', 'user'] + [col for col in result_df.columns if col not in ['date', 'user']]

# Reorder DataFrame columns
result_df = result_df[new_column_order]


In [35]:
print(result_df.tail())

                    date      user  \
211  2023-01-12 13:51:21  Samyukta   
0    2023-01-12 13:52:42    Aarush   
210  2023-01-12 13:58:31  Samyukta   
111                 None    Aarush   
390                 None  Samyukta   

                                                  text  word count  \
211  ALSO HE IS RISEN IS JUST EASTER\nThis is killi...        17.0   
0    THATS WHY IT WAS FUNNY\nlike string theory\nOM...        10.0   
210                            SH i connected the dots         5.0   
111                                               good         1.0   
390  why is there no fire emoji on messenger that i...        12.0   

     number of separated texts reactions   platform  
211                          4      None  Messenger  
0                            3      None  Messenger  
210                          1      None  Messenger  
111                          1      None  Messenger  
390                          1      None  Messenger  


In [36]:
result_df.to_csv(path_csv_grouped)