-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessor.py
174 lines (100 loc) · 5.86 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
'''Importing Libraries'''
import re
import pandas as pd
# --- Exploratory walkthrough -------------------------------------------------
# Loads a chat export into the string `data`, then builds the DataFrame `df`
# step by step. The same steps are packaged in `preprocess()` further down.
#
# NOTE(review): these statements run at import time, so importing this module
# requires 'Chat File.txt' in the working directory and prints the whole chat.
# Consider moving them under `if __name__ == "__main__":` once it is confirmed
# that nothing relies on the module-level `data` / `df`.

# Read the whole chat file as one string; the context manager closes the
# handle even if reading fails.
with open('Chat File.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Viewing the chat file to check how the data is organised
print(data)
print(type(data))

# --- Split the raw text into timestamps and message bodies -------------------
# Every entry starts with a prefix such as '12/03/21, 9:15 pm - '.
# Raw string: identical pattern text, without invalid-escape warnings.
pattern = r'\d{1,2}\/\d{1,2}\/\d{2},\s\d{1,2}:\d{2}\s(?:am|pm)\s-\s'
# Text between prefixes; [1:] skips whatever precedes the first prefix.
messages = re.split(pattern, data)[1:]
# The prefixes themselves: date and time each message was shared.
dates = re.findall(pattern, data)

# --- Build the DataFrame 'df' ------------------------------------------------
df = pd.DataFrame({'date': dates, 'user_message': messages})
# Converting the 'date' column from prefix strings to datetime values
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y, %I:%M %p - ')
# Bare expressions: they only display something in a notebook/REPL session.
df.head()
# Checking the shape of data
df.shape

# --- Separate the sender from the message text -------------------------------
# Each 'user_message' value looks like '<sender>: <text>'; entries without a
# '<sender>: ' prefix are treated as group notifications.
users = []  # sender names
messages = []  # message texts
for message in df['user_message']:
    # Split only at the FIRST ': ' so that any later ': ' inside the message
    # text is kept intact (an unlimited split would break the text apart and
    # lose those colons).
    entry = re.split(r'([\w\W]+?):\s', message, maxsplit=1)
    if entry[1:]:  # a sender was captured: ['', sender, text]
        users.append(entry[1])
        messages.append(entry[2])
    else:
        # NOTE(review): preprocess() in this file uses the lowercase label
        # 'group_notification'; confirm which spelling is intended.
        users.append('Group_Notification')
        messages.append(entry[0])

# Add the 'users' and 'messages' lists as new columns to the DataFrame 'df'
df['user'] = users
df['message'] = messages
# Dropping the combined column, which is no longer needed
df.drop(columns=['user_message'], inplace=True)
# Date, User (Message Sender), and Message are now in 3 separate columns
df.head()

# --- Break the timestamp into separate columns -------------------------------
# The 'date' column holds both the date and the time the message was shared.
df['only_date'] = df['date'].dt.date  # calendar date only
df['year'] = df['date'].dt.year
df['month_num'] = df['date'].dt.month  # month as a number
df['month'] = df['date'].dt.month_name()  # full month name
df['day'] = df['date'].dt.day  # day of the month
df['day_name'] = df['date'].dt.day_name()  # name of the weekday
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
# Removing the 'date' column now that its parts have been extracted
df.drop(columns=['date'], inplace=True)
df.head()

# To automate extracting data from a WhatsApp chat file, preprocessing it, and
# returning the result as a Pandas DataFrame, the steps above are encapsulated
# in a function. It lives in this same file, 'preprocessor.py', so that it can
# be imported and reused from other Python scripts.
def preprocess(data):
    """Turn the raw text of a chat export into a pandas DataFrame.

    Every entry in ``data`` is expected to start with a timestamp prefix of
    the form ``'12/03/21, 9:15 pm - '`` (day/month/2-digit year, 12-hour
    clock, lowercase am/pm). Text before the first prefix is ignored.

    Args:
        data: The full chat text as a single string.

    Returns:
        A DataFrame with one row per entry and the columns ``user``,
        ``message``, ``only_date``, ``year``, ``month_num``, ``month``,
        ``day``, ``day_name``, ``hour`` and ``minute``. Entries without a
        ``'<sender>: '`` prefix get the user ``'group_notification'``.
        Message text is kept as found, including any trailing newline.
    """
    import re
    import pandas as pd

    # Compiled once; raw strings keep the pattern text identical while
    # avoiding invalid-escape warnings for \d, \s and \w.
    timestamp_re = re.compile(
        r'\d{1,2}\/\d{1,2}\/\d{2},\s\d{1,2}:\d{2}\s(?:am|pm)\s-\s'
    )
    sender_re = re.compile(r'([\w\W]+?):\s')

    # Text between timestamp prefixes, and the prefixes themselves.
    messages = timestamp_re.split(data)[1:]
    dates = timestamp_re.findall(data)

    df = pd.DataFrame({'date': dates, 'user_message': messages})
    df['date'] = pd.to_datetime(df['date'], format='%d/%m/%y, %I:%M %p - ')

    users = []
    messages = []
    for message in df['user_message']:
        # Split only at the FIRST ': ' so a later ': ' inside the message
        # text is preserved; an unlimited split would break the text apart
        # and drop those colons when the pieces are re-joined.
        entry = sender_re.split(message, maxsplit=1)
        if entry[1:]:  # ['', sender, text]
            users.append(entry[1])
            messages.append(entry[2])
        else:
            # No '<sender>: ' prefix: treat the entry as a system notice.
            users.append('group_notification')
            messages.append(entry[0])

    df['user'] = users
    df['message'] = messages
    df.drop(columns=['user_message'], inplace=True)

    # Expand the timestamp into separate calendar/time columns.
    df['only_date'] = df['date'].dt.date
    df['year'] = df['date'].dt.year
    df['month_num'] = df['date'].dt.month
    df['month'] = df['date'].dt.month_name()
    df['day'] = df['date'].dt.day
    df['day_name'] = df['date'].dt.day_name()
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute

    # The combined timestamp is not part of the returned frame.
    df.drop(columns=['date'], inplace=True)
    return df