In [1]:
import re
import pandas as pd
import numpy as np
import emoji
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [2]:
# Read the whole exported chat into memory. The context manager closes the file
# handle as soon as the text has been read; the bare open() never released it.
with open('chat.txt', 'r', encoding = 'utf-8') as f:
    data = f.read()

In [4]:
# print(data)

In [5]:
# Timestamp prefix that starts every exported entry, e.g. "12/19/21, 11:09 PM - ".
# Written as a raw string: "\d" and "\s" are regex escapes, not Python string
# escapes, and a non-raw literal triggers an invalid-escape-sequence warning on
# current Python versions. The resulting pattern text is unchanged.
pattern = r'\d{1,2}/\d{1,2}/\d{1,4},\s\d{1,2}:\d{1,2}\s[AaPp][Mm]\s-\s'

In [6]:
# Cut the raw text at every timestamp prefix; the chunk in front of the first
# prefix is discarded so that each remaining piece is one entry body.
timestamp_re = re.compile(pattern)
messages = timestamp_re.split(data)[1:]
# messages

In [7]:
# Collect the timestamp prefixes themselves, in the order they occur in the text.
dates = re.compile(pattern).findall(data)
# dates

In [8]:
# Both lists come from the same pattern, so their lengths should line up.
for label, items in (('messages', messages), ('dates', dates)):
    print(label, len(items))

messages 3019
dates 3019


In [9]:
# One row per chat entry: the raw entry body next to its timestamp prefix.
df = pd.DataFrame(dict(user_message = messages, date = dates))

df.head()

Unnamed: 0,user_message,date
0,Messages and calls are end-to-end encrypted. N...,"12/19/21, 11:09 PM -"
1,Ankita: Kidhr h tu\n,"12/19/21, 11:09 PM -"
2,Shubham😎: 🚂\n,"12/20/21, 12:18 AM -"
3,Ankita: Train m\n,"12/20/21, 12:18 AM -"
4,Ankita: 🤗\n,"12/20/21, 12:18 AM -"


In [10]:
# Turn the captured prefix strings into datetimes. The format mirrors the prefix:
# month/day/2-digit year, 12-hour clock with AM/PM, then the trailing " - ".
date_format = '%m/%d/%y, %I:%M %p - '
df['date'] = pd.to_datetime(df['date'], format = date_format)
df.head()

Unnamed: 0,user_message,date
0,Messages and calls are end-to-end encrypted. N...,2021-12-19 23:09:00
1,Ankita: Kidhr h tu\n,2021-12-19 23:09:00
2,Shubham😎: 🚂\n,2021-12-20 00:18:00
3,Ankita: Train m\n,2021-12-20 00:18:00
4,Ankita: 🤗\n,2021-12-20 00:18:00


In [11]:
# Trim leading/trailing whitespace (including the trailing newline) from each entry.
trimmed = df['user_message'].str.strip()
df['user_message'] = trimmed
df.head()

Unnamed: 0,user_message,date
0,Messages and calls are end-to-end encrypted. N...,2021-12-19 23:09:00
1,Ankita: Kidhr h tu,2021-12-19 23:09:00
2,Shubham😎: 🚂,2021-12-20 00:18:00
3,Ankita: Train m,2021-12-20 00:18:00
4,Ankita: 🤗,2021-12-20 00:18:00


In [12]:
# separate users and messages

users = []
messages = []

for message in df['user_message']:
    # Split on the FIRST "<sender>: " only. Without maxsplit the pattern also
    # matches every later ": " inside the body (e.g. "Ankita: note: hello"),
    # which made entry[2] an empty or truncated string and lost the message text.
    # The pattern is a raw string so "\w", "\W" and "\s" reach the regex engine
    # without relying on Python's handling of unknown string escapes.
    entry = re.split(r'([\w\W]+?):\s', message, maxsplit = 1)
    if entry[1:]:
        # A sender prefix was found: entry[1] is the sender, entry[2] the full body.
        users.append(entry[1])
        messages.append(entry[2])
    else:
        # No "<sender>: " prefix anywhere in the entry: keep the text and tag it
        # so it can be filtered out as a non-user line.
        users.append('chat_notification')
        messages.append(entry[0])


df['users'] = users
df['messages'] = messages
df.drop(columns = ['user_message'], inplace = True)

df.head()

Unnamed: 0,date,users,messages
0,2021-12-19 23:09:00,chat_notification,Messages and calls are end-to-end encrypted. N...
1,2021-12-19 23:09:00,Ankita,Kidhr h tu
2,2021-12-20 00:18:00,Shubham😎,🚂
3,2021-12-20 00:18:00,Ankita,Train m
4,2021-12-20 00:18:00,Ankita,🤗


In [13]:
# Keep only entries written by a user. .copy() makes the filtered result an
# independent frame, so adding columns to it afterwards does not write into a
# slice of the unfiltered frame (the situation SettingWithCopyWarning flags).
df = df[df['users'] != 'chat_notification'].copy()
df.head()

Unnamed: 0,date,users,messages
1,2021-12-19 23:09:00,Ankita,Kidhr h tu
2,2021-12-20 00:18:00,Shubham😎,🚂
3,2021-12-20 00:18:00,Ankita,Train m
4,2021-12-20 00:18:00,Ankita,🤗
5,2021-12-20 00:19:00,Shubham😎,Hanji


In [14]:
# Break the timestamp into calendar and clock parts. The .dt accessor is taken
# once, up front, because the final assignment overwrites 'date' itself.
stamp = df['date'].dt

df['year'] = stamp.year
df['month'] = stamp.month_name()
df['day'] = stamp.day
df['day_name'] = stamp.day_name()
df['hour'] = stamp.hour
df['minute'] = stamp.minute
df['time'] = stamp.time
df['date'] = stamp.date

df.head()

Unnamed: 0,date,users,messages,year,month,day,day_name,hour,minute,time
1,2021-12-19,Ankita,Kidhr h tu,2021,December,19,Sunday,23,9,23:09:00
2,2021-12-20,Shubham😎,🚂,2021,December,20,Monday,0,18,00:18:00
3,2021-12-20,Ankita,Train m,2021,December,20,Monday,0,18,00:18:00
4,2021-12-20,Ankita,🤗,2021,December,20,Monday,0,18,00:18:00
5,2021-12-20,Shubham😎,Hanji,2021,December,20,Monday,0,19,00:19:00


In [15]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3015 entries, 1 to 3018
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date      3015 non-null   object
 1   users     3015 non-null   object
 2   messages  3015 non-null   object
 3   year      3015 non-null   int64 
 4   month     3015 non-null   object
 5   day       3015 non-null   int64 
 6   day_name  3015 non-null   object
 7   hour      3015 non-null   int64 
 8   minute    3015 non-null   int64 
 9   time      3015 non-null   object
dtypes: int64(4), object(6)
memory usage: 259.1+ KB


In [16]:
# Number of missing values in each column.
df.isna().sum()

date        0
users       0
messages    0
year        0
month       0
day         0
day_name    0
hour        0
minute      0
time        0
dtype: int64

In [17]:
import nltk

# Fetch the 'vader_lexicon' resource into the local nltk_data directory; when it
# is already present and current, nltk only reports that and returns True.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [18]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One analyzer instance, created once and reused for every message scored later.
sentiments = SentimentIntensityAnalyzer()

In [19]:
# Score every message exactly once and reuse the result for all three columns.
# Previously polarity_scores() ran three times per message, once per column.
polarity = [sentiments.polarity_scores(text) for text in df['messages']]

# Each share is scaled by 100 so the columns read as percentages.
df['positive_sent_%'] = [p["pos"]*100 for p in polarity]
df['negative_sent_%'] = [p["neg"]*100 for p in polarity]
df['neutral_sent_%'] = [p["neu"]*100 for p in polarity]

df.head()

Unnamed: 0,date,users,messages,year,month,day,day_name,hour,minute,time,positive_sent_%,negative_sent_%,neutral_sent_%
1,2021-12-19,Ankita,Kidhr h tu,2021,December,19,Sunday,23,9,23:09:00,0.0,0.0,100.0
2,2021-12-20,Shubham😎,🚂,2021,December,20,Monday,0,18,00:18:00,0.0,0.0,0.0
3,2021-12-20,Ankita,Train m,2021,December,20,Monday,0,18,00:18:00,0.0,0.0,100.0
4,2021-12-20,Ankita,🤗,2021,December,20,Monday,0,18,00:18:00,0.0,0.0,0.0
5,2021-12-20,Shubham😎,Hanji,2021,December,20,Monday,0,19,00:19:00,0.0,0.0,100.0


In [20]:
# Column totals, in order: positive (x), negative (y), neutral (z).
x, y, z = (
    sum(df[column])
    for column in ('positive_sent_%', 'negative_sent_%', 'neutral_sent_%')
)

def score(a,b,c):
    """Print the overall sentiment label for three polarity totals.

    Parameters
    ----------
    a : number
        Total for the positive polarity.
    b : number
        Total for the negative polarity.
    c : number
        Total for the neutral polarity.

    Prints 'Positive' when ``a`` is strictly the largest, 'Negative' when ``b``
    is strictly the largest, and 'Neutral' otherwise. Returns None.
    """
    if (a > b) and (a > c):
        print('Positive')
    elif (b > a) and (b > c):
        print('Negative')
    else:
        # Either neutral is the largest, or the top value is tied (for example
        # all three totals are 0). The independent `if`s used before printed
        # nothing at all on a tie; with no single dominant polarity the result
        # is now reported as neutral.
        print('Neutral')

# Print the overall label from the positive (x), negative (y) and neutral (z) totals.
score(x,y,z)

Neutral
