<a href="https://colab.research.google.com/github/pszemraj/ai-msgbot/blob/main/colab-notebooks/EDA_exported_phone_text_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EDA - exported phone text data

- A notebook for exploring the data structure of exported message files. The findings may then inform adjustments to the scripts in `parsing-messages`.
- yes, the example is part of my texts with my mom.

In [None]:
%%capture

!pip install -U pandas
!pip install -U pandas-profiling
from pandas_profiling import ProfileReport


import pandas as pd
import numpy as np

In [None]:
%%capture

!pip install clean-text
!pip install -U unidecode

from cleantext import clean

In [None]:
# Link to the exported conversation CSV that the rest of the notebook loads.
# The trailing `# @param` annotation is a Colab form-field marker, so it has
# to stay on the same line as the assignment.
dl_link = "https://www.dropbox.com/s/ewbr4vpqhzd7p84/Messages%20-%20Momma%20Szemraj.csv?dl=1"  # @param {type:"string"}

In [None]:
# Read the exported conversation directly from the link, then let pandas pick
# the best nullable extension dtype for each column.
df = pd.read_csv(dl_link)
df = df.convert_dtypes()

df.head()

Unnamed: 0,Chat Session,Message Date,Delivered Date,Read Date,Service,Type,Sender ID,Sender Name,Status,Replying to,Subject,Text,Attachment,Attachment type
0,Momma Szemraj,2020-02-10 20:04:42,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,,,Get the supplement that Dad send you pictures....,,
1,Momma Szemraj,2020-02-10 20:05:28,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,,,It maybe a virus. Emilie had that went to see ...,,
2,Momma Szemraj,2020-02-10 20:06:36,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,,,She was taking that liquid iron with B vitamins,,
3,Momma Szemraj,2020-02-10 20:09:39,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,,,Eat ramen with beef instead those salads,,
4,Momma Szemraj,2020-02-10 20:09:47,,,SMS,Outgoing,,,Sent,,,Ok I will,,


In [None]:
# Column dtypes and non-null counts for the freshly loaded frame; the
# non-null counts show which columns are mostly (or entirely) empty.

df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219 entries, 0 to 218
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Chat Session     219 non-null    string
 1   Message Date     219 non-null    string
 2   Delivered Date   5 non-null      string
 3   Read Date        91 non-null     string
 4   Service          219 non-null    string
 5   Type             219 non-null    string
 6   Sender ID        91 non-null     Int64 
 7   Sender Name      91 non-null     string
 8   Status           219 non-null    string
 9   Replying to      0 non-null      Int64 
 10  Subject          0 non-null      Int64 
 11  Text             205 non-null    string
 12  Attachment       15 non-null     string
 13  Attachment type  15 non-null     string
dtypes: Int64(3), string(11)
memory usage: 24.7 KB


In [None]:
# Discard columns that carry no data at all (every value missing), then count
# the distinct values left in each remaining column.
df = df.dropna(axis="columns", how="all")
df.nunique()

Chat Session         1
Message Date       216
Delivered Date       4
Read Date           70
Service              1
Type                 2
Sender ID            1
Sender Name          1
Status               2
Text               196
Attachment          13
Attachment type      3
dtype: int64

In [None]:
# Number of messages per value of the `Type` column.
df["Type"].value_counts()

Outgoing    128
Incoming     91
Name: Type, dtype: Int64

In [None]:
import pprint as pp

# Distinct delivery statuses present in the export, as a plain Python list.
unique_statuses = list(df["Status"].unique())

status_summary = "The type is {} and the length is {}".format(
    type(unique_statuses), len(unique_statuses)
)
print(status_summary)

pp.pprint(unique_statuses)

The type is <class 'list'> and the length is 2
['Read', 'Sent']


---


In [None]:
# Build a profiling report for the loaded frame and render it inline in the
# notebook output.
# NOTE(review): `minimal=True` presumably selects a lighter-weight report
# configuration — confirm against the pandas-profiling documentation.
profile = ProfileReport(
    df, title="Pandas Profiling Report", minimal=True, dark_mode=True
)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# notes

- Filter out rows whose `Text` is NA (these appear to be attachment-only messages).
- Filter out texts that contain quoted text.
- Filter out texts that contain the phrase "an image".

In [None]:
import pprint as pp

# Show the remaining column names, one per line.
pp.pprint(df.columns.tolist())

['Chat Session',
 'Message Date',
 'Delivered Date',
 'Read Date',
 'Service',
 'Type',
 'Sender ID',
 'Sender Name',
 'Status',
 'Text',
 'Attachment',
 'Attachment type']


In [None]:
# Build `clean_df`: the subset of `df` whose `Text` is usable for a transcript.
# Every filter is expressed as a boolean mask over `df` and applied in a single
# `.loc` selection, so the result is a fresh frame (no in-place edits on a
# slice of `df`) and re-running the cell always gives the same output.

# A message is dropped when its first space-delimited word contains one of
# these words (presumably reaction-style messages such as 'Liked ...' —
# confirm against the export format).
emote_words = ["Liked", "Disliked", "Loved", "Emphasized"]

text = df["Text"]

# First space-delimited token of each message. Missing texts are replaced by
# "" first so the substring test below never sees a missing value.
first_word = text.fillna("").str.split(" ").str[0]
is_emote = first_word.map(
    lambda word: any(substring in word for substring in emote_words)
).astype(bool)

# NOTE(review): because regex=False, the pattern below is matched as a literal
# substring (newline, double quote, '*', '.', double quote) and not as a
# regular expression. It looks like a regex was intended for the "texts that
# contain quotes" rule; behaviour is left unchanged here — confirm the intent.
has_quote = text.str.contains('\n"*."', na=False, regex=False)
has_image = text.str.contains("an image", na=False, regex=False)

keep = text.notna() & ~has_quote & ~has_image & ~is_emote

clean_df = df.loc[keep].reset_index(drop=True)
clean_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Chat Session     205 non-null    string
 1   Message Date     205 non-null    string
 2   Delivered Date   5 non-null      string
 3   Read Date        88 non-null     string
 4   Service          205 non-null    string
 5   Type             205 non-null    string
 6   Sender ID        88 non-null     Int64 
 7   Sender Name      88 non-null     string
 8   Status           205 non-null    string
 9   Text             205 non-null    string
 10  Attachment       1 non-null      string
 11  Attachment type  1 non-null      string
dtypes: Int64(1), string(11)
memory usage: 19.5 KB


In [None]:
# Same profiling report as before, but for the cleaned frame, rendered inline
# so it can be compared with the report for the raw data.
fixedprof = ProfileReport(
    clean_df, title="Cleaned Profiling Report", minimal=True, dark_mode=True
)
fixedprof.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# infer datetime + sort dataframe

In [None]:
# Work on a copy so `clean_df` keeps its original string timestamps.
srt_df = clean_df.copy()

# Parse the timestamp strings into real datetimes so the sort below is
# chronological. `infer_datetime_format` is deprecated in pandas 2.x (format
# inference is the default behaviour), so it is no longer passed.
srt_df["Message Date"] = pd.to_datetime(srt_df["Message Date"])

# datetime64 values order correctly on their own, so no natural-sort key (and
# no `natsort` dependency, which this notebook never installs) is needed.
# A stable sort keeps messages that share a timestamp in their export order.
srt_df = srt_df.sort_values(by="Message Date", ascending=True, kind="stable")

srt_df.head()

Unnamed: 0,Chat Session,Message Date,Delivered Date,Read Date,Service,Type,Sender ID,Sender Name,Status,Text,Attachment,Attachment type
0,Momma Szemraj,2020-02-10 20:04:42,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,Get the supplement that Dad send you pictures....,,
1,Momma Szemraj,2020-02-10 20:05:28,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,It maybe a virus. Emilie had that went to see ...,,
2,Momma Szemraj,2020-02-10 20:06:36,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,She was taking that liquid iron with B vitamins,,
3,Momma Szemraj,2020-02-10 20:09:39,,2020-02-10 20:09:41,SMS,Incoming,18476911653.0,Momma Szemraj,Read,Eat ramen with beef instead those salads,,
4,Momma Szemraj,2020-02-10 20:09:47,,,SMS,Outgoing,,,Sent,Ok I will,,


# Build the conversation text

In [None]:
# Column names of the sorted frame, one per line.
pp.pprint(srt_df.columns.tolist())

['Chat Session',
 'Message Date',
 'Delivered Date',
 'Read Date',
 'Service',
 'Type',
 'Sender ID',
 'Sender Name',
 'Status',
 'Text',
 'Attachment',
 'Attachment type']


In [None]:
# Assemble the transcript as a flat list of lines. Each message contributes
# three entries: a speaker header, the cleaned message text, and a blank
# separator line.


def _speaker_header(message):
    """Return the header line (speaker label, colon, newline) for one row."""
    if message["Type"] == "Outgoing":
        return "peter szemraj:" + "\n"
    if pd.notna(message["Sender Name"]):
        return str(message["Sender Name"]) + ":\n"
    # No sender name recorded: fall back to the sender ID.
    return str(message["Sender ID"]) + ":\n"


conv_words = []

for _, message in srt_df.iterrows():
    conv_words.extend(
        [
            _speaker_header(message),
            clean(str(message["Text"])) + "\n",
            "\n",
        ]
    )


pp.pprint(conv_words[:10])

['Momma Szemraj:\n',
 'get the supplement that dad send you pictures. get a steak and eat it all. '
 'you should eat once a week\n',
 '\n',
 'Momma Szemraj:\n',
 'it maybe a virus. emilie had that went to see dr. nelson did blood work and '
 "didn't show anything\n",
 '\n',
 'Momma Szemraj:\n',
 'she was taking that liquid iron with b vitamins\n',
 '\n',
 'Momma Szemraj:\n']


## save to file

In [None]:
from datetime import date
import os
from os.path import join

In [None]:
# Name the output after today's date (abbreviated month, day, four-digit year)
# and place it in the current working directory.
today_string = format(date.today(), "%b-%d-%Y")
comp_data_name = "compiled_message_data_{}.txt".format(today_string)
f_out_path = os.path.join(os.getcwd(), comp_data_name)

# Write the whole transcript in one go; characters that cannot be encoded are
# dropped rather than raising.
with open(f_out_path, mode="w", encoding="utf-8", errors="ignore") as fo:
    fo.write("".join(conv_words))

In [None]:
# Colab-only step: `google.colab` is not expected to be importable outside a
# Colab runtime. Hands the file at `f_out_path` to the browser as a download.
from google.colab import files

files.download(f_out_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>