# View all available datasets

In [1]:
import os

print("List of all available datasets:")
for entry in os.scandir('./data'):
    if entry.is_file():
        print("-- " + entry.name)

List of all available datasets:
-- mergedData2.csv
-- demographic.csv
-- cleaned_hm.csv
-- mergedData.csv


---
# Import necessary libraries

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_colwidth = 1000

---
## Demographic dataset explorations

In [4]:
data_demo = pd.read_csv("data/demographic.csv")

In [5]:
data_demo.shape

(10844, 6)

In [6]:
data_demo.columns

Index(['wid', 'age', 'country', 'gender', 'marital', 'parenthood'], dtype='object')

In [7]:
data_demo.sample(5)

Unnamed: 0,wid,age,country,gender,marital,parenthood
454,455,29.0,IND,m,married,y
4906,5367,27.0,USA,f,single,n
2113,2128,21.0,USA,f,single,n
1217,1223,23.0,TUR,m,single,n
6384,7455,26.0,ITA,m,single,n


In [8]:
print("Number of unique countries:", data_demo['country'].unique().shape[0])
print("which are:", data_demo['country'].unique())

Number of unique countries: 101
which are: ['USA' 'IND' 'VNM' 'THA' 'GBR' 'SRB' 'BGR' 'BRA' 'BGD' 'CAN' 'PRT' 'PHL'
 'VEN' 'MKD' 'DOM' 'DNK' 'IDN' 'AUS' 'URY' 'JAM' 'EGY' 'DEU' 'KEN' 'MDA'
 'IRL' 'NGA' 'ALB' 'MEX' 'TTO' 'KWT' 'RUS' 'LTU' 'PRI' 'SWE' 'PER' 'PAK'
 'TUR' 'FRA' 'ARE' 'COL' 'GRC' 'ROU' 'ARM' 'MAC' 'SGP' 'UGA' 'POL' 'NIC'
 'JPN' 'ESP' 'TWN' 'FIN' 'ITA' 'GHA' 'AUT' 'NPL' 'NZL' 'NLD' 'KOR' 'LKA'
 'KNA' 'ZAF' 'BEL' 'ISL' 'DZA' 'AFG' 'GMB' nan 'ASM' 'MYS' 'CZE' 'MLT'
 'SVN' 'UMI' 'ARG' 'SAU' 'EST' 'ECU' 'ZMB' 'CHL' 'MAR' 'ISR' 'LVA' 'KAZ'
 'NOR' 'CRI' 'BRB' 'HRV' 'ETH' 'TCA' 'BHS' 'VIR' 'GTM' 'IRQ' 'HKG' 'MUS'
 'TUN' 'SUR' 'SLV' 'CYP' 'UKR']


In [9]:
print("Number of unique marital status:", data_demo['marital'].unique().shape[0])
print("which are:", data_demo['marital'].unique())

Number of unique marital status: 6
which are: ['married' 'single' 'divorced' 'separated' 'widowed' nan]


---
## Cleaned dataset explorations

In [10]:
data_happy = pd.read_csv("data/cleaned_hm.csv")

In [11]:
data_happy.shape

(100535, 9)

In [12]:
data_happy.columns

Index(['hmid', 'wid', 'reflection_period', 'original_hm', 'cleaned_hm',
       'modified', 'num_sentence', 'ground_truth_category',
       'predicted_category'],
      dtype='object')

In [13]:
data_happy.sample(5)

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
95970,124174,78,24h,"My boss and I have sushi for lunch today, it was beyond delicious.","My boss and I have sushi for lunch today, it was beyond delicious.",True,1,,achievement
63769,91773,26,3m,I attended a birthday party where the magician chose me as the main host in showing all the magic,I attended a birthday party where the magician chose me as the main host in showing all the magic,True,1,,enjoy_the_moment
92388,120575,4910,24h,"I made progress on a song that I was working on, and it made me feel as if I'm really making progress as a musician.","I made progress on a song that I was working on, and it made me feel as if I'm really making progress as a musician.",True,1,,achievement
64645,92653,10657,3m,The day before yesterday my children and I went to watch the ring of fire at the Keeper of the Plains -it gave me great joy to watch their faces as the fires were burning.,The day before yesterday my children and I went to watch the ring of fire at the Keeper of the Plains -it gave me great joy to watch their faces as the fires were burning.,True,1,affection,affection
46063,73986,12843,3m,"A small event that made me happy in the past two weeks was going out to eat at a fantastic and expensive restaurant that was just built in our city of Fort Wayne, Indiana. The food was incredible, and I had not eaten much during the day, which made it taste all the better (and maybe better than it should have!), leading to an eventual overindulgence.","A small event that made me happy in the past two weeks was going out to eat at a fantastic and expensive restaurant that was just built in our city of Fort Wayne, Indiana. The food was incredible, and I had not eaten much during the day, which made it taste all the better (and maybe better than it should have!), leading to an eventual overindulgence.",True,3,leisure,leisure


In [14]:
print("Number of unique reflection periods:", data_happy['reflection_period'].unique().shape[0])
print("which are:", data_happy['reflection_period'].unique())

Number of unique reflection periods: 2
which are: ['24h' '3m']


In [15]:
print("Number of unique types of number of sentences:", data_happy['num_sentence'].unique().shape[0])
print("which are:", data_happy['num_sentence'].unique())

Number of unique types of number of sentences: 47
which are: [ 1  2  3  9  6  4 53  5 12 16  7 13 11 28  8 10 14 17 25 27 37 34 21 23
 19 26 24 22 31 18 58 32 15 29 35 56 51 20 46 42 30 60 69 40 48 44 45]


In [16]:
print("Number of unique ground truth categories:", data_happy['ground_truth_category'].unique().shape[0])
print("which are:", data_happy['ground_truth_category'].unique())

Number of unique ground truth categories: 8
which are: [nan 'bonding' 'leisure' 'affection' 'enjoy_the_moment' 'achievement'
 'nature' 'exercise']


In [17]:
print("Number of unique predicted categories:", data_happy['predicted_category'].unique().shape[0])
print("which are:", data_happy['predicted_category'].unique())

Number of unique predicted categories: 7
which are: ['affection' 'exercise' 'bonding' 'leisure' 'achievement'
 'enjoy_the_moment' 'nature']


---
# Qualitative analysis

In [18]:
data = pd.read_csv("data/mergedData2.csv")

In [20]:
data.columns

Index(['wid', 'hmid', 'reflection_period', 'original_hm', 'cleaned_hm',
       'modified', 'num_sentence', 'ground_truth_category',
       'predicted_category', 'age', 'country', 'gender', 'marital',
       'parenthood'],
      dtype='object')

### Dimensions of dataset

Features: age group, country, gender, marital status, parenthood, reflection period


Outcome: predicted category, unigram, bigram

In [21]:
data_group_1 = data[(data["country"] == "USA") & (data["marital"] == "married") & (data["reflection_period"] == "3m")]

In [22]:
data_group_1.shape

(15309, 14)

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
word_vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(data_group_1['cleaned_hm'])
frequencies = sum(sparse_matrix).toarray()[0]
bag_of_words = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency', 'n-gram'])

ValueError: Shape of passed values is (1, 95118), indices imply (2, 95118)

In [None]:
nnnn = bag_of_words.sort_values('frequency')

In [None]:
nnnn.tail(100)