In [90]:
# Load the raw CSV files, combine them into one DataFrame (df), and preview its first 5 rows.
from pathlib import Path

import pandas as pd
# Paths of the three raw GoEmotions CSV shards, relative to this notebook.
file_paths = [
    "../data/full_dataset/goemotions_1.csv",
    "../data/full_dataset/goemotions_2.csv",
    "../data/full_dataset/goemotions_3.csv",
]
# Read every shard into its own DataFrame (same order as file_paths).
dfs = list(map(pd.read_csv, file_paths))

# Stack the shards row-wise; ignore_index=True renumbers the rows 0..n-1
# instead of repeating each shard's own index.
df = pd.concat(dfs, ignore_index=True)
print(df.head())


                                                text       id  \
0                                    That game hurt.  eew5j0j   
1   >sexuality shouldn’t be a grouping category I...  eemcysk   
2     You do right, if you don't care then fuck 'em!  ed2mah1   
3                                 Man I love reddit.  eeibobj   
4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   
2             Labalool          confessions  t3_abru74  t1_ed2m7g7   
3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1                 False           0  ...     0   
1  1.548084e+09        37               

In [91]:
# Interpret created_utc as seconds since the Unix epoch and convert it to
# pandas datetimes so the timestamps are human-readable.
df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s")

# Keep only the rows whose example_very_unclear flag is False.
is_clear = df["example_very_unclear"].eq(False)
df = df[is_clear]

# Columns that are not wanted in the cleaned data set (the flag column is
# constant after the filter above, so it goes as well).
unwanted_columns = [
    "id",
    "author",
    "subreddit",
    "link_id",
    "parent_id",
    "rater_id",
    "example_very_unclear",
]
df = df.drop(columns=unwanted_columns)

# Double-check that only the wanted columns remain.
print(df.columns)


Index(['text', 'created_utc', 'admiration', 'amusement', 'anger', 'annoyance',
       'approval', 'caring', 'confusion', 'curiosity', 'desire',
       'disappointment', 'disapproval', 'disgust', 'embarrassment',
       'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
       'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
       'sadness', 'surprise', 'neutral'],
      dtype='object')


In [92]:
# Getting familiar with the dataset.
# Show the dimensions of df as a (rows, columns) tuple.
df.shape


(207814, 30)

In [93]:
# Summarise the frame: per-column dtype and non-null count, plus memory usage.
df.info()
# Count missing values per column to confirm the data set contains none.
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 207814 entries, 0 to 211223
Data columns (total 30 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   text            207814 non-null  object        
 1   created_utc     207814 non-null  datetime64[ns]
 2   admiration      207814 non-null  int64         
 3   amusement       207814 non-null  int64         
 4   anger           207814 non-null  int64         
 5   annoyance       207814 non-null  int64         
 6   approval        207814 non-null  int64         
 7   caring          207814 non-null  int64         
 8   confusion       207814 non-null  int64         
 9   curiosity       207814 non-null  int64         
 10  desire          207814 non-null  int64         
 11  disappointment  207814 non-null  int64         
 12  disapproval     207814 non-null  int64         
 13  disgust         207814 non-null  int64         
 14  embarrassment   207814 non-null  int64   

text              0
created_utc       0
admiration        0
amusement         0
anger             0
annoyance         0
approval          0
caring            0
confusion         0
curiosity         0
desire            0
disappointment    0
disapproval       0
disgust           0
embarrassment     0
excitement        0
fear              0
gratitude         0
grief             0
joy               0
love              0
nervousness       0
optimism          0
pride             0
realization       0
relief            0
remorse           0
sadness           0
surprise          0
neutral           0
dtype: int64

In [94]:
# Remove rows that carry no emotion label at all.
# The label columns are selected by position: everything after the first two
# columns and before the last one.
# NOTE(review): with the column order printed earlier in this notebook, the
# last column is 'neutral', so rows labelled only 'neutral' are removed here
# too - confirm that this is intended.
emotions_column = df.columns[2:-1]

# A row is kept when at least one of the selected label columns is set.
has_emotion = df[emotions_column].sum(axis=1) > 0
df = df[has_emotion]
print(emotions_column)

df.head()

Index(['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise'],
      dtype='object')


Unnamed: 0,text,created_utc,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,2019-01-25 01:50:39,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Man I love reddit.,2019-01-20 06:17:34,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,Right? Considering it’s such an important docu...,2019-01-23 21:50:08,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,"He isn't as big, but he's still quite popular....",2019-01-01 05:21:16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,That's crazy; I went to a super [RELIGION] hig...,2019-01-03 17:22:38,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [95]:
# Write the preprocessed data to a new CSV file (row index not included).
output_path = Path("../data/processed/goemotions_clean.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout); this is a no-op when it already does.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
