# Clean posts data

In [1]:
import pandas as pd
import numpy as np
import os
import re
import ast
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

### Đọc tất cả các file csv trong thư mục data/vebay69/raw bắt đầu bằng "posts_"

In [2]:
def read_all_raw_data(
    folder_path=os.path.join('data', 'vebay69', 'raw')
):
    """Load every raw posts CSV in a folder into a single DataFrame.

    Only files whose name starts with "posts_" and ends with ".csv" are read.
    Files are processed in sorted name order so the row order of the result
    does not depend on the arbitrary order returned by ``os.listdir``.

    Parameters
    ----------
    folder_path : str
        Directory that holds the raw crawl output.

    Returns
    -------
    pd.DataFrame
        All matching files stacked row-wise with a fresh 0..n-1 index, or an
        empty DataFrame when no file matches.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path)
        if name.startswith("posts_") and name.endswith(".csv")
    )
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]

    if not frames:
        # pd.concat raises on an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()

    # Concatenate once: growing a frame inside the loop re-copies all previous rows
    # on every iteration and mixes an empty seed frame into dtype resolution.
    return pd.concat(frames, ignore_index=True)
# Load all raw posts files from the default raw-data folder into one frame.
posts_df = read_all_raw_data()

In [3]:
# Column names, non-null counts and dtypes of the combined raw frame.
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 51 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   post_id                        1006 non-null   int64  
 1   text                           1006 non-null   object 
 2   post_text                      1006 non-null   object 
 3   shared_text                    30 non-null     object 
 4   original_text                  2 non-null      object 
 5   time                           1006 non-null   object 
 6   timestamp                      993 non-null    float64
 7   image                          967 non-null    object 
 8   image_lowquality               1006 non-null   object 
 9   images                         1005 non-null   object 
 10  images_description             1005 non-null   object 
 11  images_lowquality              1006 non-null   object 
 12  images_lowquality_description  1006 non-null   o

### Xóa các cột không có dữ liệu: "0 non-null"

In [4]:
# Columns for which count() (number of non-null cells) is zero carry no data at all.
empty_columns = [col for col in posts_df.columns if posts_df[col].count() == 0]
print(len(empty_columns), empty_columns)

11 ['video', 'video_duration_seconds', 'video_height', 'video_id', 'video_quality', 'video_size_MB', 'video_thumbnail', 'video_watches', 'video_width', 'factcheck', 'sharers']


In [5]:
# Discard every column made up solely of missing values, then re-inspect the schema.
posts_df = posts_df.dropna(axis='columns', how='all')
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   post_id                        1006 non-null   int64  
 1   text                           1006 non-null   object 
 2   post_text                      1006 non-null   object 
 3   shared_text                    30 non-null     object 
 4   original_text                  2 non-null      object 
 5   time                           1006 non-null   object 
 6   timestamp                      993 non-null    float64
 7   image                          967 non-null    object 
 8   image_lowquality               1006 non-null   object 
 9   images                         1005 non-null   object 
 10  images_description             1005 non-null   object 
 11  images_lowquality              1006 non-null   object 
 12  images_lowquality_description  1006 non-null   o

### Tạo 1 DataFrame để đếm số lượng các unique values trong dữ liệu

In [6]:
def get_unique_values_posts(posts_df: pd.DataFrame) -> pd.DataFrame:
    """Tabulate how often each distinct value occurs in every column.

    Parameters
    ----------
    posts_df : pd.DataFrame
        Frame whose columns are summarised. Missing values are not counted
        (``Series.value_counts`` skips them by default).

    Returns
    -------
    pd.DataFrame
        Long-format table with one row per (column, distinct value) pair:
        ``Column`` is the source column name, ``Unique Values`` the value and
        ``Numbers`` its occurrence count. Within a column, rows follow the
        ``value_counts`` order (most frequent first).
    """
    result_columns = ['Column', 'Unique Values', 'Numbers']

    per_column_counts = []
    for col in posts_df.columns:
        counts = posts_df[col].value_counts()
        per_column_counts.append(
            pd.DataFrame({'Column': col, 'Unique Values': counts.index, 'Numbers': counts.values})
        )

    if not per_column_counts:
        # No columns to summarise: pd.concat would raise on an empty list.
        return pd.DataFrame(columns=result_columns)

    # Concatenate once instead of re-copying the accumulated table per column; this
    # also keeps an empty object-dtype seed frame out of the dtype resolution so
    # 'Numbers' stays an integer column.
    return pd.concat(per_column_counts, ignore_index=True)
# Build the per-column value-count table for the current frame and display it.
unique_values_posts = get_unique_values_posts(posts_df)
unique_values_posts

Unnamed: 0,Column,Unique Values,Numbers
0,post_id,6389286101191527,2
1,post_id,666154968953044,2
2,post_id,5783494788437331,2
3,post_id,5781365631983580,2
4,post_id,5409327575854056,2
...,...,...,...
18649,fetched_time,2023-11-20 08:09:24.512043,1
18650,fetched_time,2023-11-20 08:09:36.213784,1
18651,fetched_time,2023-11-20 08:09:42.426295,1
18652,fetched_time,2023-11-20 08:09:55.402730,1


### Lọc ra và xóa các cột có đúng 1 value cho mọi hàng

In [7]:
# A value that occurs as many times as there are rows means its column is constant.
del_col = unique_values_posts.loc[unique_values_posts['Numbers'].eq(len(posts_df))]
del_col

Unnamed: 0,Column,Unique Values,Numbers
12141,username,Vẽ bậy,1006
13148,is_live,False,1006
13155,available,True,1006
15873,page_id,772954189491441,1006
17653,was_live,False,1006


In [8]:
# Remove the constant columns identified in del_col.
posts_df = posts_df.drop(columns=del_col['Column'])

In [9]:
# Re-check the schema after the constant columns were dropped.
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 35 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   post_id                        1006 non-null   int64  
 1   text                           1006 non-null   object 
 2   post_text                      1006 non-null   object 
 3   shared_text                    30 non-null     object 
 4   original_text                  2 non-null      object 
 5   time                           1006 non-null   object 
 6   timestamp                      993 non-null    float64
 7   image                          967 non-null    object 
 8   image_lowquality               1006 non-null   object 
 9   images                         1005 non-null   object 
 10  images_description             1005 non-null   object 
 11  images_lowquality              1006 non-null   object 
 12  images_lowquality_description  1006 non-null   o

### Xóa các hàng dữ liệu bị trùng lặp
Có 1 vài post bị trùng lặp trong quá trình crawl, chúng ta cần loại bỏ chúng và giữ lại 1 hàng duy nhất cho mỗi bài post

In [10]:
# post_id values that occur on more than one row, i.e. posts crawled more than once.
unique_values_posts.loc[
    unique_values_posts['Column'].eq("post_id")
    & unique_values_posts['Numbers'].gt(1)
]

Unnamed: 0,Column,Unique Values,Numbers
0,post_id,6389286101191527,2
1,post_id,666154968953044,2
2,post_id,5783494788437331,2
3,post_id,5781365631983580,2
4,post_id,5409327575854056,2
...,...,...,...
87,post_id,5987235421396599,2
88,post_id,5985066091613532,2
89,post_id,5984686764984798,2
90,post_id,5990959687690839,2


In [11]:
# Example: show every row crawled for one post_id that appears more than once.
posts_df[posts_df['post_id'] == 5417956751657805]

Unnamed: 0,post_id,text,post_text,shared_text,original_text,time,timestamp,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,post_url,link,links,user_id,user_url,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reactions,reaction_count,with,image_id,image_ids,fetched_time
411,5417956751657805,Cô bán bún chả be like\n#Lìquầnxòi,Cô bán bún chả be like\n#Lìquầnxòi,,,2022-06-29 07:50:52,1656489000.0,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],4866,303,53,https://facebook.com/vebay69/posts/54179567516...,,[{'link': '/hashtag/lìquầnxòi?_ft_=encrypted_t...,772954189491441,https://facebook.com/vebay69?_ft_=encrypted_tr...,,,,,,,[],https://www.facebook.com/vebay69/posts/5417956...,"{'like': 4866, 'love': 15, 'haha': 3290, 'wow'...",8255,,5417956000000000.0,['5417956358324511'],2023-11-21 08:18:02.441933
877,5417956751657805,Cô bán bún chả be like\n#Lìquầnxòi,Cô bán bún chả be like\n#Lìquầnxòi,,,2022-06-29 07:50:52,1656489000.0,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],4866,303,53,https://facebook.com/vebay69/posts/54179567516...,,[{'link': '/hashtag/lìquầnxòi?_ft_=encrypted_t...,772954189491441,https://facebook.com/vebay69?_ft_=encrypted_tr...,,,,,,,[],https://www.facebook.com/vebay69/posts/5417956...,"{'like': 4866, 'love': 15, 'haha': 3290, 'wow'...",8255,,5417956000000000.0,['5417956358324511'],2023-11-21 08:06:23.963421


Sắp xếp lại dữ liệu theo cột "fetched_time" tăng dần, tức theo thời gian crawl post từ sớm nhất đến muộn nhất. Với các hàng dữ liệu bị trùng, giữ lại hàng có "fetched_time" sớm nhất (lần crawl đầu tiên), đúng như hàm `drop_duplicates_keep_earliest` bên dưới thực hiện.

In [12]:
def drop_duplicates_keep_earliest(df, subset='post_id', time_column='fetched_time'):
    """Return one row per ``subset`` key, keeping the row with the smallest ``time_column``.

    The input frame is not modified; a new, time-sorted frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that may contain several rows for the same key.
    subset : str or list of str
        Column(s) that identify a duplicate.
    time_column : str
        Column used to order the rows; the earliest row of each key is kept.
        Rows with a missing value in this column sort last (pandas default),
        so they are kept only when the key has no dated row.

    Returns
    -------
    pd.DataFrame
        De-duplicated rows ordered by ``time_column`` ascending. The original
        index labels are preserved (not reset).
    """
    # mergesort is stable: rows that tie on time_column keep their original
    # relative order, so which duplicate survives is deterministic.
    ordered = df.sort_values(by=time_column, ascending=True, kind='mergesort')
    return ordered.drop_duplicates(subset=subset, keep='first')
# Collapse posts that were crawled more than once to a single row each.
posts_df = drop_duplicates_keep_earliest(posts_df)

In [13]:
# Renumber the rows 0..n-1 (the old labels are discarded) and report the new size.
posts_df = posts_df.reset_index(drop=True)
posts_df.shape

(914, 35)

In [14]:
# Preview the first rows of the de-duplicated frame.
posts_df.head()

Unnamed: 0,post_id,text,post_text,shared_text,original_text,time,timestamp,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,post_url,link,links,user_id,user_url,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reactions,reaction_count,with,image_id,image_ids,fetched_time
0,685638947004646,Mùa đông chưa vậy\n#Panda,Mùa đông chưa vậy\n#Panda,,,2023-11-20 05:28:02,1700458000.0,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Lạnh quá ...",783,24,10,https://facebook.com/vebay69/posts/68563894700...,,[{'link': '/hashtag/panda?refid=17&_ft_=encryp...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6856389...,"{'like': 783, 'love': 6, 'haha': 396, 'wow': 3...",1193,,685638700000000.0,['685638737004667'],2023-11-20 08:04:44.252150
1,685577330344141,"Con chịu rồi, cái này con không học được\n#Vui...","Con chịu rồi, cái này con không học được\n#Vui...",,,2023-11-20 04:00:02,1700453000.0,,https://scontent-sin6-2.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-2.xx.fbcdn.net/v/t39.3...,"[""May be pop art of text that says 'Học tập an...",3421,43,0,https://facebook.com/vebay69/posts/68557733034...,,[{'link': '/hashtag/vuilavietlott?refid=17&_ft...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6855773...,"{'like': 3421, 'haha': 632, 'sad': 1}",4054,,685576700000000.0,['685576747010866'],2023-11-20 08:04:46.677532
2,685111467057394,Bạn tồiiiiiiiiii\n#Muonggg,Bạn tồiiiiiiiiii\n#Muonggg,,,2023-11-19 08:54:12,1700384000.0,https://m.facebook.com/photo/view_full_size/?f...,https://scontent-sin6-4.xx.fbcdn.net/v/t39.308...,['https://m.facebook.com/photo/view_full_size/...,['May be a doodle of text'],['https://scontent-sin6-4.xx.fbcdn.net/v/t39.3...,['May be a doodle of text'],241,31,7,https://facebook.com/vebay69/posts/68511146705...,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6851114...,"{'like': 241, 'love': 6, 'haha': 196, 'wow': 6...",455,,685111300000000.0,['685111340390740'],2023-11-20 08:04:52.194919
3,684686633766544,Trí khôn của ta đây 😀\n#tayduvng #VNGGames,Trí khôn của ta đây 😀\n#tayduvng #VNGGames,,,2023-11-18 13:00:10,1700312000.0,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",2884,70,8,https://facebook.com/vebay69/posts/68468663376...,,[{'link': '/hashtag/tayduvng?refid=17&_ft_=enc...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6846866...,"{'like': 2884, 'love': 1, 'haha': 571, 'wow': ...",3460,,684686600000000.0,['684686610433213'],2023-11-20 08:04:59.901802
4,684603707108170,Team vô sản ✌️\n#Muonggg,Team vô sản ✌️\n#Muonggg,,,2023-11-18 08:34:17,1700296000.0,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['May be an image of text'],1743,32,17,https://facebook.com/vebay69/posts/68460370710...,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6846037...,"{'like': 1743, 'love': 2, 'haha': 180, 'wow': ...",1940,,684603500000000.0,['684603543774853'],2023-11-20 08:05:05.401087


In [15]:
# Recompute the value-count table so it reflects the de-duplicated, trimmed frame.
unique_values_posts = get_unique_values_posts(posts_df)

### Xóa cột "original_text", "text"

In [16]:
# Rows that actually have a value in 'original_text'.
posts_df[posts_df['original_text'].notna()]

Unnamed: 0,post_id,text,post_text,shared_text,original_text,time,timestamp,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,post_url,link,links,user_id,user_url,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reactions,reaction_count,with,image_id,image_ids,fetched_time
456,6244522345667904,Làm màu là giỏi\n\nI am good at coloring,Làm màu là giỏi\n\nI am good at coloring,,Làm màu là giỏi,2023-04-05 12:21:36,1680697000.0,https://scontent-sin6-4.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-4.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-4.xx.fbcdn.net/v/t39.3...,['No photo description available.'],['https://scontent-sin6-4.xx.fbcdn.net/v/t39.3...,['No photo description available.'],2176,46,7,https://facebook.com/vebay69/posts/62445223456...,,[{'link': '/story.php?story_fbid=pfbid02RJqDKF...,772954189491441,https://facebook.com/vebay69?_ft_=encrypted_tr...,,,,,,,[],https://www.facebook.com/vebay69/posts/6244522...,"{'like': 2176, 'love': 2, 'haha': 1225, 'wow':...",3411,,6244522000000000.0,['6244522232334582'],2023-11-21 00:35:49.043062
516,6152641401522666,"Đã chơi game online còn đòi ""riêng tư"" ?\n#vut...","Đã chơi game online còn đòi ""riêng tư"" ?\n#vut...",,"Đã chơi game online còn đòi ""riêng tư"" ?\n#vut...",2023-03-06 03:07:27,1678072000.0,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['No photo description available.'],2180,37,3,https://facebook.com/vebay69/posts/61526414015...,,[{'link': '/hashtag/vutruphongthan3d?_ft_=encr...,772954189491441,https://facebook.com/vebay69?_ft_=encrypted_tr...,,,,,,,[],https://www.facebook.com/vebay69/posts/6152641...,"{'like': 2180, 'love': 213, 'haha': 10, 'wow':...",2405,,6152641000000000.0,['6152640874856052'],2023-11-21 03:50:21.956424


In [17]:
# Drop the 'original_text' and 'text' columns.
posts_df = posts_df.drop(columns=['original_text', 'text'])

In [18]:
# Re-check the schema after removing the two text columns.
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 914 entries, 0 to 913
Data columns (total 33 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   post_id                        914 non-null    int64  
 1   post_text                      914 non-null    object 
 2   shared_text                    24 non-null     object 
 3   time                           914 non-null    object 
 4   timestamp                      901 non-null    float64
 5   image                          884 non-null    object 
 6   image_lowquality               914 non-null    object 
 7   images                         913 non-null    object 
 8   images_description             913 non-null    object 
 9   images_lowquality              914 non-null    object 
 10  images_lowquality_description  914 non-null    object 
 11  likes                          914 non-null    int64  
 12  comments                       914 non-null    int

### Tách cột "reactions" thành 7 cột tương ứng với số lượng like, love, haha, wow, care, sad, angry

In [19]:
# Look at one raw 'reactions' cell (row label 0) to see how the value is encoded.
posts_df.loc[0, 'reactions']

"{'like': 783, 'love': 6, 'haha': 396, 'wow': 3, 'care': 3, 'sad': 2}"

### Thay thế các giá trị NaN trong cột reactions bằng 1 chuỗi JSON rỗng

In [20]:
# Rows whose 'reactions' value is missing.
posts_df[posts_df['reactions'].isna()]

Unnamed: 0,post_id,post_text,shared_text,time,timestamp,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,post_url,link,links,user_id,user_url,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reactions,reaction_count,with,image_id,image_ids,fetched_time
911,686124743622733,"Đừng như “Boi phố"" nhé 🤔✋",,2023-11-21 12:00:05,1700568000.0,,https://scontent-sin6-2.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-2.xx.fbcdn.net/v/t39.3...,"[""May be an image of 1 person, scooter, motorc...",3100,31,5,https://facebook.com/vebay69/posts/68612474362...,,[{'link': '/story.php?story_fbid=6861247436227...,100066756416622,https://facebook.com/vebay69?lst=1000101975890...,,,,,,[],,,,3100,,686124000000000.0,['686124020289472'],
912,686099176958623,Lạnh lắm\n#panda,,2023-11-21 03:07:38,1700536000.0,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,"[""May be pop art of one or more people and tex...",734,14,18,https://facebook.com/vebay69/posts/68609917695...,,[{'link': '/hashtag/panda?refid=17&_ft_=encryp...,100066756416622,https://facebook.com/vebay69?lst=1000101975890...,,,,,,[],,,,734,,686099100000000.0,['686099103625297'],
913,685769336991607,Mẹ chủ tịch đi bán vỉa hè 🥲\n#Hoho,,2023-11-20 11:59:18,1700482000.0,,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be a doodle of text that says 'Bà cụ nom...",9500,147,68,https://facebook.com/vebay69/posts/68576933699...,,[{'link': '/hashtag/hoho?refid=17&_ft_=encrypt...,100066756416622,https://facebook.com/vebay69?lst=1000101975890...,,,,,,[],,,,9500,,685769000000000.0,['685769000324974'],


In [21]:
# Replace missing 'reactions' with the text of an empty dict so every cell can be parsed.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# Series returned by posts_df['reactions'] is a chained in-place update, which
# pandas deprecates and which does not modify posts_df under Copy-on-Write.
posts_df['reactions'] = posts_df['reactions'].fillna('{}')

In [22]:
# Parse each stringified dict with ast.literal_eval (literals only, no code execution),
# then build the frame with a single constructor call over the list of dicts rather
# than .apply(pd.Series), which creates one Series object per row.
# Reaction types absent from a post become NaN; the index is carried over so the
# result stays row-aligned with posts_df.
parsed_reactions = posts_df['reactions'].apply(ast.literal_eval)
reactions_df = pd.DataFrame(parsed_reactions.tolist(), index=posts_df.index)
reactions_df

Unnamed: 0,like,love,haha,wow,care,sad,angry
0,783.0,6.0,396.0,3.0,3.0,2.0,
1,3421.0,,632.0,,,1.0,
2,241.0,6.0,196.0,6.0,3.0,3.0,
3,2884.0,1.0,571.0,2.0,2.0,,
4,1743.0,2.0,180.0,1.0,1.0,13.0,
...,...,...,...,...,...,...,...
909,1735.0,5.0,1146.0,1.0,1.0,33.0,
910,248.0,4.0,99.0,,2.0,,
911,,,,,,,
912,,,,,,,


### Thay thế các giá trị NaN thành giá trị 0

In [23]:
# A reaction type that a post did not receive is recorded as 0 instead of NaN.
reactions_df = reactions_df.fillna(0)
reactions_df

Unnamed: 0,like,love,haha,wow,care,sad,angry
0,783.0,6.0,396.0,3.0,3.0,2.0,0.0
1,3421.0,0.0,632.0,0.0,0.0,1.0,0.0
2,241.0,6.0,196.0,6.0,3.0,3.0,0.0
3,2884.0,1.0,571.0,2.0,2.0,0.0,0.0
4,1743.0,2.0,180.0,1.0,1.0,13.0,0.0
...,...,...,...,...,...,...,...
909,1735.0,5.0,1146.0,1.0,1.0,33.0,0.0
910,248.0,4.0,99.0,0.0,2.0,0.0,0.0
911,0.0,0.0,0.0,0.0,0.0,0.0,0.0
912,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Convert từ kiểu float sang int

In [24]:
# The NaNs were replaced by 0 in the previous step, so the float columns can be cast to integers.
reactions_df = reactions_df.astype(int)
reactions_df

Unnamed: 0,like,love,haha,wow,care,sad,angry
0,783,6,396,3,3,2,0
1,3421,0,632,0,0,1,0
2,241,6,196,6,3,3,0
3,2884,1,571,2,2,0,0
4,1743,2,180,1,1,13,0
...,...,...,...,...,...,...,...
909,1735,5,1146,1,1,33,0
910,248,4,99,0,2,0,0
911,0,0,0,0,0,0,0
912,0,0,0,0,0,0,0


## Thêm DataFrame reactions_df vào bảng dữ liệu

In [25]:
# Append the per-reaction count columns side by side; rows are matched on the index.
posts_df = pd.concat([posts_df, reactions_df], axis=1)
posts_df.head()

Unnamed: 0,post_id,post_text,shared_text,time,timestamp,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,post_url,link,links,user_id,user_url,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reactions,reaction_count,with,image_id,image_ids,fetched_time,like,love,haha,wow,care,sad,angry
0,685638947004646,Mùa đông chưa vậy\n#Panda,,2023-11-20 05:28:02,1700458000.0,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Lạnh quá ...",783,24,10,https://facebook.com/vebay69/posts/68563894700...,,[{'link': '/hashtag/panda?refid=17&_ft_=encryp...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6856389...,"{'like': 783, 'love': 6, 'haha': 396, 'wow': 3...",1193,,685638700000000.0,['685638737004667'],2023-11-20 08:04:44.252150,783,6,396,3,3,2,0
1,685577330344141,"Con chịu rồi, cái này con không học được\n#Vui...",,2023-11-20 04:00:02,1700453000.0,,https://scontent-sin6-2.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-2.xx.fbcdn.net/v/t39.3...,"[""May be pop art of text that says 'Học tập an...",3421,43,0,https://facebook.com/vebay69/posts/68557733034...,,[{'link': '/hashtag/vuilavietlott?refid=17&_ft...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6855773...,"{'like': 3421, 'haha': 632, 'sad': 1}",4054,,685576700000000.0,['685576747010866'],2023-11-20 08:04:46.677532,3421,0,632,0,0,1,0
2,685111467057394,Bạn tồiiiiiiiiii\n#Muonggg,,2023-11-19 08:54:12,1700384000.0,https://m.facebook.com/photo/view_full_size/?f...,https://scontent-sin6-4.xx.fbcdn.net/v/t39.308...,['https://m.facebook.com/photo/view_full_size/...,['May be a doodle of text'],['https://scontent-sin6-4.xx.fbcdn.net/v/t39.3...,['May be a doodle of text'],241,31,7,https://facebook.com/vebay69/posts/68511146705...,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6851114...,"{'like': 241, 'love': 6, 'haha': 196, 'wow': 6...",455,,685111300000000.0,['685111340390740'],2023-11-20 08:04:52.194919,241,6,196,6,3,3,0
3,684686633766544,Trí khôn của ta đây 😀\n#tayduvng #VNGGames,,2023-11-18 13:00:10,1700312000.0,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",2884,70,8,https://facebook.com/vebay69/posts/68468663376...,,[{'link': '/hashtag/tayduvng?refid=17&_ft_=enc...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6846866...,"{'like': 2884, 'love': 1, 'haha': 571, 'wow': ...",3460,,684686600000000.0,['684686610433213'],2023-11-20 08:04:59.901802,2884,1,571,2,2,0,0
4,684603707108170,Team vô sản ✌️\n#Muonggg,,2023-11-18 08:34:17,1700296000.0,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['May be an image of text'],1743,32,17,https://facebook.com/vebay69/posts/68460370710...,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,https://facebook.com/vebay69?lst=1000924224764...,,,,,,,[],https://www.facebook.com/vebay69/posts/6846037...,"{'like': 1743, 'love': 2, 'haha': 180, 'wow': ...",1940,,684603500000000.0,['684603543774853'],2023-11-20 08:05:05.401087,1743,2,180,1,1,13,0


## Kiểm tra tính đúng trong quá trình tách

In [26]:
# Test correct
print(np.all(posts_df['likes'] == posts_df['like']))
print(np.all(posts_df['reaction_count'] == posts_df['haha'] + posts_df['like'] + posts_df['love'] + posts_df['sad'] + posts_df['wow'] + posts_df['angry'] + posts_df['care']))

False
False


## Xóa các cột khác không cần thiết

In [27]:
# The raw 'reactions' text has been expanded into separate columns; drop it together
# with the 'user_url', 'post_url' and 'timestamp' columns.
posts_df = posts_df.drop(columns=['reactions', 'user_url', 'post_url', 'timestamp'])

In [28]:
# Preview the cleaned frame before saving it.
posts_df.head()

Unnamed: 0,post_id,post_text,shared_text,time,image,image_lowquality,images,images_description,images_lowquality,images_lowquality_description,likes,comments,shares,link,links,user_id,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,comments_full,reactors,w3_fb_url,reaction_count,with,image_id,image_ids,fetched_time,like,love,haha,wow,care,sad,angry
0,685638947004646,Mùa đông chưa vậy\n#Panda,,2023-11-20 05:28:02,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Lạnh quá ...",783,24,10,,[{'link': '/hashtag/panda?refid=17&_ft_=encryp...,100066756416622,,,,,,,[],https://www.facebook.com/vebay69/posts/6856389...,1193,,685638700000000.0,['685638737004667'],2023-11-20 08:04:44.252150,783,6,396,3,3,2,0
1,685577330344141,"Con chịu rồi, cái này con không học được\n#Vui...",,2023-11-20 04:00:02,,https://scontent-sin6-2.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-2.xx.fbcdn.net/v/t39.3...,"[""May be pop art of text that says 'Học tập an...",3421,43,0,,[{'link': '/hashtag/vuilavietlott?refid=17&_ft...,100066756416622,,,,,,,[],https://www.facebook.com/vebay69/posts/6855773...,4054,,685576700000000.0,['685576747010866'],2023-11-20 08:04:46.677532,3421,0,632,0,0,1,0
2,685111467057394,Bạn tồiiiiiiiiii\n#Muonggg,,2023-11-19 08:54:12,https://m.facebook.com/photo/view_full_size/?f...,https://scontent-sin6-4.xx.fbcdn.net/v/t39.308...,['https://m.facebook.com/photo/view_full_size/...,['May be a doodle of text'],['https://scontent-sin6-4.xx.fbcdn.net/v/t39.3...,['May be a doodle of text'],241,31,7,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,,,,,,,[],https://www.facebook.com/vebay69/posts/6851114...,455,,685111300000000.0,['685111340390740'],2023-11-20 08:04:52.194919,241,6,196,6,3,3,0
3,684686633766544,Trí khôn của ta đây 😀\n#tayduvng #VNGGames,,2023-11-18 13:00:10,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,https://scontent-sin6-3.xx.fbcdn.net/v/t39.308...,['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",['https://scontent-sin6-3.xx.fbcdn.net/v/t39.3...,"[""May be an image of text that says 'Làm gì đấ...",2884,70,8,,[{'link': '/hashtag/tayduvng?refid=17&_ft_=enc...,100066756416622,,,,,,,[],https://www.facebook.com/vebay69/posts/6846866...,3460,,684686600000000.0,['684686610433213'],2023-11-20 08:04:59.901802,2884,1,571,2,2,0,0
4,684603707108170,Team vô sản ✌️\n#Muonggg,,2023-11-18 08:34:17,,https://scontent-sin6-1.xx.fbcdn.net/v/t39.308...,[],[],['https://scontent-sin6-1.xx.fbcdn.net/v/t39.3...,['May be an image of text'],1743,32,17,,[{'link': '/hashtag/muonggg?refid=17&_ft_=encr...,100066756416622,,,,,,,[],https://www.facebook.com/vebay69/posts/6846037...,1940,,684603500000000.0,['684603543774853'],2023-11-20 08:05:05.401087,1743,2,180,1,1,13,0


### Lưu dữ liệu đã làm sạch

In [29]:
# Write the cleaned posts and the bare list of post ids as CSV files.
# to_csv does not create missing directories, so make sure the target folder exists.
clean_dir = os.path.join('data', 'vebay69', 'clean')
os.makedirs(clean_dir, exist_ok=True)
posts_df.to_csv(os.path.join(clean_dir, 'posts_full.csv'), index=False)
posts_df['post_id'].to_csv(os.path.join(clean_dir, 'post_ids.csv'), index=False)