In [1]:
import pandas as pd

In [2]:
# Collecting all the invalid post IDs to one text file

def collect_unique_post_ids(input_files, output_file):
    """Merge the post IDs found in several text files into one de-duplicated file.

    Each input file is read line by line; surrounding whitespace is stripped
    and blank lines are ignored. The distinct IDs are written to
    ``output_file`` one per line, in sorted order.

    Args:
        input_files: Iterable of paths to text files holding one post ID per line.
        output_file: Path of the text file to (over)write with the merged IDs.

    Raises:
        OSError: If an input file cannot be opened or the output cannot be written.
    """
    unique_post_ids = set()

    for file_name in input_files:
        with open(file_name, 'r') as file:
            stripped_lines = (line.strip() for line in file)
            # Empty strings (blank lines) are falsy and therefore skipped
            unique_post_ids.update(post_id for post_id in stripped_lines if post_id)

    # A set of strings has no stable iteration order across interpreter runs,
    # so sort before writing to make the output file reproducible.
    with open(output_file, 'w') as output:
        output.writelines(post_id + '\n' for post_id in sorted(unique_post_ids))

if __name__ == "__main__":
    # Text files whose post IDs are merged into a single list
    input_files = [
        "invalid_posts - Kethaka.txt",
        "invalid_posts - Kethaka3.txt",
        "invalid_posts - Sam.txt",
        "invalid_posts - Sam3.txt",
        "invalid_posts - Sama.txt",
        "invalid_posts - Shikara.txt",
        "invalid_posts - Singhe.txt",
        "invalid_posts - Singhe2.txt",
    ]
    # Destination of the merged, de-duplicated IDs
    output_file = "all_invalid_post_ids.txt"

    collect_unique_post_ids(input_files, output_file)

In [3]:
# Print the number of invalid Post IDs

def count_post_ids(output_file):
    """Return the number of lines stored in the text file ``output_file``.

    The whole file is read and split with ``str.splitlines``, so every line,
    including a blank one, counts as one entry.

    Args:
        output_file: Path of the text file to inspect.

    Returns:
        The line count as an ``int`` (0 for an empty file).
    """
    with open(output_file, 'r') as handle:
        contents = handle.read()
    return len(contents.splitlines())

if __name__ == "__main__":
    # File whose lines are counted
    output_file = "all_invalid_post_ids.txt"
    num_post_ids = count_post_ids(output_file)

    # Report the count
    print(f"Number of post IDs in {output_file}: {num_post_ids}")

Number of post IDs in all_invalid_post_ids.txt: 10216


In [4]:
# Load the source spreadsheet into a DataFrame; rows with invalid post IDs
# are still present at this point and are filtered out in later cells.
df = pd.read_excel('withInvalidPostIDs1.xlsx')
# Bare expression as the cell's last statement: Jupyter renders the frame
df

Unnamed: 0,username,followers,followings,likes,COMMENTS,description,date,num_tags,location,post_id,hashtags,num_hashtags,day_of_week,mention_count,total_points,hour
0,giannis_an34,804699,107,109037,769,Everyday!,2017-04-21 20:18:34,1,,BTJ89VWBxDX,,0,5,0,0,21
1,giannis_an34,804699,107,158338,1811,While walking off the court today fans were te...,2017-04-28 07:16:31,0,,BTalBkFFbS3,,0,5,0,0,8
2,giannis_an34,804699,107,51447,342,#NationalSiblingsDay @francisadetokunbo @thana...,2017-04-11 07:08:23,2,,BSuylNHBDUq,#NationalSiblingsDay,1,2,2,5440,8
3,giannis_an34,804699,107,40114,77,❤️ #NationalSiblingsDay,2017-04-11 06:59:43,3,,BSuxlqrBSBp,#NationalSiblingsDay,1,2,0,5440,7
4,giannis_an34,804699,107,62250,102,The best brothers/Friends ever. They always ha...,2017-04-11 06:52:56,3,,BSuwz-FBb_z,#NationalSiblingsDay,1,2,3,5440,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25065,natsakos_,14954,2216,538,1,Your eyes speak the truth when everything else...,2017-04-11 21:09:52,-,-,BSwS4UqBLj0,,0,2,0,0,22
25066,ioni_91,14104,744,69,0,La bellezza che offre Firenze da ogni angolo?❤...,2017-04-11 19:24:30,-,-,BSwG0qRF0je,"#paris,#france,#türkiye,#istanbul,#germany,#ru...",49,2,0,332714,20
25067,iliada_t,13352,303,193,0,And forget not that the earth delights to feel...,2017-04-08 19:05:17,-,-,BSoWPN6DJ87,,0,6,0,0,20
25068,iliada_t,13352,303,156,3,??????,2017-04-09 19:23:30,-,-,BSq9Hk4jR-I,,0,7,0,0,20


In [5]:
# Drop rows that repeat a post_id; pandas keeps the first occurrence by default

# `df` is rebound to the de-duplicated frame (original row labels are kept)
df = df.drop_duplicates(subset='post_id')
df

Unnamed: 0,username,followers,followings,likes,COMMENTS,description,date,num_tags,location,post_id,hashtags,num_hashtags,day_of_week,mention_count,total_points,hour
0,giannis_an34,804699,107,109037,769,Everyday!,2017-04-21 20:18:34,1,,BTJ89VWBxDX,,0,5,0,0,21
1,giannis_an34,804699,107,158338,1811,While walking off the court today fans were te...,2017-04-28 07:16:31,0,,BTalBkFFbS3,,0,5,0,0,8
2,giannis_an34,804699,107,51447,342,#NationalSiblingsDay @francisadetokunbo @thana...,2017-04-11 07:08:23,2,,BSuylNHBDUq,#NationalSiblingsDay,1,2,2,5440,8
3,giannis_an34,804699,107,40114,77,❤️ #NationalSiblingsDay,2017-04-11 06:59:43,3,,BSuxlqrBSBp,#NationalSiblingsDay,1,2,0,5440,7
4,giannis_an34,804699,107,62250,102,The best brothers/Friends ever. They always ha...,2017-04-11 06:52:56,3,,BSuwz-FBb_z,#NationalSiblingsDay,1,2,3,5440,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25065,natsakos_,14954,2216,538,1,Your eyes speak the truth when everything else...,2017-04-11 21:09:52,-,-,BSwS4UqBLj0,,0,2,0,0,22
25066,ioni_91,14104,744,69,0,La bellezza che offre Firenze da ogni angolo?❤...,2017-04-11 19:24:30,-,-,BSwG0qRF0je,"#paris,#france,#türkiye,#istanbul,#germany,#ru...",49,2,0,332714,20
25067,iliada_t,13352,303,193,0,And forget not that the earth delights to feel...,2017-04-08 19:05:17,-,-,BSoWPN6DJ87,,0,6,0,0,20
25068,iliada_t,13352,303,156,3,??????,2017-04-09 19:23:30,-,-,BSq9Hk4jR-I,,0,7,0,0,20


In [6]:
# Remove the datapoints with invalid post IDs

def remove_rows_by_post_ids(df, post_ids_file):
    """Return the rows of ``df`` whose ``post_id`` is not listed in a text file.

    Args:
        df: DataFrame with a ``post_id`` column.
        post_ids_file: Path of a text file holding one post ID per line.

    Returns:
        A DataFrame with the matching rows filtered out. The input frame is
        not modified and the surviving rows keep their index labels.
    """
    with open(post_ids_file, 'r') as handle:
        excluded_ids = handle.read().splitlines()

    # True for every row that should stay
    keep_mask = ~df['post_id'].isin(excluded_ids)
    return df[keep_mask]

if __name__ == "__main__":
    # NOTE(review): `file_path` is not used in this cell; kept in case a later
    # cell relies on it.
    file_path = "withInvalidPostIDs1.xlsx"
    post_ids_file = "all_invalid_post_ids.txt"

    # Filter the frame loaded earlier; `df` is rebound to the filtered result.
    # No bare `df` here: nested inside this `if` block it would not be rendered
    # by Jupyter (only a cell's final top-level expression is displayed), so
    # the frame is shown by the following cell instead.
    df = remove_rows_by_post_ids(df, post_ids_file)

In [7]:
# Display the frame after the rows with invalid post IDs were removed
df

Unnamed: 0,username,followers,followings,likes,COMMENTS,description,date,num_tags,location,post_id,hashtags,num_hashtags,day_of_week,mention_count,total_points,hour
1,giannis_an34,804699,107,158338,1811,While walking off the court today fans were te...,2017-04-28 07:16:31,0,,BTalBkFFbS3,,0,5,0,0,8
2,giannis_an34,804699,107,51447,342,#NationalSiblingsDay @francisadetokunbo @thana...,2017-04-11 07:08:23,2,,BSuylNHBDUq,#NationalSiblingsDay,1,2,2,5440,8
4,giannis_an34,804699,107,62250,102,The best brothers/Friends ever. They always ha...,2017-04-11 06:52:56,3,,BSuwz-FBb_z,#NationalSiblingsDay,1,2,3,5440,7
5,elenimenegaki,636249,78,49729,670,#eleni #elenimenegaki #blackandwhite,2017-04-13 09:28:30,0,,BS0MNFuBU0a,"#eleni,#elenimenegaki,#blackandwhite",3,4,0,9914,10
6,elenimenegaki,636249,78,46541,538,#eleni #elenimenegaki Φόρεμα @serendipitygr #s...,2017-04-18 14:33:20,1,,BTBnELSBPu4,"#eleni,#elenimenegaki,#styling",3,2,2,7866,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25062,giannis_anestis,15484,321,3114,21,2️⃣2️⃣,2017-04-09 22:42:57,-,-,BSrT8Wqj1ux,,0,7,0,0,23
25066,ioni_91,14104,744,69,0,La bellezza che offre Firenze da ogni angolo?❤...,2017-04-11 19:24:30,-,-,BSwG0qRF0je,"#paris,#france,#türkiye,#istanbul,#germany,#ru...",49,2,0,332714,20
25067,iliada_t,13352,303,193,0,And forget not that the earth delights to feel...,2017-04-08 19:05:17,-,-,BSoWPN6DJ87,,0,6,0,0,20
25068,iliada_t,13352,303,156,3,??????,2017-04-09 19:23:30,-,-,BSq9Hk4jR-I,,0,7,0,0,20


In [9]:
# Save the filtered frame to Excel; index=False leaves the row index out of the file
df.to_excel('withoutInvalidPostIDs.xlsx', index=False)

In [10]:
import os

# Directory that holds the .jpg files
folder_path = 'F:/Needed stuff/Computer Science/Research stuff/Kethakas posts/All in all!'

# Directory entries whose name ends in '.jpg' (compared case-insensitively)
jpg_files = [entry for entry in os.listdir(folder_path) if entry.lower().endswith('.jpg')]

# The same names with the extension removed
file_names_without_extension = [os.path.splitext(entry)[0] for entry in jpg_files]

# Text file that receives the names
output_file = 'all_jpg_files.txt'

# Write one name per line
with open(output_file, 'w') as file:
    file.writelines(name + '\n' for name in file_names_without_extension)

In [11]:
# Load every line of 'all_jpg_files.txt' into a list
with open('all_jpg_files.txt', 'r') as file:
    lines = file.readlines()

# Strip the trailing newline (and any surrounding whitespace) from each entry
post_ids = list(map(str.strip, lines))

# Distinct postIDs and how many there are
unique_post_ids = set(post_ids)
num_unique_post_ids = len(unique_post_ids)

# Report the count
print("Number of unique postIDs:", num_unique_post_ids)

Number of unique postIDs: 15571


In [12]:
# Read the content of 'all_jpg_files.txt' and store the lines in a list
with open('all_jpg_files.txt', 'r') as file:
    lines = file.readlines()

# Extract postIDs from the lines (strip the trailing newline / whitespace)
post_ids_to_remove = [line.strip() for line in lines]

# If the 'post_id' column holds integers, convert the IDs so that `isin`
# compares values of the same type; otherwise keep them as strings.
# `is_integer_dtype` recognises every integer width as well as pandas'
# nullable integer dtypes, whereas `dtype == int` only matches the single
# NumPy dtype that `int` maps to on the current platform.
if pd.api.types.is_integer_dtype(df['post_id']):
    post_ids_to_remove = [int(post_id) for post_id in post_ids_to_remove]

# Keep only the rows whose 'post_id' is NOT listed in 'all_jpg_files.txt'
filtered_df = df[~df['post_id'].isin(post_ids_to_remove)]

# Display the filtered DataFrame
filtered_df

Unnamed: 0,username,followers,followings,likes,COMMENTS,description,date,num_tags,location,post_id,hashtags,num_hashtags,day_of_week,mention_count,total_points,hour
5598,sport24,49785,177,2975,96,Ολυμπιακός και Εφές Αναντολού θα διεκδικήσουν ...,2017-04-30 00:28:29,0,,BTe_60mh6_3,"#Devotion,#EuroLeague,#Olympiacos,#Spanoulis,#...",5,7,0,9197,1
7524,portretexclusiveboutique,40814,1750,412,2,??,2017-04-19 22:05:25,0,,BTE_mMkDlut,,0,3,0,0,23
10952,ancientmemes,31481,85,4386,17,#life_greece #greecetravelgr1_ #kings_greece #...,2017-04-30 00:23:00,-,-,BTe_Sp3hXF9,"#life_greece,#greecetravelgr1_,#kings_greece,#...",19,7,0,6808,1
23642,ckourampis,12838,702,564,0,#Caldera view 3:2,2017-04-21 21:47:28,-,-,BTKHIfZlE_t,#Caldera,1,5,0,0,22
23643,ckourampis,12838,702,359,0,Wo @alice_gao ・・・summer memories,2017-04-22 21:43:02,-,-,BTMra22lE1D,,0,6,1,0,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23969,dina.nikolaou,12625,605,308,4,#oeufsdepaques #oeufs #lovemonvillage #dina_ni...,2017-04-15 21:59:29,-,-,BS6rvP4lxyi,"#oeufsdepaques,#oeufs,#lovemonvillage,#dina_ni...",10,6,0,19838,22
23970,dina.nikolaou,12625,605,406,4,#paques #prentemps2017 #ανοιξη2017 #πασχα2017 ...,2017-04-15 21:49:36,-,-,BS6qm2SlAJt,"#paques,#prentemps2017,#ανοιξη2017,#πασχα2017,...",10,6,0,15838,22
23971,dina.nikolaou,12625,605,280,0,#paques #cuisine_grecque #jukeros #chutneylove...,2017-04-16 16:56:43,-,-,BS8t4p0Fln_,"#paques,#cuisine_grecque,#jukeros,#chutneylove...",9,7,2,5919,17
24321,interiorlifestylebyme,12323,681,325,35,God kväll finingar ? Hoppas ni har haft en här...,2017-04-19 22:01:36,-,-,BTE_KOJhqtt,,0,3,0,0,23


In [13]:
# Save the rows whose post_id is absent from 'all_jpg_files.txt'; index=False omits the row index
filtered_df.to_excel('toDownloadjsons2.xlsx', index=False)