In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import multiprocessing
from multiprocessing import Pool
import string
#defining the function to remove punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
import ipyplot

import os
os.environ["CUDA_VISIBLE_DEVICES"]='2'   # specify which GPU(s) to be used

In [4]:
def remove_whitespace(text):
    return text.strip()

def lowering(text):
    return text.lower()

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_punctuation(text):
    return "".join([i for i in text if i not in string.punctuation])

def remove_stopword(text):
    tokens = word_tokenize(text)
    english_stopwords = stopwords.words('english')
    tokens_wo_stopwords = [t for t in tokens if t not in english_stopwords]
    return " ".join(tokens_wo_stopwords)

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return " ".join(stems)
 
def text_preprocessing(text):
    text_1 = remove_whitespace(text)
    text_2 = lowering(text_1)
    text_3 = remove_numbers(text_2)
    text_4 = remove_punctuation(text_3)
    text_5 = remove_stopword(text_4)
    return text_5

# CC12M vs LAION

## Processing CC12M


In [3]:
path_dir_CC12M = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/Open_clip_training/src/data/cc12m/'):
    for filename in filenames:
        if filename.split('.')[-1] == 'parquet':
            path_dir_CC12M.append(os.path.join(dirname, filename))

In [4]:
def collect_cc12M_data(path):
    df_cc12M = pd.read_parquet(path)
    N_cc12M = len(df_cc12M)  # number of samples from one parquet of laion 
    small_cc12M = df_cc12M.sample(n=N_cc12M)

    cc12M_texts_array = small_cc12M["caption"].iloc[0:N_cc12M].values
    return cc12M_texts_array.tolist()

In [5]:
## cal feature info 
start_time = time.time()

processed_CC12M_text = []
for i in range(len(path_dir_CC12M)):
    cc12M_texts = collect_cc12M_data(path_dir_CC12M[i])
    
    cc12M_texts_re = []
    for val in cc12M_texts:
        if val != None :
            cc12M_texts_re.append(val)
    processed_CC12M_text.extend(cc12M_texts_re)
print("--- %s seconds ---" % (time.time() - start_time))

--- 156.33391046524048 seconds ---


In [6]:
print(len(processed_CC12M_text))
processed_CC12M_text

12423374


['A sign with yellow lights in Germany',
 'Man On The Moon Art Print',
 'Sea Island Wedding, Cloister Chapel, Black Banks Terrace, The Decisive Moment Photography',
 '<PERSON> Style Sculptural Draped Plaster Floor Lamps - a Pair For Sale - Image 9 of 13',
 'Gru the kitten, thrown out of a moving vehicle',
 'Poppy Flower Is One Line Art. Vector abstract contour drawing floral in a Trendy Minimalist Style.',
 'Otter playing at the Lehigh Valley Zoo',
 "National Service Of Thanksgiving To Celebrate The Queen's 90th Birthday : News Photo",
 'Fell off the G Wagon T Shirt Funny On Off the Wagon Short-Sleeve Unisex T-Shirt',
 'Ventura - the avant-garde brand of Swiss watches | News',
 'Cornfield Painting - Passing The Time by <PERSON>',
 'The open space in a Japanese garden can sometimes make people used to Western gardening philosophy feel uneasy. Once they start to understand that the space is as important of an element as any other they can see the value of emptiness. Garden Oasis, Garden 

In [7]:
%%time
print("Number of cpu : ", multiprocessing.cpu_count())

p = Pool(processes=16)
processed_CC12M = p.map(text_preprocessing, [processed_CC12M_text[i] for i in range(len(processed_CC12M_text))])
p.close()

print(f" the length of text processed data is ============================ {len(processed_CC12M)}")
print(processed_CC12M)

Number of cpu :  64


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOStream.flush timed out


In [None]:
len(processed_CC12M_text)

## Processing LAION

In [8]:
path_dir_laion = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/LAION/dataset/laion/laion400m-meta/'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        path_dir_laion.append(os.path.join(dirname, filename))

In [9]:
def collect_laion_data(path):
    df_laion = pd.read_parquet(path)
    N_laion = len(df_laion)  # number of samples from one parquet of laion 
    small_laion = df_laion.sample(n=N_laion)

    laion_texts_array = small_laion["TEXT"].iloc[0:N_laion].values
    return laion_texts_array.tolist()

In [10]:
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())

Number of cpu :  64


In [None]:
## cal feature info 
start_time = time.time()

common_lst = []
for i in range(len(path_dir_laion)):
    laion_texts = collect_laion_data(path_dir_laion[i])
    
    laion_text_re = []
    for val in laion_texts:
        if val != None :
            laion_text_re.append(val)

    p = Pool(processes=16)
    laion_text_processed = p.map(text_preprocessing, [laion_text_re[i] for i in range(len(laion_text_re))])
    p.close()    
    common = set(laion_text_processed).intersection(set(processed_CC12M)) 
    print(f"length of common set {len(common)}")
    print("--- %s seconds ---" % (time.time() - start_time))
    common_lst.append(common)

In [None]:
common_lst_re = []
for common_ in common_lst:
    common_lst_re.extend(common_)

In [None]:
np.save('./CC12M_LAION_commonset.npy',common_lst_re)
np.save('./CC12M_LAION_unqiue_commonset.npy',np.unique(np.array(common_lst_re)))

# LAION vs CC3M

## Processing CC3M

In [10]:
df_cc_3M =  pd.read_csv("/home/myeongseob/clip-privacy/Open_clip_training/src/data/dataset/3M/Train_GCC-training.tsv", sep='\t')
print(f"the lenghth of data is {len(df_cc_3M)}")

N_cc_3M = len(df_cc_3M)
print(df_cc_3M.keys()) 

In [11]:
cc_3M_images_array = df_cc_3M["url"].iloc[0:N_cc_3M].values
cc_3M_texts_array = df_cc_3M["caption"].iloc[0:N_cc_3M].values
ipyplot.plot_images(cc_3M_images_array, cc_3M_texts_array, max_images=5, img_width=200)

In [12]:
%%time
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())

#pool = multiprocessing.Pool(64)
#processes = [pool.apply_async(text_preprocessing, args=(cc_3M_texts_array[i],)) for i in range(len(cc_3M_texts_array))]
p = Pool(processes=16)
processed_cc_3M = p.map(text_preprocessing, [cc_3M_texts_array[i] for i in range(len(cc_3M_texts_array))])
p.close()

--- 64.63366556167603 seconds ---


In [13]:
len(processed_cc_3M)

3318333


['the raincoat worn while kissing the heck out of actor .',
 'the concession centred this is a contemporary construction of astyle hotel atop the original building .',
 'talk show host is seen on a motorcycle outside building',
 'female hands painted easter eggs in a yellow color with a brush , top view',
 'little boy playing in the autumn park , shaking the tree and leaves are falling on him',
 'black sand beaches that stretch for miles upon miles with winds so wild they will literally rip your doors off .',
 'a musician with his saxophone',
 'vector illustration of an abstract complex structure consisting of geometric shapes on a white background .',
 'person paint up with white clay for fun',
 'pets a white pony at the zoo on her 80th birthday',
 'close - up of puppy wearing a wig , in front of white background',
 'marching through the streets protesting the capitalist system under which we live .',
 'red - haired girl with red lips with a black hat with long hair .',
 'view with de

## Processing LAION

In [None]:
def collect_laion_data(path):
    df_laion = pd.read_parquet(path)
    N_laion = len(df_laion)  # number of samples from one parquet of laion 
    small_laion = df_laion.sample(n=N_laion)

    laion_texts_array = small_laion["TEXT"].iloc[0:N_laion].values
    return laion_texts_array.tolist()

In [None]:
## cal feature info 
import time
start_time = time.time()

common_lst = []
for i in range(len(path_dir)):
    laion_texts = collect_laion_data(path_dir[i])
    
    laion_text_re = []
    for val in laion_texts:
        if val != None :
            laion_text_re.append(val)

    p = Pool(processes=16)
    laion_text_processed = p.map(text_preprocessing, [laion_text_re[i] for i in range(len(laion_text_re))])
    p.close()    
    common = set(laion_text_processed).intersection(set(processed_cc_3M)) 
    print(f"length of common set {len(common)}")
    print("--- %s seconds ---" % (time.time() - start_time))
    common_lst.append(common)

In [14]:
common_lst_re = []
for common_ in common_lst:
    common_lst_re.extend(common_)

Number of cpu :  64
CPU times: user 2.56 s, sys: 1.59 s, total: 4.15 s
Wall time: 2min 23s


In [15]:
np.save('./CC3M_LAION_commonset.npy',common_lst_re)
np.save('./CC3M_LAION_unqiue_commonset.npy',np.unique(np.array(common_lst_re)))

['raincoat worn kissing heck actor',
 'concession centred contemporary construction astyle hotel atop original building',
 'talk show host seen motorcycle outside building',
 'female hands painted easter eggs yellow color brush top view',
 'little boy playing autumn park shaking tree leaves falling',
 'black sand beaches stretch miles upon miles winds wild literally rip doors',
 'musician saxophone',
 'vector illustration abstract complex structure consisting geometric shapes white background',
 'person paint white clay fun',
 'pets white pony zoo th birthday',
 'close puppy wearing wig front white background',
 'marching streets protesting capitalist system live',
 'red haired girl red lips black hat long hair',
 'view decorations night',
 'actor also seen event',
 'vector silhouette family white background',
 'soccer player action football match football team',
 'ripe onion white background',
 'loved streets person model opted casual look dressed black hooded top pair short white sho

# LAION vs MSCOCO

## Processing MSCOCO

In [2]:
path_dir = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/Open_clip_training/src/data/mscoco/mscoco/'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        if filename.split('.')[-1] == 'parquet':
            path_dir.append(os.path.join(dirname, filename))

In [3]:
def collect_mscoco_data(path):
    df_mscoco = pd.read_parquet(path)
    N_mscoco = len(df_mscoco)  # number of samples from one parquet of laion 
    small_mscoco = df_mscoco.sample(n=N_mscoco)

    mscoco_texts_array = small_mscoco["caption"].iloc[0:N_mscoco].values
    return mscoco_texts_array.tolist()

In [4]:
## cal feature info 
import time
start_time = time.time()

processed_MSCOCO_text = []
for i in range(len(path_dir)):
    mscoco_texts = collect_mscoco_data(path_dir[i])
    
    mscoco_texts_re = []
    for val in mscoco_texts:
        if val != None :
            mscoco_texts_re.append(val)
    processed_MSCOCO_text.extend(mscoco_texts_re)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1.7311735153198242 seconds ---


In [8]:
print(len(processed_MSCOCO_text))
processed_MSCOCO_text

591753


['Three young men playing a game of frisbee.',
 'A kitchen area with a sink, refrigerator and stove.',
 'A bear resting on top of some rocks at a zoo',
 'A couple of dogs fighting over a red frisbee.',
 'A bathroom sink with soap on it and a urinal beside it.',
 'A sunset lit sky with traffic with street light.',
 'Well kept kitchen with marble counter tops and stainless steel fridge',
 'A zebra standing out alone in the grassy field',
 "A baseball player stands in the batter's box awaiting the pitch.",
 'Group of three horses sitting in the middle of the grass.',
 'there is a old truck parked in the grass',
 'A person snowboarding down a snowy hill. ',
 'A group of military officers cutting a cake',
 'A man on a skateboard that is on the sidewalk.',
 'Two goats or sheep are crossing the road.',
 'A man in a beanie sitting with a red and white snowboard.',
 'White bathroom sink with tile countertop and wood cabinet.',
 'A brown tiled bathroom with a large bath and shower.',
 'a clock, 

In [9]:
%%time
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())

#pool = multiprocessing.Pool(64)
#processes = [pool.apply_async(text_preprocessing, args=(cc_3M_texts_array[i],)) for i in range(len(cc_3M_texts_array))]
p = Pool(processes=16)
processed_MSCOCO = p.map(text_preprocessing, [processed_MSCOCO_text[i] for i in range(len(processed_MSCOCO_text))])
p.close()

Number of cpu :  256
CPU times: user 257 ms, sys: 184 ms, total: 441 ms
Wall time: 36 s


In [12]:
print(len(processed_MSCOCO))
processed_MSCOCO

591753


['three young men playing game frisbee',
 'kitchen area sink refrigerator stove',
 'bear resting top rocks zoo',
 'couple dogs fighting red frisbee',
 'bathroom sink soap urinal beside',
 'sunset lit sky traffic street light',
 'well kept kitchen marble counter tops stainless steel fridge',
 'zebra standing alone grassy field',
 'baseball player stands batters box awaiting pitch',
 'group three horses sitting middle grass',
 'old truck parked grass',
 'person snowboarding snowy hill',
 'group military officers cutting cake',
 'man skateboard sidewalk',
 'two goats sheep crossing road',
 'man beanie sitting red white snowboard',
 'white bathroom sink tile countertop wood cabinet',
 'brown tiled bathroom large bath shower',
 'clock fan desk chair room',
 'egg vegetables sits top plate',
 'metal dark green park bench base incline',
 'man home plate swinging bat',
 'plate food looks delicious broccoli onions potatoes',
 'young man leaving water surf board',
 'group friends sitting together

## Processing LAION


In [13]:
path_dir_laion = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/LAION/dataset/laion/laion400m-meta/'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        path_dir_laion.append(os.path.join(dirname, filename))

In [14]:
def collect_laion_data(path):
    df_laion = pd.read_parquet(path)
    N_laion = len(df_laion)  # number of samples from one parquet of laion 
    small_laion = df_laion.sample(n=N_laion)

    laion_texts_array = small_laion["TEXT"].iloc[0:N_laion].values
    return laion_texts_array.tolist()

In [None]:
## cal feature info 
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())
import time
start_time = time.time()

common_lst = []
for i in range(len(path_dir_laion)):
    laion_texts = collect_laion_data(path_dir_laion[i])
    
    laion_text_re = []
    for val in laion_texts:
        if val != None :
            laion_text_re.append(val)

    p = Pool(processes=16)
    laion_text_processed = p.map(text_preprocessing, [laion_text_re[i] for i in range(len(laion_text_re))])
    p.close()    
    common = set(laion_text_processed).intersection(set(processed_MSCOCO)) 
    print(f"length of common set {len(common)}")
    print("--- %s seconds ---" % (time.time() - start_time))
    common_lst.append(common)

Number of cpu :  256
length of common set 242
--- 845.5874762535095 seconds ---
length of common set 248
--- 1715.2872474193573 seconds ---
length of common set 220
--- 2575.1364512443542 seconds ---
length of common set 253
--- 3455.853912830353 seconds ---
length of common set 261
--- 4331.457944869995 seconds ---
length of common set 235
--- 5181.130166769028 seconds ---
length of common set 251
--- 6033.268998146057 seconds ---
length of common set 232
--- 7020.787367343903 seconds ---
length of common set 247
--- 8223.802123308182 seconds ---
length of common set 253
--- 9301.142758131027 seconds ---
length of common set 253
--- 10168.278317451477 seconds ---
length of common set 239
--- 11032.144304990768 seconds ---
length of common set 230
--- 11901.713087320328 seconds ---
length of common set 253
--- 12756.369405031204 seconds ---
length of common set 254
--- 13542.899965524673 seconds ---
length of common set 235
--- 14354.327021598816 seconds ---
length of common set 265
--

In [None]:
common_lst_re = []
for common_ in common_lst:
    common_lst_re.extend(common_)

In [None]:
np.save('./MSCOCO_LAION_commonset.npy',common_lst_re)
np.save('./MSCOCO_LAION_unqiue_commonset.npy',np.unique(np.array(common_lst_re)))

# LAION vs SBU Captions

## Processing SBU Captions

In [26]:
path_dir = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/Open_clip_training/src/data/sbucaptions/'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        if filename.split('.')[-1] == 'parquet':
            path_dir.append(os.path.join(dirname, filename))

In [28]:
def collect_sbu_data(path):
    df_sbu = pd.read_parquet(path)
    N_sub = len(df_sbu)  # number of samples from one parquet of laion 
    small_sbu = df_sbu.sample(n=N_sub)

    sbu_texts_array = small_sbu["caption"].iloc[0:N_sub].values
    return sbu_texts_array.tolist()

In [None]:
## cal feature info 
import time
start_time = time.time()

processed_SBU_text = []
for i in range(len(path_dir)):
    sbu_texts = collect_sbu_data(path_dir[i])
    
    sbu_texts_re = []
    for val in sbu_texts:
        if val != None :
            sbu_texts_re.append(val)
    processed_SBU_text.extend(sbu_texts_re)
print("--- %s seconds ---" % (time.time() - start_time))

print(f" the length of processed data is ============================ {len(processed_SBU_text)}")

In [None]:
%%time
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())

p = Pool(processes=16)
processed_SBU = p.map(text_preprocessing, [processed_SBU_text[i] for i in range(len(processed_SBU_text))])
p.close()

print(f" the length of text processed data is ============================ {len(processed_SBU)}")

## Processing LAION

In [None]:
path_dir_laion = []
for dirname, _, filenames in os.walk('/home/myeongseob/clip-privacy/LAION/dataset/laion/laion400m-meta/'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        path_dir_laion.append(os.path.join(dirname, filename))

In [None]:
def collect_laion_data(path):
    df_laion = pd.read_parquet(path)
    N_laion = len(df_laion)  # number of samples from one parquet of laion 
    small_laion = df_laion.sample(n=N_laion)

    laion_texts_array = small_laion["TEXT"].iloc[0:N_laion].values
    return laion_texts_array.tolist()

In [None]:
## cal feature info 
import multiprocessing
from multiprocessing import Pool
print("Number of cpu : ", multiprocessing.cpu_count())
import time
start_time = time.time()

common_lst = []
for i in range(len(path_dir_laion)):
    laion_texts = collect_laion_data(path_dir_laion[i])
    
    laion_text_re = []
    for val in laion_texts:
        if val != None :
            laion_text_re.append(val)

    p = Pool(processes=16)
    laion_text_processed = p.map(text_preprocessing, [laion_text_re[i] for i in range(len(laion_text_re))])
    p.close()    
    common = set(laion_text_processed).intersection(set(processed_SBU)) 
    print(f"length of common set {len(common)}")
    print("--- %s seconds ---" % (time.time() - start_time))
    common_lst.append(common)

In [None]:
common_lst_re = []
for common_ in common_lst:
    common_lst_re.extend(common_)

In [None]:
np.save('./SBU_LAION_commonset.npy',common_lst_re)
np.save('./SBU_LAION_unqiue_commonset.npy',np.unique(np.array(common_lst_re)))