# Data creation

This notebook connects to Omero and with a plate number given will a CSV file with data from the cells that we need.

In [59]:
#Connect to Omero

import ezomero
import numpy as np

HOST = 'ome2.hpc.sussex.ac.uk'
user_name = 'rz200'
password = 'omeroreset'
port = 4064
conn = ezomero.connect(user=user_name,password=password,group='',host=HOST,port=port,secure=True)
if conn: print('Connection successful')
else: print('Unsuccessful')

Connection successful


In [60]:
#Get the image IDs from the specified plate
plate = 821 #change for the plate number you want

image_ids = ezomero.get_image_ids(conn,plate=plate)
print('In plate',plate,'we have',len(image_ids),'images')

In plate 821 we have 540 images


In [61]:
#Create the folders where to store the images/data/CSVs
import os
from datetime import datetime

timestamp_folder_name = 'omero_data_' + user_name + '_' + datetime.now().strftime("%Y%m%d%H%M%S") #get timestamp
os.mkdir(timestamp_folder_name) #create folder with timestamp as name

data_folder_name = timestamp_folder_name + '/data_' + str(plate)
os.mkdir(data_folder_name) #create folder with plate number in timestamp folder

In [62]:
#Progress bar
#Will be moved to a class at some point

import sys
#source:https://stackoverflow.com/questions/3160699/python-progress-bar
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.3+
    # I would love to add a 'time left' addition to this
    count = len(it)
    def show(j):
        x = int(size*j/count)
        print("{}[{}{}] {}/{}".format(prefix, "#"*x, "."*(size-x), j, count),
              end='\r', file=out, flush=True)
    show(0)
    for i, item in enumerate(it):
        yield item
        show(i+1)
    print("\n", flush=True, file=out)

In [63]:
#Get all the images from the plate

plate_images = []
for i in progressbar(range(len(image_ids))):
    plate_images.append(ezomero.get_image(conn,image_id=image_ids[i]))
plate_images = np.array(plate_images,dtype=object)

[############################################################] 540/540



In [70]:
#We don't want to get all our images at once as that would take up too much memory
#The divider is the value representing how many groups of data we are making from our cell images
#If you do not want to separate your data at all (big memory flex): divider=1
#The divider has to divide your data to a round number, otherwise we would miss some data
#If no error is printed, carry on

divider = 180

class Error(Exception):
    """Base class for other exceptions"""
    pass

class NotRound(Error):
    """Raised when the divider leads to a non round value"""
    pass

try:
    is_round = (len(plate_images)/divider % 1 == 0)
    if is_round is not True:
        raise NotRound
except NotRound:
    print("The divider you chose led to a non round value")
    print("You would miss out on some data this way, try a different divider")

In [73]:
#Collect cell data from the images
from ccc_functions import build_one_cell_df
import pandas as pd
from pathlib import Path

#This will be a loop that goes 100 images by 100
model_dir = "C:\\Users\\rz200\\Documents\\datadevelopment\\cell-SCT\\segmentation\\demo_train_test_data\\data_for_masks_training\\models\\demo_model"
#for i in range(round(len(plate_images)/divider)):
for i in range(0,len(plate_images),int(len(plate_images)/divider)):
    print(str(i) + '/' + str(int(len(plate_images)/divider)))
    pd_list = []
    for j in range(i,int(i+len(plate_images)/divider)):
        #print(j)
        pd_list.append(build_one_cell_df(plate_images[j], model_dir))
    #print(i)
    df_concat = pd.concat(pd_list)
    df_concat = df_concat.reset_index()
    #print(i)
    filepath = Path(data_folder_name + '\\CSV_' + str(plate) + '_' + str(i))
    filepath.parent.mkdir(parents=True, exist_ok=True)
    df_concat.to_csv(filepath)
    #print(i)

#It builds a CSV with those 50 images
#Then it stores it in the plate folder

0/3
3/3
6/3
9/3
12/3
15/3
18/3
21/3
24/3
27/3
30/3
33/3
36/3
39/3
42/3
45/3
48/3
51/3
54/3
57/3
60/3
63/3
66/3
69/3
72/3
75/3
78/3
81/3
84/3
87/3
90/3
93/3
96/3
99/3
102/3
105/3
108/3
111/3
114/3
117/3
120/3
123/3
126/3
129/3
132/3
135/3
138/3
141/3
144/3
147/3
150/3
153/3
156/3
159/3
162/3
165/3
168/3
171/3
174/3
177/3
180/3
183/3
186/3
189/3
192/3
195/3
198/3
201/3
204/3
207/3
210/3
213/3
216/3
219/3
222/3
225/3
228/3
231/3
234/3
237/3
240/3
243/3
246/3
249/3
252/3
255/3
258/3
261/3
264/3
267/3
270/3
273/3
276/3
279/3
282/3
285/3
288/3
291/3
294/3
297/3
300/3
303/3
306/3
309/3
312/3
315/3
318/3
321/3
324/3
327/3
330/3
333/3
336/3
339/3
342/3
345/3
348/3
351/3
354/3
357/3
360/3
363/3
366/3
369/3
372/3
375/3
378/3
381/3
384/3
387/3
390/3
393/3
396/3
399/3
402/3
405/3
408/3
411/3
414/3
417/3
420/3
423/3
426/3
429/3
432/3
435/3
438/3
441/3
444/3
447/3
450/3
453/3
456/3
459/3
462/3
465/3
468/3
471/3
474/3
477/3
480/3
483/3
486/3
489/3
492/3
495/3
498/3
501/3
504/3
507/3
510/3
513/3
516/3


In [74]:
#First find all the files and put them in a list

path_to_data = os.getcwd() + '\\' + timestamp_folder_name + '\\' + 'data_' + str(plate)
all_CSV_paths = []
for file in os.listdir(path_to_data):
    all_CSV_paths.append(path_to_data + '\\' + file)

all_df = []
for csv_path in all_CSV_paths:
    all_df.append(pd.read_csv(csv_path))

In [75]:
#Second concatenate them all into one dataframe

df_concat = pd.concat(all_df)
df_concat = df_concat.reset_index()

from pathlib import Path
filepath = Path(path_to_data + '\\' + 'dataframe_' + str(plate))
filepath.parent.mkdir(parents=True, exist_ok=True)
df_concat.to_csv(filepath)

In [76]:
 #Third delete all the individual CSVs, not the concatenated one
#Don't run this until we're sure
for path in all_CSV_paths:
    os.remove(path)