In [1]:
import pandas as pd

In [2]:
# Input: the raw log text file that is loaded below.
path = "output.txt"
# Output: where the extracted table is written at the end of the notebook.
path_save = "extracted.csv"

# Read TXT File

In [3]:
# The input is a free-text log, not a CSV. Parsing it with pd.read_csv treats
# every comma as a field separator, so a single log line containing a comma
# either fails tokenizing or is split across columns. Read it line by line.
with open(path, encoding="utf-8") as log_file:
    # Skip blank lines, as pd.read_csv did by default.
    log_lines = [line.rstrip("\n") for line in log_file if line.strip()]

if not log_lines:
    raise ValueError("No log lines found in: " + str(path))

# Keep the layout the rest of the notebook expects: the first log line becomes
# the (single) column name and every following line is one row.
df = pd.DataFrame({log_lines[0]: log_lines[1:]})

In [4]:
# Inspect the first rows to see how many start-up lines precede the real data.
df.head(10)

Unnamed: 0,nohup: ignoring input
0,tput: No value for $TERM and no -T specified
1,DataLoaderRaw loading images from folder: \t./...
2,listing all images in directory ./cv/images\t
3,DataLoaderRaw found 37547 images\t
4,constructing clones inside the LanguageModel\t
5,"cp ""./cv/images/035669.jpg"" vis/imgs/img1.jpg\t"
6,image 1: a large clock tower towering over a c...
7,evaluating performance... 1/13000 (0.000000)\t
8,"cp ""./cv/images/036736.jpg"" vis/imgs/img2.jpg\t"
9,image 2: a woman is holding a cell phone in he...


In [5]:
# Inspect the last rows; the final row is a "loss:" summary rather than a caption.
df.tail(10)

Unnamed: 0,nohup: ignoring input
38996,"cp ""./cv/images/014450.jpg"" vis/imgs/img12998...."
38997,image 12998: a large clock tower towering over...
38998,evaluating performance... 12998/13000 (0.00000...
38999,"cp ""./cv/images/014080.jpg"" vis/imgs/img12999...."
39000,image 12999: a person holding a flower in a va...
39001,evaluating performance... 12999/13000 (0.00000...
39002,"cp ""./cv/images/014001.jpg"" vis/imgs/img13000...."
39003,image 13000: a person is sitting in front of a...
39004,evaluating performance... 13000/13000 (0.00000...
39005,loss: \tnan\t


# Remove redundant starting lines

In [6]:
def trim_lines(df, x=5):
    """Drop the leading ``x`` rows and the final row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to trim.
    x : int, default 5
        Number of rows to discard from the top.

    Returns
    -------
    pandas.DataFrame
        The positional slice ``x:-1`` of ``df``; index labels are not reset.
    """
    keep = slice(x, -1)
    return df.iloc[keep]

In [7]:
# Drop the start-up lines at the top and the trailing summary row.
# NOTE: this rebinds `df` to its own trimmed version, so re-running the cell
# trims another 5 leading rows and one more trailing row.
df = trim_lines(df)

In [8]:
# Check that the frame now starts at the first "cp" line.
df.head(5)

Unnamed: 0,nohup: ignoring input
5,"cp ""./cv/images/035669.jpg"" vis/imgs/img1.jpg\t"
6,image 1: a large clock tower towering over a c...
7,evaluating performance... 1/13000 (0.000000)\t
8,"cp ""./cv/images/036736.jpg"" vis/imgs/img2.jpg\t"
9,image 2: a woman is holding a cell phone in he...


In [9]:
# Check that the trailing "loss:" row is gone.
df.tail(5)

Unnamed: 0,nohup: ignoring input
39000,image 12999: a person holding a flower in a va...
39001,evaluating performance... 12999/13000 (0.00000...
39002,"cp ""./cv/images/014001.jpg"" vis/imgs/img13000...."
39003,image 13000: a person is sitting in front of a...
39004,evaluating performance... 13000/13000 (0.00000...


### Adding column name

In [10]:
# The single column was named after the first line of the log; give it a
# stable name that the functions below refer to.
df.columns = ["original"]

# Removing evaluation

In [11]:
def label_evaluation(x):
    """Tag one log line as an evaluation-progress line or not.

    Written to be used with ``Series.apply``.

    Parameters
    ----------
    x : str
        A single log line.

    Returns
    -------
    str
        ``"yes"`` if the line contains ``"evaluating performance... "``,
        otherwise ``"no"``.
    """
    marker = "evaluating performance... "
    return "yes" if marker in x else "no"

In [12]:
def filter_evaluation(df):
    """Return ``df`` without the evaluation-progress rows.

    A row is an evaluation row when its ``original`` value contains the
    substring ``"evaluating performance... "``.

    The input frame is left untouched: the rows are selected with a boolean
    mask instead of writing a temporary ``eval`` column into ``df`` (which
    modified the caller's frame and raised ``SettingWithCopyWarning`` when
    ``df`` was itself a slice).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``original`` column holding one log line per row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and the original index labels of the
        rows that were kept.
    """
    # Plain substring match (regex=False) because the marker contains dots;
    # missing / non-string values are simply treated as "not an evaluation row".
    is_evaluation = (
        df["original"]
        .astype(str)
        .str.contains("evaluating performance... ", regex=False, na=False)
    )
    return df[~is_evaluation]

In [13]:
# Keep only the "cp ..." and "image N: ..." lines; the progress lines carry no data.
df = filter_evaluation(df)

In [14]:
# Rows should now alternate: a "cp" line followed by its "image N:" caption line.
df.head()

Unnamed: 0,original
5,"cp ""./cv/images/035669.jpg"" vis/imgs/img1.jpg\t"
6,image 1: a large clock tower towering over a c...
8,"cp ""./cv/images/036736.jpg"" vis/imgs/img2.jpg\t"
9,image 2: a woman is holding a cell phone in he...
11,"cp ""./cv/images/021069.jpg"" vis/imgs/img3.jpg\t"


# Extracting paths, image names and captions

In [15]:
def extract_data(df):
    """Pair each ``cp`` log line with the caption line that follows it.

    The first column of ``df`` must hold strictly alternating rows: a copy
    command (``cp <source> <target>``) followed by its caption line
    (``image <n>: <caption>``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column contains the alternating log lines.

    Returns
    -------
    pandas.DataFrame
        One row per image, with a fresh 0..n-1 index and the columns

        * ``paths``   - the full ``cp`` line
        * ``images``  - the full ``image <n>: ...`` line
        * ``caption`` - text after the first ``": "``, tabs removed
        * ``path1``   - second space-separated token of the ``cp`` line
        * ``path2``   - third space-separated token of the ``cp`` line
          (a trailing tab from the log line is kept, as before)
        * ``name1``   - file name of ``path1`` without surrounding quotes
        * ``name2``   - file name of ``path2`` without tabs/quotes

    Raises
    ------
    ValueError
        If the number of rows is odd, i.e. a ``cp`` line has no caption line.
    IndexError
        If a line does not have the expected shape (no ``": "`` in a caption
        line, or fewer than three space-separated tokens in a ``cp`` line).
    """
    lines = df.iloc[:, 0].tolist()
    if len(lines) % 2:
        raise ValueError(
            "Expected alternating 'cp' / 'image' rows, got an odd number of rows: "
            + str(len(lines))
        )

    paths = lines[0::2]
    images = lines[1::2]

    # Split only on the FIRST ": " so a caption that itself contains ": "
    # is kept whole instead of being cut at the second separator.
    captions = [img.split(": ", 1)[1].replace("\t", "") for img in images]

    # Tokenise each cp line once and reuse the tokens for both paths.
    tokens = [cp_line.split(" ") for cp_line in paths]
    path1 = [parts[1] for parts in tokens]
    path2 = [parts[2] for parts in tokens]

    name1 = [p.split("/")[-1].strip('"') for p in path1]
    name2 = [p.split("/")[-1].strip('\t"') for p in path2]

    return pd.DataFrame(
        {
            "paths": paths,
            "images": images,
            "caption": captions,
            "path1": path1,
            "path2": path2,
            "name1": name1,
            "name2": name2,
        },
        columns=["paths", "images", "caption", "path1", "path2", "name1", "name2"],
    )

In [16]:
# Turn the alternating cp / caption rows into one row per image, then preview.
df = extract_data(df)
df.head()

Unnamed: 0,images,paths,caption,path1,path2,name1,name2
0,image 1: a large clock tower towering over a c...,"cp ""./cv/images/035669.jpg"" vis/imgs/img1.jpg\t",a large clock tower towering over a city,"""./cv/images/035669.jpg""",vis/imgs/img1.jpg\t,035669.jpg,img1.jpg
1,image 2: a woman is holding a cell phone in he...,"cp ""./cv/images/036736.jpg"" vis/imgs/img2.jpg\t",a woman is holding a cell phone in her hand,"""./cv/images/036736.jpg""",vis/imgs/img2.jpg\t,036736.jpg,img2.jpg
2,image 3: a herd of animals grazing on a lush g...,"cp ""./cv/images/021069.jpg"" vis/imgs/img3.jpg\t",a herd of animals grazing on a lush green hill...,"""./cv/images/021069.jpg""",vis/imgs/img3.jpg\t,021069.jpg,img3.jpg
3,image 4: a sign that says UNK UNK on the side ...,"cp ""./cv/images/003021.jpg"" vis/imgs/img4.jpg\t",a sign that says UNK UNK on the side of it,"""./cv/images/003021.jpg""",vis/imgs/img4.jpg\t,003021.jpg,img4.jpg
4,image 5: a couple of people that are in a field\t,"cp ""./cv/images/006736.jpg"" vis/imgs/img5.jpg\t",a couple of people that are in a field,"""./cv/images/006736.jpg""",vis/imgs/img5.jpg\t,006736.jpg,img5.jpg


In [17]:
def save(df, save_path):
    """Write ``df`` to ``save_path`` as a UTF-8 CSV and print a confirmation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to persist (the index is written as the first column).
    save_path : str
        Destination file path.
    """
    df.to_csv(save_path, encoding="utf-8")
    notice = ("File saved at: ", save_path)
    print(*notice)

In [18]:
# Persist the extracted table to the output path configured at the top.
save(df, path_save)

File saved at:  extracted.csv
