In [1]:
import pandas as pd

In [2]:
# Input: the raw text log that is parsed below.
path = "karpathy_output.txt"
# Output: CSV file the extracted table is written to at the end of the notebook.
path_save = "final_out.csv"

# Read TXT File

In [3]:
# Each line of the log ends up as one row of a single-column frame. With the
# default header handling, the first line of the file is consumed as the column
# name (hence the "nohup: ignoring input" header in the output below).
# NOTE(review): read_csv splits on commas by default, so a log line containing
# a comma would presumably not load as a single field -- confirm the log has none.
df = pd.read_csv(path)

In [4]:
df.head(10)

Unnamed: 0,nohup: ignoring input
0,tput: No value for $TERM and no -T specified
1,DataLoaderRaw loading images from folder: \t./...
2,listing all images in directory ./images/\t
3,DataLoaderRaw found 37545 images\t
4,constructing clones inside the LanguageModel\t
5,./images/029599.jpg\t
6,"cp ""./images/029599.jpg"" vis/imgs/img1.jpg\t"
7,image 1: a view of a mountain range from the g...
8,evaluating performance... 1/-1 (0.000000)\t
9,./images/013369.jpg\t


In [5]:
df.tail(10)

Unnamed: 0,nohup: ignoring input
150176,evaluating performance... 37543/-1 (0.000000)\t
150177,./images/019838.jpg\t
150178,"cp ""./images/019838.jpg"" vis/imgs/img37544.jpg\t"
150179,image 37544: a man flying through the air whil...
150180,evaluating performance... 37544/-1 (0.000000)\t
150181,./images/027636.jpg\t
150182,"cp ""./images/027636.jpg"" vis/imgs/img37545.jpg\t"
150183,image 37545: a close up of a teddy bear on a r...
150184,evaluating performance... 0/-1 (0.000000)\t
150185,loss: \tnan\t


# Remove redundant starting lines and the trailing loss line

In [6]:
def trim_lines(df, x=5):
    """ Drops the leading {x} rows and the final row of the dataframe.

    Selection is positional, so the surviving rows keep their original
    index labels.
    """
    keep = slice(x, -1)
    return df.iloc[keep]

In [7]:
df = trim_lines(df)

In [8]:
df.head(5)

Unnamed: 0,nohup: ignoring input
5,./images/029599.jpg\t
6,"cp ""./images/029599.jpg"" vis/imgs/img1.jpg\t"
7,image 1: a view of a mountain range from the g...
8,evaluating performance... 1/-1 (0.000000)\t
9,./images/013369.jpg\t


In [9]:
df.tail(5)

Unnamed: 0,nohup: ignoring input
150180,evaluating performance... 37544/-1 (0.000000)\t
150181,./images/027636.jpg\t
150182,"cp ""./images/027636.jpg"" vis/imgs/img37545.jpg\t"
150183,image 37545: a close up of a teddy bear on a r...
150184,evaluating performance... 0/-1 (0.000000)\t


### Adding column name

In [10]:
# Replace the header that was taken from the first log line with a stable name;
# the functions below access this column as `original`.
df.columns = ["original"]

# Removing evaluation

In [11]:
def label_evaluation(x):
    """ Tags a single log line: "yes" when it is an evaluation-progress line,
    "no" otherwise. Written as a plain function so it can be handed to apply.
    """
    marker = "evaluating performance... "
    return "yes" if marker in x else "no"

In [12]:
def filter_evaluation(df):
    """ Filters evaluation lines

    Returns a new dataframe without the rows whose `original` text contains
    the "evaluating performance... " progress marker. The remaining rows keep
    their index labels and columns.

    The input dataframe is left untouched: the rows are selected with a
    boolean mask instead of a temporary helper column, so nothing is written
    into the caller's frame (which may be a slice of another frame).
    """
    # regex=False: the dots in the marker are literal characters, not wildcards.
    # na=False: a missing value is not an evaluation line, so it is kept.
    is_eval = df["original"].str.contains(
        "evaluating performance... ", regex=False, na=False
    )
    return df[~is_eval]

In [13]:
df = filter_evaluation(df)

In [14]:
# df = df.reset_index(drop=True)
# df.head(20)

# Extracting paths, image names and captions

In [15]:
def extract_data(df):
    """ Builds one row per image from the filtered log lines.

    The first column of `df` is read in groups of three rows: the row at
    offset 1 of each group is taken as the `cp "<source>" <target>` command
    line and the row at offset 2 as the `image N: <caption>` line.

    Returns a new dataframe with the columns:
        paths   - the full `cp` command line
        images  - the full `image N: <caption>` line
        caption - text after the first ": " of the image line, tabs removed
        path1   - second space-separated token of the `cp` line (source path,
                  still wrapped in double quotes)
        path2   - third space-separated token of the `cp` line (target path,
                  trailing tab kept as in the raw log)
        name1   - file name of path1 without quotes
        name2   - file name of path2 without tab/quotes

    Raises ValueError when the number of `cp` rows and image rows differ,
    i.e. the log does not consist of complete groups.
    """
    df = df.reset_index(drop=True)

    paths = [row[0] for row in df.iloc[1::3, :].values]
    images = [row[0] for row in df.iloc[2::3, :].values]
    if len(paths) != len(images):
        raise ValueError(
            "Incomplete log: found {} copy lines but {} image lines".format(
                len(paths), len(images)
            )
        )

    df = pd.DataFrame({
        "paths": paths,
        "images": images
    })

    # Split only on the first ": " so that a caption which itself contains
    # ": " is kept whole instead of being cut at its first colon.
    df["caption"] = df.images.apply(lambda x: x.split(": ", 1)[1].replace("\t", ""))
    df["path1"] = df.paths.apply(lambda x: x.split(" ")[1])
    df["path2"] = df.paths.apply(lambda x: x.split(" ")[2])
    df["name1"] = df.path1.apply(lambda x: x.split("/")[-1].strip('"'))
    df["name2"] = df.path2.apply(lambda x: x.split("/")[-1].strip('\t"'))

    return df

In [16]:
df = extract_data(df)
df.head()

IMGS: 37545
PATHS: 37545
cp "./images/013369.jpg" vis/imgs/img2.jpg	


Unnamed: 0,images,paths,caption,path1,path2,name1,name2
0,image 1: a view of a mountain range from the g...,"cp ""./images/029599.jpg"" vis/imgs/img1.jpg\t",a view of a mountain range from the ground,"""./images/029599.jpg""",vis/imgs/img1.jpg\t,029599.jpg,img1.jpg
1,image 2: a person on a boat in the water\t,"cp ""./images/013369.jpg"" vis/imgs/img2.jpg\t",a person on a boat in the water,"""./images/013369.jpg""",vis/imgs/img2.jpg\t,013369.jpg,img2.jpg
2,image 3: a view of a building with a clock tow...,"cp ""./images/023620.jpg"" vis/imgs/img3.jpg\t",a view of a building with a clock tower,"""./images/023620.jpg""",vis/imgs/img3.jpg\t,023620.jpg,img3.jpg
3,image 4: a street sign on the side of a road\t,"cp ""./images/004139.jpg"" vis/imgs/img4.jpg\t",a street sign on the side of a road,"""./images/004139.jpg""",vis/imgs/img4.jpg\t,004139.jpg,img4.jpg
4,image 5: a man in a suit and tie standing in a...,"cp ""./images/036728.jpg"" vis/imgs/img5.jpg\t",a man in a suit and tie standing in a parking lot,"""./images/036728.jpg""",vis/imgs/img5.jpg\t,036728.jpg,img5.jpg


In [17]:
def save(df, save_path):
    """ Writes the dataframe to `save_path` as a UTF-8 encoded CSV file and
    prints the location it was written to.
    """
    df.to_csv(path_or_buf=save_path, encoding="utf-8")
    report = ("File saved at: ", save_path)
    print(*report)

In [18]:
save(df, path_save)

File saved at:  final_out.csv
