# Path Splitter

This notebook splits each file path at its last forward slash into a directory and a filename, loads the results into a pandas DataFrame for analysis, and saves a tab-separated copy.


In [7]:
import pandas as pd
import os


In [8]:
# Load every line of the path listing into memory (line terminators are kept)
file_path = 'backblaze_file_targets.csv'

with open(file_path) as source:
    lines = list(source)

# Preview the head of the file, trimming surrounding whitespace for display
print("First 5 lines of the file:")
for entry in lines[:5]:
    print(entry.strip())


First 5 lines of the file:
/Users/andreaalejandrino/Documents/Zoom/2020-12-25 16.29.37 Andrea Alejandrino's Personal Meeting Room 8081359430\tplayback.m3u
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/CertificateRevocation/10091/manifest.json
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default/Cookies-journal
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default/File System/142/p/.usage
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default/Service Worker/CacheStorage/8cbb992fe0cd9ef960e69a214646bd270516a23e/1a5d3f39-c61c-4f03-becc-7848b209d6a3/index


In [9]:
# Process each line
PATH_COLUMNS = ['line_number', 'full_path', 'directory', 'filename']


def split_path_line(raw_line):
    """Split one input line into its line number, path, directory and filename.

    Args:
        raw_line: A line of the input file, with or without its newline.

    Returns:
        A dict with the keys listed in ``PATH_COLUMNS``. ``line_number`` is ''
        when the line has no numeric ``N|`` prefix, and ``directory`` is ''
        when the path contains no forward slash.
    """
    content = raw_line.strip()
    line_num = ''

    # Only a numeric prefix counts as a line number. '|' is a legal filename
    # character, so splitting on every '|' would cut such paths in two.
    prefix, pipe, remainder = content.partition('|')
    if pipe and prefix.strip().isdigit():
        line_num, content = prefix.strip(), remainder

    # Split by last forward slash; with no slash the whole text is the filename
    directory, _, filename = content.rpartition('/')

    return {
        'line_number': line_num,
        'full_path': content,
        'directory': directory,
        'filename': filename,
    }


# Skip empty (whitespace-only) lines
processed_lines = [split_path_line(line) for line in lines if line.strip()]

# Create DataFrame. Naming the columns keeps them present even when the input
# has no usable lines, so later column lookups do not raise KeyError.
df = pd.DataFrame(processed_lines, columns=PATH_COLUMNS)
print("\nFirst few rows of processed data:")
df.head()



First few rows of processed data:


Unnamed: 0,line_number,full_path,directory,filename
0,,/Users/andreaalejandrino/Documents/Zoom/2020-1...,/Users/andreaalejandrino/Documents/Zoom,2020-12-25 16.29.37 Andrea Alejandrino's Perso...
1,,/Users/andreaalejandrino/Library/Application S...,/Users/andreaalejandrino/Library/Application S...,manifest.json
2,,/Users/andreaalejandrino/Library/Application S...,/Users/andreaalejandrino/Library/Application S...,Cookies-journal
3,,/Users/andreaalejandrino/Library/Application S...,/Users/andreaalejandrino/Library/Application S...,.usage
4,,/Users/andreaalejandrino/Library/Application S...,/Users/andreaalejandrino/Library/Application S...,index


In [10]:
# Basic statistics
print(f"Total number of files: {len(df)}")
print(f"\nNumber of unique directories: {df['directory'].nunique()}")
print("\nMost common file extensions:")
# An extension needs at least one character before its dot, so hidden files
# such as ".usage" are counted as having no extension rather than as ".usage".
# expand=False returns a Series, which is what a single column assignment wants.
df['extension'] = df['filename'].str.extract(r'^.+(\.[^.]+)$', expand=False)
print(df['extension'].value_counts().head())


Total number of files: 341

Number of unique directories: 189

Most common file extensions:
extension
.json    43
.txt     32
.url     31
.bat     25
.dat     15
Name: count, dtype: int64


In [11]:
# Build one "<directory><TAB><filename>" string per row, restoring the
# "<line_number>|" prefix for rows that carry a line number
tab_separated_lines = [
    f"{rec['line_number']}|{rec['directory']}\t{rec['filename']}"
    if rec['line_number']
    else f"{rec['directory']}\t{rec['filename']}"
    for rec in df.to_dict('records')
]

# Write the rows newline-joined (no trailing newline) to the output file
output_file = 'tab_separated_paths_20251014.csv'
payload = '\n'.join(tab_separated_lines)
with open(output_file, 'w') as out_fh:
    out_fh.write(payload)
print(f"Saved tab-separated version to {output_file}")

# Preview the head of what was written
print("\nFirst 5 lines of the tab-separated file:")
for preview in tab_separated_lines[:5]:
    print(preview)


Saved tab-separated version to tab_separated_paths_20251014.csv

First 5 lines of the tab-separated file:
/Users/andreaalejandrino/Documents/Zoom	2020-12-25 16.29.37 Andrea Alejandrino's Personal Meeting Room 8081359430\tplayback.m3u
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/CertificateRevocation/10091	manifest.json
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default	Cookies-journal
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default/File System/142/p	.usage
/Users/andreaalejandrino/Library/Application Support/Google/Chrome/Default/Service Worker/CacheStorage/8cbb992fe0cd9ef960e69a214646bd270516a23e/1a5d3f39-c61c-4f03-becc-7848b209d6a3	index
