# Loading .out files

In [60]:
import os
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## for maxquant

In [25]:
def extract_patient_id_mq(file_path):
    with open(file_path, 'r') as file:
        first_line = file.readline()
        parts = first_line.split('/')
        return parts[7] if len(parts) > 8 else None

### >> part 2 from patients 550 - 931 with job array 1-87

In [58]:
directory = '/cmnfs/data/proteomics/metaproteomics/mdpi_proteomes_2019_PXD011515' 
files = [file for file in os.listdir(directory) if file.startswith('part2') and file.endswith('.out')]

mq = {'.out file': [], 'patient ID': []}

for file in files:
    patient_id = extract_patient_id_mq(os.path.join(directory, file))
    mq['.out file'].append(file)
    mq['patient ID'].append(patient_id)

df_mq2 = pd.DataFrame(mq)
df_mq2

Unnamed: 0,.out file,patient ID
0,part2_ja_452521_76.out,874.0
1,part2_ja_452521_77.out,876.0
2,part2_ja_452521_78.out,877.0
3,part2_ja_452521_79.out,887.0
4,part2_ja_452521_80.out,888.0
5,part2_ja_452521_81.out,891.0
6,part2_ja_452521_82.out,895.0
7,part2_ja_452521_83.out,913.0
8,part2_ja_452521_84.out,915.0
9,part2_ja_452521_85.out,923.0


In [59]:
# Extract job id and task id from the file names
df_mq2[['job_id', 'task_id']] = df_mq2['.out file'].str.extract(r'part2_ja_(\d+)_(\d+)\.out').astype(int)

# Sort the DataFrame based on job id and task id
df_mq2_sorted = df_mq2.sort_values(by=['job_id', 'task_id'])
df_mq2_sorted

Unnamed: 0,.out file,patient ID,job_id,task_id
68,part2_ja_452521_1.out,558.0,452521,1
76,part2_ja_452521_2.out,564.0,452521,2
70,part2_ja_452521_3.out,565.0,452521,3
69,part2_ja_452521_4.out,566.0,452521,4
75,part2_ja_452521_5.out,571.0,452521,5
67,part2_ja_452521_6.out,576.0,452521,6
72,part2_ja_452521_7.out,578.0,452521,7
71,part2_ja_452521_8.out,582.0,452521,8
73,part2_ja_452521_9.out,587.0,452521,9
78,part2_ja_452521_10.out,588.0,452521,10


### >> part 1 from patients 70 - 548 with job array 1-119

In [41]:
directory = '/cmnfs/data/proteomics/metaproteomics/mdpi_proteomes_2019_PXD011515' 
files = [file for file in os.listdir(directory) if file.startswith('part1') and file.endswith('.out')]

mq = {'.out file': [], 'patient ID': []}

for file in files:
    patient_id = extract_patient_id_mq(os.path.join(directory, file))
    mq['.out file'].append(file)
    mq['patient ID'].append(patient_id)

df_mq = pd.DataFrame(mq)
df_mq.head(25)

Unnamed: 0,.out file,patient ID
0,part1_ja_462807_0.out,70.0
1,part1_ja_462807_1.out,550.0
2,part1_ja_462809_0.out,70.0
3,part1_ja_462809_1.out,550.0
4,part1_ja_462811_0.out,70.0
5,part1_ja_462811_1.out,550.0
6,part1_ja_463007_1.out,550.0
7,part1_ja_463007_0.out,70.0
8,part1_ja_461657_116.out,542.0
9,part1_ja_461657_117.out,546.0


In [45]:
# Extract job id and task id from the file names
df_mq[['job_id', 'task_id']] = df_mq['.out file'].str.extract(r'part1_ja_(\d+)_(\d+)\.out').astype(int)

# Sort the DataFrame based on job id and task id
df_mq_sorted = df_mq.sort_values(by=['job_id', 'task_id'])
df_mq_sorted

Unnamed: 0,.out file,patient ID,job_id,task_id
119,part1_ja_461657_1.out,72.0,461657,1
105,part1_ja_461657_2.out,77.0,461657,2
125,part1_ja_461657_3.out,83.0,461657,3
122,part1_ja_461657_4.out,92.0,461657,4
120,part1_ja_461657_5.out,93.0,461657,5
121,part1_ja_461657_6.out,103.0,461657,6
106,part1_ja_461657_7.out,112.0,461657,7
124,part1_ja_461657_8.out,120.0,461657,8
126,part1_ja_461657_9.out,124.0,461657,9
117,part1_ja_461657_10.out,130.0,461657,10


## for oktoberfest

In [63]:
def extract_patient_id(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            if "oktoberfest.runner::_preprocess Converting search results from" in line:
                parts = line.split('/')
                return parts[7]  # Assuming the patient ID is always at the 7th index

In [79]:
directory = '/cmnfs/data/proteomics/metaproteomics/mdpi_proteomes_2019_PXD011515' 
files = [file for file in os.listdir(directory) if file.startswith('oktoberfest') and file.endswith('.out')]

data = {'.out file': [], 'patient ID': []}

for file in files:
    patient_id = extract_patient_id(os.path.join(directory, file))
    data['.out file'].append(file)
    data['patient ID'].append(patient_id)

df_ok = pd.DataFrame(data)
df_ok

Unnamed: 0,.out file,patient ID
0,oktoberfest_462047_120.out,540
1,oktoberfest_462047_121.out,542
2,oktoberfest_462047_122.out,546
3,oktoberfest_462047_123.out,548
4,oktoberfest_462047_124.out,558
5,oktoberfest_462047_125.out,564
6,oktoberfest_462047_126.out,565
7,oktoberfest_462047_127.out,566
8,oktoberfest_462047_128.out,571
9,oktoberfest_462047_129.out,576


In [81]:
# Extract job id and task id from the file names
df_ok[['job_id', 'task_id']] = df_ok['.out file'].str.extract(r'oktoberfest_(\d+)_(\d+)\.out')

# Replace 'None' with 0 and NaN with 00
df_ok['patient ID'].fillna('00', inplace=True)
df_ok['patient ID'].replace({None: 0}, inplace=True)

# Convert extracted columns to integers
df_ok[['job_id', 'task_id']] = df_ok[['job_id', 'task_id']].astype(float).astype(pd.Int32Dtype())

# Sort the DataFrame based on job id and task id
df_ok_sorted = df_ok.sort_values(by=['job_id', 'task_id'])

# Display the sorted DataFrame
print(df_ok_sorted)

                      .out file patient ID  job_id  task_id
258    oktoberfest_461903_1.out          9  461903        1
250    oktoberfest_461903_2.out          9  461903        2
252    oktoberfest_461908_1.out          9  461908        1
257    oktoberfest_461908_2.out         00  461908        2
249    oktoberfest_461915_1.out         11  461915        1
253    oktoberfest_461915_2.out         11  461915        2
254    oktoberfest_461915_3.out         42  461915        3
255    oktoberfest_461915_4.out         49  461915        4
256    oktoberfest_461915_5.out         70  461915        5
251    oktoberfest_461915_6.out   combined  461915        6
238    oktoberfest_462026_0.out         00  462026        0
248    oktoberfest_462026_1.out         00  462026        1
245    oktoberfest_462026_2.out         00  462026        2
234    oktoberfest_462030_0.out         00  462030        0
246    oktoberfest_462030_1.out         00  462030        1
231    oktoberfest_462030_2.out         

### Check for any duplicated output

In [21]:
duplicate_patient_ids = df[df.duplicated(subset='patient ID', keep=False)]

if not duplicate_patient_ids.empty:
    print("Rows with duplicate patient IDs:")
    print(duplicate_patient_ids)
else:
    print("No duplicate patient IDs found.")

Rows with duplicate patient IDs:
                     .out file patient ID
90    oktoberfest_463076_3.out       None
91    oktoberfest_463076_2.out       None
92    oktoberfest_463076_4.out       None
93    oktoberfest_463076_0.out       None
94    oktoberfest_463076_1.out       None
95    oktoberfest_463096_4.out        550
96    oktoberfest_463096_0.out        550
97    oktoberfest_463096_2.out        550
98    oktoberfest_463096_3.out        550
99    oktoberfest_463096_1.out        550
100   oktoberfest_463102_0.out          9
101   oktoberfest_463102_1.out          9
102   oktoberfest_463104_0.out          4
103   oktoberfest_463105_0.out         70
104   oktoberfest_463106_0.out        295
105   oktoberfest_463178_0.out       None
106   oktoberfest_463186_0.out       None
107   oktoberfest_463187_0.out        295
108   oktoberfest_463189_1.out   combined
109   oktoberfest_463193_0.out          4
110   oktoberfest_463195_0.out         70
199  oktoberfest_462047_46.out        299
2