In [None]:
import pandas as pd

# Number of Pool.csv rows held in memory at a time by the chunked reader
chunk_size = 10000

# Load the smaller dataframe into memory (df_rib).
# Every column is read as text so numeric-looking IDs are never parsed as
# floats (which would later stringify as e.g. "12211023522.0"), and blank
# cells stay as real missing values (NaN).
df_rib = pd.read_csv('RIB.csv', dtype=str)

# Strip whitespace from column names
df_rib.columns = df_rib.columns.str.strip()

# Fail early, with a readable message, if the key/value columns are absent
missing_cols = {'Lab_ID', 'Results'} - set(df_rib.columns)
if missing_cols:
    raise KeyError(f"RIB.csv is missing required column(s): {sorted(missing_cols)}")

# Strip whitespace from the values. Using `.str.strip()` directly (no
# `.astype(str)`) keeps missing cells as NaN instead of the literal text 'nan'.
df_rib['Lab_ID'] = df_rib['Lab_ID'].str.strip()
df_rib['Results'] = df_rib['Results'].str.strip()

# Create a dictionary for quick lookup from df_rib.
# Rows without a Lab_ID cannot be matched to any sample, so they are left out;
# otherwise every blank Sample_ID would "match" them. If a Lab_ID occurs more
# than once, the last row wins (standard Series.to_dict() behaviour).
rib_with_id = df_rib[df_rib['Lab_ID'].notna() & (df_rib['Lab_ID'] != '')]
results_dict = rib_with_id.set_index('Lab_ID')['Results'].to_dict()

# Print the first few items from the dictionary to validate
print("Sample of results_dict:", list(results_dict.items())[:5])

# Function to process each chunk
def process_chunk(chunk, lookup=None):
    """Add a 'PCR_Results' column to one chunk of the pool table.

    The chunk is modified in place and also returned.

    Parameters
    ----------
    chunk : pandas.DataFrame
        A chunk containing a 'Sample_ID' column (surrounding whitespace in the
        column name is tolerated).
    lookup : dict, optional
        Mapping of sample/lab ID text -> result text. Defaults to the
        module-level ``results_dict``.

    Returns
    -------
    pandas.DataFrame
        The same chunk with normalised 'Sample_ID' values and a new
        'PCR_Results' column (NaN where no match exists).
    """
    if lookup is None:
        lookup = results_dict

    # Strip whitespace from column names in the chunk
    chunk.columns = chunk.columns.str.strip()

    raw_ids = chunk['Sample_ID']

    # Normalise IDs to text. A chunk whose IDs are all numeric (plus blanks) is
    # inferred as float64, so 12403015541 would stringify as '12403015541.0';
    # the replace() removes that artificial '.0' tail from all-digit IDs.
    id_text = (
        raw_ids.astype(str)
        .str.strip()
        .str.replace(r'^(\d+)\.0+$', r'\1', regex=True)
    )

    # Keep missing/blank IDs as NaN; astype(str) would otherwise turn them
    # into the literal text 'nan' and let them match a 'nan' lookup key.
    has_id = raw_ids.notna() & (id_text != '')
    chunk['Sample_ID'] = id_text.where(has_id)

    # Create the PCR_Results column using the lookup dictionary
    chunk['PCR_Results'] = chunk['Sample_ID'].map(lookup)

    # Debug: Print first few rows of chunk after processing
    print("Processed chunk sample:")
    print(chunk.head())

    return chunk

# Process Pool.csv in chunks and write the output to a new CSV file
output_file = 'Pooled_with_Results.csv'

# dtype=str: pandas infers dtypes separately for every chunk, so without it a
# chunk of purely numeric IDs (plus blanks) is parsed as float64 and the IDs
# stringify as '12403015541.0', which never equals the Lab_ID text. Reading
# everything as text keeps IDs exactly as written in the file and makes every
# chunk of the output consistently formatted.
with pd.read_csv('Pool.csv', chunksize=chunk_size, dtype=str) as reader:
    for i, chunk in enumerate(reader):
        # Debug: Print first few rows of chunk before processing
        print(f"Chunk {i} sample before processing:")
        print(chunk.head())

        # Process the chunk
        chunk = process_chunk(chunk)

        # The first chunk creates/overwrites the file and writes the header;
        # later chunks are appended without repeating the header.
        is_first = i == 0
        chunk.to_csv(
            output_file,
            index=False,
            mode='w' if is_first else 'a',
            header=is_first,
        )

print(f'Processing complete. The result is saved to {output_file}')


In [4]:
import pandas as pd

# Read both source tables. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than from internal parsing chunks.
read_options = {'low_memory': False}
df_pool = pd.read_csv('Pool.csv', **read_options)
df_rib = pd.read_csv('RIB.csv', **read_options)

PCR SL           float64
Ext. Batch SL    float64
Pool Set          object
Sample_ID         object
Batch No          object
RSV              float64
FluA             float64
FluB             float64
Result            object
dtype: object


In [5]:
# Show each table followed by its column dtypes (pool first, then RIB)
for frame in (df_pool, df_rib):
    display(frame)
    print(frame.dtypes)

Unnamed: 0,PCR SL,Ext. Batch SL,Pool Set,Sample_ID,Batch No,RSV,FluA,FluB,Result
0,1.0,1.0,P-1,12403015541,245 Zm,,,,Negative
1,2.0,2.0,P-1,12403015651,245 Zm,,25.9,,Positive
2,3.0,3.0,P-1,12403014652,245 Zm,,,,Negative
3,4.0,4.0,P-1,12403013713,245 Zm,,,,Negative
4,5.0,5.0,P-5,12403014732,245 Zm,,,,Negative
...,...,...,...,...,...,...,...,...,...
1048339,,,,,,,,,
1048340,,,,,,,,,
1048341,,,,,,,,,
1048342,,,,,,,,,


PCR SL           float64
Ext. Batch SL    float64
Pool Set          object
Sample_ID         object
Batch No          object
RSV              float64
FluA             float64
FluB             float64
Result            object
dtype: object


Unnamed: 0,Sl no,Collection Date,Study ID,Name,Phone no.,Relation with contact person,Lab_ID,Unnamed: 7,Collecting nurse,Results,Test date,Informed results by call,Informed results by text,Name of RA,Remarks,Enroll in COI (Y/N),Reason for non enrollment,Linking done (Y/N)
0,1,19/11/22,1160005717,elham,,,,Sick,,,,,,,,,,Y
1,2,19/11/22,1250004971,fariya,1730947159,father,12211023522,,mahbuba,Negative,21/11/2022,,Ok,Shahina,,Y,,Y
2,3,19/11/22,1250004973,saifullah,1301757019,father,12211023562,,mahbuba,Negative,21/11/2022,,Ok,Shahina,,Y,,Y
3,4,19/11/22,1070004622,anas,1711134089,father,12211025691,,mahbuba,RSV (19.70),21/11/2022,Ok,Ok,Shahina,,,,Y
4,5,19/11/22,1070004623,avary,1304730485,father,12211025711,,mahbuba,Negative,21/11/2022,,Ok,Shahina,,,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9472,,9/4/24,,,,,,,Surovi,,,,,,,,,
9473,,9/4/24,,,,,,,Surovi,,,,,,,,,
9474,,9/4/24,,,,,,,,,,,,,,,,
9475,,9/4/24,,,,,,,,,,,,,,,,


Sl no                           object
Collection Date                 object
Study ID                        object
Name                            object
Phone no.                       object
Relation with contact person    object
Lab_ID                          object
Unnamed: 7                      object
Collecting nurse                object
Results                         object
Test date                       object
Informed results by call        object
Informed results by text        object
Name of RA                      object
Remarks                         object
Enroll in COI (Y/N)             object
Reason for non enrollment       object
Linking done (Y/N)              object
dtype: object


In [3]:
# Column dtypes of the RIB table, kept in `result` and echoed under a heading
result = df_rib.dtypes
print("Output:", result, sep="\n")

Output:
Sl no                           object
Collection Date                 object
Study ID                        object
Name                            object
Phone no.                       object
Relation with contact person    object
Lab_ID                          object
Unnamed: 7                      object
Collecting nurse                object
Results                         object
Test date                       object
Informed results by call        object
Informed results by text        object
Name of RA                      object
Remarks                         object
Enroll in COI (Y/N)             object
Reason for non enrollment       object
Linking done (Y/N)              object
dtype: object


In [None]:
# Remove leading/trailing whitespace from every df_rib column label
stripped_names = df_rib.columns.str.strip()
df_rib.columns = stripped_names

# Show the resulting labels to confirm the cleanup
print(df_rib.columns)


In [None]:
# Build the right-hand side of the join from df_rib:
# - rows without a Lab_ID are dropped, because pandas matches null keys to
#   null keys and every blank Sample_ID in df_pool would otherwise be joined
#   to every blank Lab_ID row (multiplying the blank rows);
# - repeated Lab_IDs are reduced to their last occurrence so the left join
#   cannot duplicate df_pool rows;
# - 'Results' is renamed up front so the merged column is already 'PCR_Results'.
rib_results = (
    df_rib[['Lab_ID', 'Results']]
    .dropna(subset=['Lab_ID'])
    .drop_duplicates(subset='Lab_ID', keep='last')
    .rename(columns={'Results': 'PCR_Results'})
)

# Left join df_rib's results onto df_pool where Sample_ID == Lab_ID.
# Any 'PCR_Results' left over from a previous run of this cell is removed first
# so re-running does not create a second column with the same name.
rows_before = len(df_pool)
df_pool = (
    df_pool
    .drop(columns=['PCR_Results'], errors='ignore')
    .merge(rib_results, how='left', left_on='Sample_ID', right_on='Lab_ID')
    .drop(columns='Lab_ID')  # helper key from df_rib is not needed afterwards
)
assert len(df_pool) == rows_before, "left join must not add or drop df_pool rows"

# Display the first few rows to verify the merge and new column
print(df_pool.head())

In [None]:
# List the column labels of both tables, each under its own heading
column_report = {
    "df_pool columns:": df_pool.columns,
    "df_rib columns:": df_rib.columns,
}
for heading, labels in column_report.items():
    print(heading, labels)



In [None]:
# Merging dataframes based on 'Sample_ID' from df_pool and 'Lab_ID' from df_rib.
# Rows with a missing key are removed from both sides first: pandas treats
# null keys as equal to each other, so the blank rows of df_pool would
# otherwise be paired with every blank-Lab_ID row of df_rib and flood the
# result with meaningless all-empty matches.
pool_with_id = df_pool.dropna(subset=['Sample_ID'])
rib_with_id = df_rib.dropna(subset=['Lab_ID'])
merged_df = pd.merge(
    pool_with_id,
    rib_with_id,
    how='inner',
    left_on='Sample_ID',
    right_on='Lab_ID',
)

# Display the first few rows of the merged dataframe to verify the results
# print(merged_df.head())


In [None]:
# The key is named 'Sample_ID' in df_pool and 'Lab_ID' in df_rib; neither frame
# has both columns, so `on=['Sample_ID', 'Lab_ID']` raises KeyError. Join the
# two differently named keys with left_on/right_on instead.
# Rows with a missing key are excluded because pandas matches null keys to
# each other, which would pair every blank row of one table with every blank
# row of the other.
merged_df = pd.merge(
    df_pool.dropna(subset=['Sample_ID']),
    df_rib.dropna(subset=['Lab_ID']),
    how='inner',
    left_on='Sample_ID',
    right_on='Lab_ID',
)
