In [None]:
pip install pandas numpy




In [None]:
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Pool
import os


In [None]:
def load_data_in_chunks(file_path, chunk_size=100000):
    """Open a CSV file for chunked reading.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        chunk_size: Number of rows per chunk (``chunksize`` argument).

    Returns:
        The object returned by ``pd.read_csv(..., chunksize=...)``; iterating
        it yields one DataFrame per chunk.
    """
    return pd.read_csv(file_path, chunksize=chunk_size)

# Input CSV location. NOTE(review): this is an absolute path at the filesystem
# root, so the notebook only runs where the file exists at exactly that place.
file_path = '/financial_risk_analysis_large.csv'
# Open a chunked reader over the input. NOTE(review): `chunks` is not
# referenced by any later cell shown here — confirm whether it is still needed.
chunks = load_data_in_chunks(file_path)


In [None]:
def handle_missing_values(chunk):
    """Fill missing numeric values in ``chunk`` with that column's mean.

    The means are computed from the rows of ``chunk`` only, so each chunk is
    imputed with its own column means. Non-numeric columns are left untouched.

    Args:
        chunk: DataFrame to impute. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # numeric_only=True keeps text columns out of the mean computation;
    # without it, pandas 2.x raises TypeError when the frame has string columns.
    numeric_means = chunk.mean(numeric_only=True)
    chunk.fillna(numeric_means, inplace=True)
    return chunk


In [None]:
def feature_engineering(chunk):
    """Add a ``DebtToIncomeRatio`` column to ``chunk``.

    The new column is ``MonthlyDebtPayments`` divided element-wise by
    ``AnnualIncome``.
    NOTE(review): going by the column names, the numerator is monthly and the
    denominator annual — confirm that this mix of periods is intended.

    Args:
        chunk: DataFrame containing ``MonthlyDebtPayments`` and
            ``AnnualIncome`` columns. It is modified in place.

    Returns:
        The same DataFrame object, now with the extra column.
    """
    monthly_debt = chunk['MonthlyDebtPayments']
    annual_income = chunk['AnnualIncome']
    chunk['DebtToIncomeRatio'] = monthly_debt.div(annual_income)
    return chunk


In [None]:
def process_chunk(chunk):
    """Run the per-chunk pipeline on one DataFrame.

    Missing values are handled first, then the engineered features are added
    to the result.

    Args:
        chunk: DataFrame holding one chunk of the input data.

    Returns:
        Whatever ``feature_engineering`` returns for the imputed chunk.
    """
    return feature_engineering(handle_missing_values(chunk))


In [None]:
def handle_missing_values(chunk):
    """Impute missing values in the numeric columns of ``chunk``.

    Each numeric column's missing entries are replaced by that column's mean,
    computed over the rows of ``chunk``. Columns of other dtypes are not
    modified.

    Args:
        chunk: DataFrame to impute. Its numeric columns are overwritten.

    Returns:
        The same DataFrame object that was passed in.
    """
    numeric_names = chunk.select_dtypes(include='number').columns
    numeric_part = chunk[numeric_names]
    column_means = numeric_part.mean()
    chunk[numeric_names] = numeric_part.fillna(column_means)
    return chunk


In [None]:
import pandas as pd
from multiprocessing import Pool, cpu_count

# Load in chunks
def load_data_in_chunks(file_path, chunk_size=100000):
    """Return a chunked CSV reader for ``file_path``.

    Args:
        file_path: CSV file to read, forwarded to ``pd.read_csv``.
        chunk_size: Rows per chunk, forwarded as ``chunksize``.

    Returns:
        The reader produced by ``pd.read_csv``; iterate it to get DataFrames.
    """
    reader = pd.read_csv(file_path, chunksize=chunk_size)
    return reader

# Process one chunk (parallelized)
def process_chunk(chunk):
    """Process one chunk by imputing its missing values.

    This definition only calls ``handle_missing_values``; it performs no
    feature engineering.

    Args:
        chunk: DataFrame holding one chunk of the input data.

    Returns:
        The result of ``handle_missing_values(chunk)``.
    """
    cleaned = handle_missing_values(chunk)
    return cleaned

# Fix missing values only in numeric columns
def handle_missing_values(chunk):
    """Replace missing numeric entries with per-column means.

    Only columns selected by ``select_dtypes(include='number')`` are touched.
    The means come from the rows of ``chunk`` itself.

    Args:
        chunk: DataFrame to impute. Its numeric columns are reassigned.

    Returns:
        The same DataFrame object that was passed in.
    """
    cols = chunk.select_dtypes(include='number').columns
    numbers = chunk[cols]
    filled = numbers.fillna(numbers.mean())
    chunk[cols] = filled
    return chunk

# Parallel processing controller
def parallel_process_data(file_path, chunk_size=100000, num_workers=cpu_count()):
    """Process a CSV file chunk by chunk using a pool of worker processes.

    Args:
        file_path: CSV file to read.
        chunk_size: Number of rows per chunk handed to each task.
        num_workers: Number of worker processes. The default is evaluated once,
            when this function is defined.

    Returns:
        One DataFrame with all processed chunks concatenated in input order
        and a fresh 0..n-1 index.

    Note:
        ``Pool.map`` returns the full list of results, and every processed
        chunk is kept until the final concatenation, so peak memory is on the
        order of the whole dataset.
    """
    # Open the reader before starting workers, so a bad path fails
    # without leaving a pool of idle processes behind.
    chunk_iter = load_data_in_chunks(file_path, chunk_size)
    try:
        # The context manager tears the pool down even if a worker raises.
        # map() has already returned every result by the time the block exits.
        with Pool(processes=num_workers) as pool:
            processed_chunks = pool.map(process_chunk, chunk_iter)
    finally:
        # Release the underlying file handle whether or not processing succeeded.
        chunk_iter.close()
    return pd.concat(processed_chunks, ignore_index=True)

# Input CSV location (absolute path at the filesystem root).
file_path = '/financial_risk_analysis_large.csv'

# Run the full parallel pipeline once and keep the combined result.
processed_data = parallel_process_data(file_path)


In [None]:
import time

# Time one full parallel run. perf_counter() is a monotonic, high-resolution
# clock meant for measuring intervals; time.time() can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()
processed_data = parallel_process_data(file_path)
end = time.perf_counter()
print(f"Total time: {end - start:.2f} seconds")


Total time: 13.18 seconds


In [None]:
pip install psutil




In [None]:
import psutil

def get_memory_usage():
    """Return the resident set size of the current process in MB.

    Only the process identified by ``os.getpid()`` is measured.

    Returns:
        The RSS in bytes divided by 1024 twice, as a float.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / 1024 / 1024  # bytes -> MB

# Report this process's memory before and after one full parallel run.
# get_memory_usage() measures only the process running this cell.
print(f"Before processing: {get_memory_usage()} MB")
processed_data = parallel_process_data(file_path)
print(f"After processing: {get_memory_usage()} MB")


Before processing: 1179.9140625 MB
After processing: 1234.2265625 MB


In [None]:
# Write the processed result to a CSV in the current working directory,
# omitting the DataFrame index column.
processed_data.to_csv('processed_financial_risk_data.csv', index=False)


In [None]:
# Smoke test: run the per-chunk processing on the first 10,000 rows only
# and save the result for inspection.
test_chunk = pd.read_csv(file_path, nrows=10000)
test_data = process_chunk(test_chunk)
test_data.to_csv('test_processed_data.csv', index=False)


In [None]:
import time

# Compare wall time of the sequential and parallel paths on the same file.
# perf_counter() is monotonic, so the measured intervals cannot be skewed by
# system clock adjustments the way time.time() differences can.

# Sequential baseline: process each chunk in this process, then concatenate.
start_time = time.perf_counter()
sequential_data = pd.concat([process_chunk(chunk) for chunk in load_data_in_chunks(file_path)], ignore_index=True)
end_time = time.perf_counter()
print(f"Sequential processing took {end_time - start_time:.2f} seconds")

# Parallel run over the same input.
# NOTE(review): in the recorded output the parallel run is slower than the
# sequential one. The cost of shipping each chunk to a worker and back is a
# likely cause — confirm by profiling.
start_time = time.perf_counter()
parallel_data = parallel_process_data(file_path)
end_time = time.perf_counter()
print(f"Parallel processing took {end_time - start_time:.2f} seconds")


Sequential processing took 6.94 seconds
Parallel processing took 12.67 seconds
