In [11]:
import pandas as pd
import os

In [12]:
def filter_ticker(df, filename):
    # Check if required columns exist
    required_columns = ['Ticker', 'Time', 'Date']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Warning: Missing columns in {filename}: {', '.join(missing_columns)}")
        return None

    # Filter and process the dataframe
    df = df = df[
        df['Ticker'].str.match(r'BANKNIFTY-I(\.NFO)?$', na=False) &
        ~df['Ticker'].str.contains('CE', na=False) &
        ~df['Ticker'].str.contains('PE', na=False)
    ]
    
    df = df.sort_values(by=['Time', 'Ticker', 'Date'])
    
    # Drop 'Volume' and 'Open Interest' if they exist
    columns_to_drop = ['Volume', 'Open Interest']
    existing_columns = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns=existing_columns)

    # Add filename column
    df['Filename'] = filename

    return df

In [13]:
path = 'data/Raw GFDL since 2011'
output_path = 'output'
visited_file = 'output/visited.csv'

# Read the visited files CSV
file_visited_df = pd.read_csv(visited_file)

# Ensure the output directory exists
os.makedirs(output_path, exist_ok=True)

for (dirpath, dirnames, filenames) in os.walk(path):
    for filename in filenames:
        if filename in file_visited_df['filename'].values:
            continue  # Skip this file if it's already been processed
        
        current_file = os.path.join(dirpath, filename)
        try:
            df = pd.read_csv(current_file, encoding='utf-8')
            df_modified = filter_ticker(df, filename)
            
            if df_modified is not None and not df_modified.empty:
                # Append the modified data to the output CSV
                output_file = os.path.join(output_path, 'out.csv')
                df_modified.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False, encoding='utf-8')
                
                # Update file_visited_df with the new filename
                new_row = pd.DataFrame({'filename': [filename]})
                file_visited_df = pd.concat([file_visited_df, new_row], ignore_index=True)
                
                print(f"File processed: {filename}")
            else:
                print(f"No data to write after filtering: {filename}")
        except Exception as e:
            print(f"Error processing file {filename}: {str(e)}")

# Save the updated file_visited_df
file_visited_df.to_csv(visited_file, index=False)
print("All files processed. Updated visited files list saved.")

File processed: GFDLNFO_BACKADJUSTED_03072024.csv
File processed: GFDLNFO_BACKADJUSTED_08072024.csv
File processed: GFDLNFO_BACKADJUSTED_11072024.csv
File processed: GFDLNFO_BACKADJUSTED_05072024.csv
File processed: GFDLNFO_BACKADJUSTED_02072024.csv
File processed: GFDLNFO_BACKADJUSTED_09072024.csv
File processed: GFDLNFO_BACKADJUSTED_04072024.csv
File processed: GFDLNFO_BACKADJUSTED_10072024.csv
File processed: GFDLNFO_BACKADJUSTED_15072024.csv
File processed: GFDLNFO_BACKADJUSTED_01072024.csv
File processed: GFDLNFO_BACKADJUSTED_12072024.csv
File processed: NSEFO_19082011.csv
File processed: NSEFO_12082011.csv
File processed: NSEFO_01082011.csv
File processed: NSEFO_18082011.csv
File processed: NSEFO_26082011.csv
File processed: NSEFO_23082011.csv
File processed: NSEFO_16082011.csv
File processed: NSEFO_04082011.csv
File processed: NSEFO_10082011.csv
File processed: NSEFO_25082011.csv
File processed: NSEFO_09082011.csv
File processed: NSEFO_22082011.csv
File processed: NSEFO_17082011