In [2]:
import pandas as pd
import numpy as np

# Load data
# NOTE(review): absolute, machine-specific path kept unchanged; consider a
# configurable data directory so the notebook runs on other machines.
RAW_CSV_PATH = r"D:\DATA ANALYSIS PROJECTS\CERDIT CARD\credit_card.csv"
CLEANED_CSV_NAME = "credit_card_cleaned.csv"

# Columns that must end up numeric, and text columns whose labels are normalized.
NUMERIC_COLS = [
    'Annual_Fees',
    'Customer_Acq_Cost',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Total_Trans_Amt',
    'Total_Trans_Vol',
    'Avg_Utilization_Ratio',
    'Interest_Earned',
    'Delinquent_Acc',
]
CATEGORY_COLS = ['Card_Category', 'Use Chip', 'Exp Type']


def clean_credit_card_data(raw: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw credit-card frame.

    Steps, in order:
      1. Parse ``Week_Start_Date`` as day-first dates (unparseable -> NaT).
      2. Coerce the columns in ``NUMERIC_COLS`` to numeric (unparseable -> NaN).
      3. Fill missing numeric values with the column median.
      4. Fill missing values in the remaining non-numeric, non-datetime
         columns with ``'Unknown'``.
      5. Strip whitespace and title-case the columns in ``CATEGORY_COLS``.
      6. Add ``Activation_Status``, ``Total_Revenue`` and ``Quarter_Year``.
      7. Drop fully duplicated rows.

    The input frame is not modified, so the cell can be re-run safely.

    Raises:
        KeyError: if any of the columns referenced above is absent.
    """
    cleaned = raw.copy()

    # ---------- 1. Date Formatting ----------
    cleaned['Week_Start_Date'] = pd.to_datetime(
        cleaned['Week_Start_Date'], errors='coerce', dayfirst=True
    )

    # ---------- 2. Convert Data Types ----------
    # Coerce BEFORE imputing: otherwise the NaNs produced by errors='coerce'
    # are never filled, and a numeric column that was read as text would be
    # treated as categorical and receive 'Unknown' instead of its median.
    for col in NUMERIC_COLS:
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')

    # ---------- 3. Handle Missing Values ----------
    # Numeric columns: fill with the per-column median.
    num_cols = cleaned.select_dtypes(include=[np.number]).columns
    cleaned[num_cols] = cleaned[num_cols].fillna(cleaned[num_cols].median())

    # Text-like columns: fill with 'Unknown'. Datetime columns are excluded so
    # that Week_Start_Date keeps its datetime dtype (missing dates stay NaT)
    # instead of being turned into a mixed date/string column.
    cat_cols = cleaned.select_dtypes(exclude=[np.number, 'datetime']).columns
    cleaned[cat_cols] = cleaned[cat_cols].fillna('Unknown')

    # ---------- 4. Standardize Categories ----------
    for col in CATEGORY_COLS:
        cleaned[col] = cleaned[col].str.strip().str.title()

    # ---------- 5. Create New Useful Columns ----------
    # Activation Status: 'Activated' only where Activation_30_Days equals 1.
    cleaned['Activation_Status'] = np.where(
        cleaned['Activation_30_Days'] == 1, 'Activated', 'Not Activated'
    )

    # Total Revenue (Annual Fees + Interest)
    cleaned['Total_Revenue'] = cleaned['Annual_Fees'] + cleaned['Interest_Earned']

    # Quarter-Year column, formatted as "<Qtr>-<current_year>"
    cleaned['Quarter_Year'] = (
        cleaned['Qtr'].astype(str) + '-' + cleaned['current_year'].astype(str)
    )

    # ---------- 6. Remove Duplicates ----------
    return cleaned.drop_duplicates()


df = clean_credit_card_data(pd.read_csv(RAW_CSV_PATH))

# ---------- 7. Save Cleaned Data ----------
# Relative path: the file lands in the notebook's current working directory.
df.to_csv(CLEANED_CSV_NAME, index=False)

print("Data cleaned and saved as 'credit_card_cleaned.csv'")


Data cleaned and saved as 'credit_card_cleaned.csv'


In [4]:
# Save cleaned DataFrame to CSV
# Write next to the raw file under a separate name. The previous target was
# the raw input itself (credit_card.csv), which overwrote the source data and
# contradicted the message printed below.
df.to_csv(r"D:\DATA ANALYSIS PROJECTS\CERDIT CARD\credit_card_cleaned.csv", index=False)

print("✅ Cleaned data saved as 'credit_card_cleaned.csv'")


✅ Cleaned data saved as 'credit_card_cleaned.csv'


In [6]:
import os

# Destination: "credit_card_cleaned.csv" inside the current user's Downloads folder
home_dir = os.path.expanduser("~")
downloads_path = os.path.join(home_dir, "Downloads", "credit_card_cleaned.csv")

# Write the frame there, omitting the index column
df.to_csv(downloads_path, index=False)

print(f"✅ File saved to: {downloads_path}")


✅ File saved to: C:\Users\Sudip\Downloads\credit_card_cleaned.csv
