In [14]:
import pandas as pd

import matplotlib.pyplot as plt

# Plotting settings
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Configuration parameters
FILE_PATH = '../../data/raw/train.txt'
MAX_ROWS = 10000

# Define column names
cols = ['label'] + [f'I{i}' for i in range(1, 14)] + [f'C{i}' for i in range(1, 27)]
num_features = [f'I{i}' for i in range(1, 14)]

try:
    # Read data
    data = pd.read_csv(
        FILE_PATH,
        sep='\t',
        header=None,
        names=cols,
        nrows=MAX_ROWS
    )

    print(f"Data dimensions: {data.shape}")
    display(data.head().style.background_gradient(subset=num_features, cmap='Blues'))

    # Calculate statistics for each column
    stats = []
    total_memory = data.memory_usage(deep=True).sum()
    total_rows = len(data)  # Total number of samples for calculating missing ratio

    for col in data.columns:
        # Feature type
        dtype = str(data[col].dtype)

        # Calculate cardinality (number of distinct values, excluding missing values)
        cardinality = data[col].nunique(dropna=True)

        # Memory usage (in bytes)
        memory_usage = data[col].memory_usage(deep=True)

        # Memory usage ratio
        memory_ratio = (memory_usage / total_memory) * 100

        # Missing value statistics
        missing_count = data[col].isna().sum()  # Number of missing values
        missing_ratio = (missing_count / total_rows) * 100  # Missing value ratio (%)

        stats.append({
            'Feature': col,
            'Type': dtype,
            'Cardinality': cardinality,
            'Missing Count': missing_count,
            'Missing Ratio(%)': round(missing_ratio, 2),
            'Memory Usage(MB)': round(memory_usage / 1024 / 1024, 2),
            'Memory Ratio(%)': round(memory_ratio, 2)
        })

    # Convert to DataFrame and display
    stats_df = pd.DataFrame(stats)

    # Sort by missing ratio in descending order (can modify sorting field as needed)
    stats_df = stats_df.sort_values(by='Missing Ratio(%)', ascending=False)

    # Display results
    print("\nFeature Statistics (including missing value analysis):")
    display(stats_df.style
            .bar(subset=['Missing Ratio(%)'], color='salmon')  # Mark missing ratio with red bars
            .bar(subset=['Memory Ratio(%)'], color='lightblue')
            .background_gradient(subset=['Cardinality'], cmap='viridis'))

except FileNotFoundError:
    print(f"Error: File {FILE_PATH} not found, please check if the path is correct")
except Exception as e:
    print(f"An error occurred during data processing: {str(e)}")


Data dimensions: (10000, 40)


Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,1.0,2.0,,2.0,68fd1e64,80e26c9b,fb936136,7b4723c4,25c83c98,7e0ccccf,de7995b8,1f89b562,a73ee510,a8cd5504,b2cb9c98,37c9c164,2824a5f6,1adce6ef,8ba8b39a,891b62e7,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,1.0,1.0,,4.0,68fd1e64,f0cf0024,6f67f7e5,41274cd7,25c83c98,fe6b92e5,922afcc0,0b153874,a73ee510,2b53e5fb,4f1b46f3,623049e6,d7020589,b28479f6,e6c5b5cd,c92f3b61,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,1.0,3.0,3.0,45.0,287e684f,0a519c5c,02cf9876,c18be181,25c83c98,7e0ccccf,c78204a1,0b153874,a73ee510,3b08e48b,5f5e6091,8fe001f4,aa655a2f,07d13a8f,6dc710ed,36103458,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,,0.0,,,68fd1e64,2c16a946,a9a87e68,2e17d6f6,25c83c98,fe6b92e5,2e8a689b,0b153874,a73ee510,efea433b,e51ddf94,a30567ca,3516f6e6,07d13a8f,18231224,52b8680f,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,1.0,1.0,,0.0,8cf07265,ae46a29d,c81688bb,f922efad,25c83c98,13718bbd,ad9fa255,0b153874,a73ee510,5282c137,e5d8af57,66a76a26,f06c53ac,1adce6ef,8ff4b403,01adbab4,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,



Feature Statistics (including missing value analysis):


Unnamed: 0,Feature,Type,Cardinality,Missing Count,Missing Ratio(%),Memory Usage(MB),Memory Ratio(%)
35,C22,object,7,8182,81.82,0.36,2.23
12,I12,float64,26,7735,77.35,0.08,0.47
33,C20,object,3,4496,44.96,0.48,2.94
32,C19,object,546,4496,44.96,0.48,2.94
39,C26,object,1938,4496,44.96,0.48,2.94
38,C25,object,39,4496,44.96,0.48,2.94
10,I10,float64,6,4481,44.81,0.08,0.47
1,I1,float64,76,4481,44.81,0.08,0.47
6,I6,float64,825,2511,25.11,0.08,0.47
3,I3,float64,310,2037,20.37,0.08,0.47
