In [None]:

from pathlib import Path
import pandas as pd
import numpy as np

# Location of the time-series CSV, relative to the notebook's working directory.
DATASET_PATH = Path('../data/data-3/timeseries/89665.csv')

# Read the whole file into a DataFrame using pandas' default parsing options.
data = pd.read_csv(DATASET_PATH)

# Print a banner followed by the frame's dimensions and its column names.
n_rows, n_cols = data.shape
banner = "=" * 70
print(banner, "DATASET OVERVIEW", banner, sep="\n")
print(f"Dataset shape: {data.shape}")
print(f"Total cells: {n_rows * n_cols:,}")
print(f"\nColumns: {list(data.columns)}")


In [None]:

# Count missing values across the entire frame and express them as a share
# of all cells (rows x columns).
total_nans = data.isna().sum().sum()
total_cells = data.size  # number of rows times number of columns

# A frame with zero cells (e.g. a header-only CSV) would make this a 0/0
# division, which prints "nan%" and emits a NumPy RuntimeWarning. Report 0%
# in that case instead.
nan_percentage = (total_nans / total_cells) * 100 if total_cells else 0.0

print("="*70)
print("NaN SUMMARY")
print("="*70)
print(f"Total NaN values: {total_nans:,}")
print(f"Total cells: {total_cells:,}")
print(f"Percentage of NaNs: {nan_percentage:.4f}%")


In [None]:

# Per-column missing-value counts, plus the same counts as a percentage of
# the number of rows.
row_total = len(data)
nan_counts = data.isna().sum()
nan_percentages = nan_counts.div(row_total).mul(100)

# One summary row per column, listed from most to fewest missing values.
nan_df = pd.DataFrame({
    'Column': nan_counts.index,
    'NaN Count': nan_counts.to_numpy(),
    'NaN Percentage': nan_percentages.to_numpy(),
    'Non-NaN Count': row_total - nan_counts.to_numpy(),
})
nan_df = nan_df.sort_values('NaN Count', ascending=False)

rule = "=" * 70
print(f"\n{rule}\nNaN COUNT BY COLUMN\n{rule}")
print(nan_df.to_string(index=False))


In [None]:

# Keep only the summary rows for columns that have at least one missing value.
columns_with_nans = nan_df.loc[nan_df['NaN Count'].gt(0)]

rule = "=" * 70
if columns_with_nans.empty:
    # Nothing is missing anywhere: say so explicitly instead of printing an empty table.
    print(f"\n{rule}\nNO NaN VALUES FOUND IN ANY COLUMN\n{rule}")
else:
    print(f"\n{rule}\nCOLUMNS WITH NaN VALUES\n{rule}")
    print(columns_with_nans.to_string(index=False))


In [None]:

# Boolean mask (one entry per row): True where the row has at least one NaN.
rows_with_nans = data.isna().any(axis=1)
num_rows_with_nans = rows_with_nans.sum()

# With zero rows the share is a 0/0 division, which prints "nan%" and emits a
# NumPy RuntimeWarning. Report 0% for an empty frame instead.
total_rows = len(data)
rows_percentage = (num_rows_with_nans / total_rows) * 100 if total_rows else 0.0

print("\n" + "="*70)
print("ROWS WITH NaN VALUES")
print("="*70)
print(f"Number of rows with at least one NaN: {num_rows_with_nans:,}")
print(f"Percentage of rows affected: {rows_percentage:.4f}%")
print(f"Number of complete rows (no NaNs): {total_rows - num_rows_with_nans:,}")


In [None]:

# When anything is missing, show a few affected rows and where they sit in the index.
if total_nans > 0:
    rule = "=" * 70
    # Filter once; both the row preview and the index preview read from this.
    incomplete_rows = data[rows_with_nans]

    print(f"\n{rule}\nSAMPLE OF ROWS WITH NaN VALUES (First 10)\n{rule}")
    rows_with_nans_data = incomplete_rows.head(10)
    print(rows_with_nans_data)

    # Index labels of the first 20 affected rows.
    print(f"\n{rule}\nROW INDICES WITH NaN VALUES (First 20)\n{rule}")
    nan_indices = incomplete_rows.index[:20].tolist()
    print(f"Indices: {nan_indices}")

    # Mention how many affected rows were left out of the preview, if any.
    not_listed = num_rows_with_nans - 20
    if not_listed > 0:
        print(f"... and {not_listed:,} more rows")


In [None]:

# Heatmap-style view of where NaNs occur: one image row per column that has
# missing values, one image column per data row (first rows only).
if total_nans > 0:
    # Imported here so matplotlib is only required when there is something to plot.
    import matplotlib.pyplot as plt

    # Limit to columns with NaNs for better visualization
    cols_with_nans = nan_counts[nan_counts > 0].index.tolist()

    if len(cols_with_nans) > 0:
        print("\n" + "="*70)
        print("NaN PATTERN VISUALIZATION")
        print("="*70)

        # Only the leading rows are drawn so the image stays readable on large data.
        sample_size = min(1000, len(data))
        data_sample = data[cols_with_nans].head(sample_size)

        fig, ax = plt.subplots(figsize=(12, 8))

        # Create a binary matrix (1 for NaN, 0 for not NaN)
        nan_matrix = data_sample.isna().astype(int)

        # Pin the colour scale to [0, 1]. Without fixed limits the scale is taken
        # from the sample itself, so a constant sample (all NaN or all valid) has
        # no range to scale over and the colours would not reliably match the
        # "1 = NaN, 0 = Valid" legend.
        im = ax.imshow(nan_matrix.T, aspect='auto', cmap='RdYlGn_r',
                       interpolation='nearest', vmin=0, vmax=1)

        ax.set_xlabel('Row Index', fontsize=12, fontweight='bold')
        ax.set_ylabel('Column', fontsize=12, fontweight='bold')
        ax.set_title(f'NaN Pattern (First {sample_size} rows, columns with NaNs only)',
                     fontsize=14, fontweight='bold')

        ax.set_yticks(range(len(cols_with_nans)))
        ax.set_yticklabels(cols_with_nans)

        # The data is binary, so only the two meaningful tick values are shown.
        plt.colorbar(im, ax=ax, label='1 = NaN, 0 = Valid', ticks=[0, 1])
        plt.tight_layout()
        plt.show()


In [None]:

# Report the dtype pandas assigned to each column.
print("\n" + "=" * 70, "DATA TYPES", "=" * 70, sep="\n")
print(data.dtypes)


In [None]:

# Infinite values are not reported by isna(), so numeric columns are checked
# for them separately.
print("\n" + "=" * 70, "INFINITE VALUES CHECK", "=" * 70, sep="\n")

numeric_cols = data.select_dtypes(include=[np.number]).columns

if len(numeric_cols) == 0:
    print("No numeric columns to check for infinite values")
else:
    # Count +/-inf entries in each numeric column, then keep only the columns that have any.
    per_column_infs = {col: np.isinf(data[col]).sum() for col in numeric_cols}
    inf_counts = {col: n_inf for col, n_inf in per_column_infs.items() if n_inf > 0}

    if not inf_counts:
        print("No infinite values found in numeric columns")
    else:
        print("Columns with infinite values:")
        for col, count in inf_counts.items():
            print(f"  {col}: {count} infinite values")


In [None]:

# Descriptive statistics from DataFrame.describe(), as a quick sanity check on the data.
print("\n" + "=" * 70, "DATA QUALITY SUMMARY", "=" * 70, sep="\n")
summary_stats = data.describe()
print(summary_stats)

