# Data Cleaning and Quality Analysis

This notebook loads the BBBP (Blood-Brain Barrier Penetration) dataset, validates its SMILES strings, handles missing values, and performs a comprehensive data quality analysis.

## Objectives:
- Load and explore the BBBP dataset
- Validate SMILES strings using RDKit
- Identify and handle missing values
- Analyze data quality and distribution
- Prepare clean dataset for descriptor calculation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Draw
import sys
import os

# Add src directory to path
sys.path.append('../src')

from data_handler import DataHandler

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
plt.style.use('default')
sns.set_palette("husl")

# pandas display: set both the column-count and width options to None
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

print("Libraries imported successfully!")

## 1. Load BBBP Dataset

In [None]:
# Build the project's data-access helper, then use it to load BBBP into `df`
data_handler = DataHandler()

print("Loading BBBP dataset...")
df = data_handler.load_bbbp_data()

# Report the frame's dimensions and column names straight after loading
frame_shape = df.shape
column_names = list(df.columns)
print("Dataset loaded successfully!")
print(f"Shape: {frame_shape}")
print(f"Columns: {column_names}")

# Preview the first rows (rendered as this cell's output)
df.head()

## 2. Basic Dataset Information

In [None]:
# Overview of the raw frame: size, per-column dtypes, null counts, summary stats
n_compounds = len(df)
n_features = df.shape[1]
print("=== Dataset Information ===")
print(f"Total number of compounds: {n_compounds}")
print(f"Number of features: {n_features}")

column_dtypes = df.dtypes
print("\n=== Data Types ===")
print(column_dtypes)

nulls_per_column = df.isna().sum()
print("\n=== Missing Values ===")
print(nulls_per_column)

# describe() is left as the last expression so it renders as the cell output
print("\n=== Basic Statistics ===")
df.describe()

## 3. Class Distribution Analysis

In [None]:
# Analyze class distribution.
# value_counts() orders its result by frequency, not by class label, so both
# series are sorted by label here. The plots below name each bar / wedge from
# the series index; with the unsorted result and a fixed
# ['Non-permeable', 'Permeable'] list, the names would be swapped whenever
# class 1 is more frequent than class 0.
class_counts = df['p_np'].value_counts().sort_index()
class_percentages = df['p_np'].value_counts(normalize=True).sort_index() * 100

print("=== BBB Permeability Class Distribution ===")
print(f"Non-permeable (0): {class_counts[0]} ({class_percentages[0]:.1f}%)")
print(f"Permeable (1): {class_counts[1]} ({class_percentages[1]:.1f}%)")

# Look up the display name and colour of every class actually present, so a
# bar / wedge can never carry another class's name. Unknown labels fall back
# to their own string form and a neutral colour.
class_names = {0: 'Non-permeable', 1: 'Permeable'}
class_colors = {0: 'lightcoral', 1: 'lightblue'}
plot_labels = [class_names.get(label, str(label)) for label in class_counts.index]
plot_colors = [class_colors.get(label, 'lightgray') for label in class_counts.index]

# Visualize class distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar plot
class_counts.plot(kind='bar', ax=ax1, color=plot_colors)
ax1.set_title('BBB Permeability Class Distribution')
ax1.set_xlabel('Permeability Class')
ax1.set_ylabel('Count')
ax1.set_xticklabels(plot_labels, rotation=0)

# Pie chart
ax2.pie(class_counts.values, labels=plot_labels,
        autopct='%1.1f%%', colors=plot_colors)
ax2.set_title('BBB Permeability Distribution')

plt.tight_layout()
plt.show()

## 4. SMILES Validation

In [None]:
# Validate SMILES strings
print("Validating SMILES strings...")

valid_smiles = []    # positional row numbers whose SMILES validated
invalid_smiles = []  # positional row numbers whose SMILES did not validate
mol_objects = []     # one entry per row: the returned molecule, or None

# validate_smiles is called once per row; its result is unpacked as
# (is_valid, mol). enumerate() yields *positions*, not index labels.
for idx, smiles in enumerate(df['smiles']):
    is_valid, mol = data_handler.validate_smiles(smiles)
    if is_valid:
        valid_smiles.append(idx)
        mol_objects.append(mol)
    else:
        invalid_smiles.append(idx)
        mol_objects.append(None)

# Add molecule objects to dataframe (list order matches row order)
df['mol_object'] = mol_objects

# Avoid a ZeroDivisionError when the frame is empty
denominator = max(len(df), 1)
print(f"Valid SMILES: {len(valid_smiles)} ({len(valid_smiles)/denominator*100:.1f}%)")
print(f"Invalid SMILES: {len(invalid_smiles)} ({len(invalid_smiles)/denominator*100:.1f}%)")

if invalid_smiles:
    print("\nInvalid SMILES examples:")
    for idx in invalid_smiles[:5]:  # Show first 5 invalid SMILES
        # idx is a position, so use .iloc; label-based .loc would pick the
        # wrong row (or raise KeyError) if the index is not 0..n-1.
        print(f"Index {idx}: {df['smiles'].iloc[idx]}")

## 5. Data Quality Assessment

In [None]:
# --- Duplicate detection on the raw SMILES and name columns ---
print("=== Duplicate Analysis ===")
duplicate_smiles = df['smiles'].duplicated().sum()
duplicate_names = df['name'].duplicated().sum()

print(f"Duplicate SMILES: {duplicate_smiles}")
print(f"Duplicate names: {duplicate_names}")

if duplicate_smiles > 0:
    print("\nDuplicate SMILES examples:")
    # keep=False flags every member of a duplicated group, not only the repeats
    repeated_mask = df['smiles'].duplicated(keep=False)
    duplicates = df.loc[repeated_mask].sort_values('smiles')
    print(duplicates[['name', 'smiles', 'p_np']].head(10))

# --- SMILES string length (stored on df for use by later cells) ---
print("\n=== SMILES Length Analysis ===")
df['smiles_length'] = df['smiles'].str.len()
smiles_lengths = df['smiles_length']
print(f"SMILES length - Min: {smiles_lengths.min()}, Max: {smiles_lengths.max()}, Mean: {smiles_lengths.mean():.1f}")

# Histogram of the lengths, drawn through the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(smiles_lengths, bins=50, alpha=0.7, edgecolor='black')
ax.set_xlabel('SMILES Length')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of SMILES String Lengths')
ax.grid(True, alpha=0.3)
plt.show()

## 6. Sample Molecular Structures

In [None]:
# Display sample molecular structures
print("Sample molecular structures:")


def show_class_samples(frame, class_value, heading, n_samples=3):
    """Print and draw up to ``n_samples`` compounds of one permeability class.

    Args:
        frame: DataFrame with 'p_np', 'name', 'smiles' and 'mol_object' columns.
        class_value: Value of 'p_np' that selects the class to show.
        heading: Title printed (between '===' markers) above the samples.
        n_samples: Maximum number of compounds to show.

    Returns:
        The slice of ``frame`` holding the compounds that were shown.
    """
    # Drop rows without a molecule object *before* taking the first
    # n_samples, so such rows do not use up sample slots.
    selected = (frame['p_np'] == class_value) & frame['mol_object'].notna()
    samples = frame[selected].head(n_samples)

    print(f"\n=== {heading} ===")
    for _, row in samples.iterrows():
        print(f"Name: {row['name']}")
        print(f"SMILES: {row['smiles']}")
        display(Draw.MolToImage(row['mol_object'], size=(300, 300)))
        print("-" * 50)
    return samples


# Get and show samples from each class
permeable_samples = show_class_samples(df, 1, "BBB Permeable Molecules")
non_permeable_samples = show_class_samples(df, 0, "BBB Non-Permeable Molecules")

## 7. Data Cleaning and Preparation

In [None]:
# Create clean dataset
print("Preparing clean dataset...")

# Remove rows with invalid SMILES (rows whose mol_object is missing)
clean_df = df[df['mol_object'].notna()].copy()

# Remove duplicates based on SMILES (keep first occurrence).
# NOTE(review): if one SMILES occurs with different p_np labels, only the
# first row's label survives - confirm this is acceptable.
initial_count = len(clean_df)
clean_df = clean_df.drop_duplicates(subset=['smiles'], keep='first')
duplicates_removed = initial_count - len(clean_df)

print(f"Original dataset: {len(df)} compounds")
# initial_count was taken right after the invalid-SMILES filter, so it is
# reused here instead of filtering df a second time.
print(f"After removing invalid SMILES: {initial_count} compounds")
print(f"After removing duplicates: {len(clean_df)} compounds")
print(f"Total compounds removed: {len(df) - len(clean_df)}")

# Final class distribution
final_class_counts = clean_df['p_np'].value_counts()
final_class_percentages = clean_df['p_np'].value_counts(normalize=True) * 100

print("\n=== Final Class Distribution ===")
print(f"Non-permeable (0): {final_class_counts[0]} ({final_class_percentages[0]:.1f}%)")
print(f"Permeable (1): {final_class_counts[1]} ({final_class_percentages[1]:.1f}%)")

# Save clean dataset.
# mol_object holds in-memory molecule objects; written to CSV they would turn
# into object-repr strings that cannot be loaded back, so the column is left
# out of the file. It stays on clean_df for use in this session.
output_path = '../data/BBBP_clean.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
clean_df.drop(columns=['mol_object']).to_csv(output_path, index=False)
print(f"\nClean dataset saved to '{output_path}'")

## 8. Data Quality Summary

In [None]:
# Final report: dataset sizes, removals, class balance and SMILES-length stats,
# all read from variables produced by the earlier cells.
n_original = len(df)
n_clean = len(clean_df)

print("=== DATA QUALITY SUMMARY ===")
print(f"Original dataset size: {n_original} compounds")
print(f"Clean dataset size: {n_clean} compounds")
print(f"Data retention rate: {n_clean/n_original*100:.1f}%")

n_invalid = len(invalid_smiles)
print(f"\nInvalid SMILES removed: {n_invalid}")
print(f"Duplicate SMILES removed: {duplicates_removed}")

print("\nFinal class balance:")
print(f"  - Non-permeable: {final_class_counts[0]} ({final_class_percentages[0]:.1f}%)")
print(f"  - Permeable: {final_class_counts[1]} ({final_class_percentages[1]:.1f}%)")

print("\nSMILES length statistics:")
clean_lengths = clean_df['smiles_length']
print(f"  - Min: {clean_lengths.min()}")
print(f"  - Max: {clean_lengths.max()}")
print(f"  - Mean: {clean_lengths.mean():.1f}")
print(f"  - Std: {clean_lengths.std():.1f}")

print("\n=== RECOMMENDATIONS ===")
if n_invalid:
    print(f"- {n_invalid} invalid SMILES were removed from the dataset")
if duplicates_removed > 0:
    print(f"- {duplicates_removed} duplicate compounds were removed")

# Flag imbalance when class 0's share is more than 10 points away from 50%
share_non_permeable = final_class_percentages[0]
if abs(share_non_permeable - 50) > 10:
    print(f"- Dataset shows class imbalance ({share_non_permeable:.1f}% vs {final_class_percentages[1]:.1f}%)")
    print("  Consider using stratified sampling or class balancing techniques")
print("- Dataset is ready for molecular descriptor calculation")
print("- Proceed to notebook 02_descriptor_summary.ipynb for descriptor analysis")