
# ExoHabitAI ‚Äì Module 1: Data Collection & Management

üìå **Objective**  
Safely load raw NASA Exoplanet Archive data, standardize it, validate quality,
and store it for downstream processing.

‚ö†Ô∏è Logic is **identical** to the original implementation.
Only the notebook structure is changed for explanation.


## 1Ô∏è‚É£ Imports

In [15]:
import pandas as pd
import numpy as np
import os
import sqlite3
from datetime import datetime
import warnings
import csv
warnings.filterwarnings('ignore')


## 2Ô∏è‚É£ ExoplanetDataCollector Class

Handles:
- Irregular NASA CSV loading
- Column standardization
- Schema validation
- Database & CSV storage


### 2.1 `__init__()` ‚Äì Initialize Collector

In [16]:
import os
import csv
import sqlite3
import numpy as np
import pandas as pd


class ExoplanetDataCollector:
    def __init__(self, db_path='exoplanet_data.db'):
        self.db_path = db_path
        self.data_source = "Unknown"

    def load_nasa_csv_safe(self, filepath):
        try:
            df = pd.read_csv(
                filepath,
                comment='#',
                low_memory=False
            )

            df.columns = [c.strip() for c in df.columns]
            self.data_source = "NASA Exoplanet Archive CSV"
            return df

        except Exception:
            return self.load_csv_manually(filepath)

    def load_csv_manually(self, filepath):
        data, headers = [], []

        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)

            for row in reader:
                if any('pl_name' in str(c).lower() for c in row):
                    headers = row
                    break

            for row in reader:
                if len(row) < len(headers):
                    row += [''] * (len(headers) - len(row))
                data.append(row)

        return pd.DataFrame(data, columns=headers)

    def load_local_data(self, filepath):
        if not os.path.exists(filepath):
            return None

        df = self.load_nasa_csv_safe(filepath)

        if df is not None and not df.empty:
            df = self.standardize_column_names(df)
            df = self.filter_for_required_columns(df)
            return df

        return None

    def standardize_column_names(self, df):
        df.columns = [
            str(c).lower()
            .replace(' ', '_')
            .replace('[', '')
            .replace(']', '')
            for c in df.columns
        ]
        return df

    def filter_for_required_columns(self, df):
        required = [
            'pl_name', 'hostname', 'pl_rade', 'pl_bmasse', 'pl_orbper',
            'pl_orbsmax', 'pl_eqt', 'st_teff', 'st_rad', 'st_mass',
            'sy_dist', 'disc_year', 'discoverymethod'
        ]

        for c in required:
            if c not in df.columns:
                df[c] = np.nan

        return df

    def validate_schema(self, df):
        for c in ['pl_name', 'pl_rade', 'pl_bmasse']:
            if c not in df.columns:
                return False
        return True

    def clean_data(self, df):
        df = df[df['pl_name'].notna()].copy()

        numeric_cols = [
            'pl_rade', 'pl_bmasse', 'pl_orbper', 'pl_orbsmax',
            'pl_eqt', 'st_teff', 'st_rad', 'st_mass', 'sy_dist'
        ]

        for c in numeric_cols:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce')

        df['pl_dens'] = df['pl_bmasse'] / (df['pl_rade'] ** 3)

        return df

    def save_to_database(self, df):
        conn = sqlite3.connect(self.db_path)
        df.to_sql('exoplanets_raw', conn, if_exists='replace', index=False)
        conn.close()

    def save_to_csv(self, df):
        df.to_csv('exoplanets_processed.csv', index=False)
        df.head(100).to_csv('exoplanets_sample.csv', index=False)

    def get_data_summary(self, df):
        print('Total planets:', len(df))
        print('Columns:', len(df.columns))
        print('Radius range:', df['pl_rade'].min(), '‚Üí', df['pl_rade'].max())



### 2.2 `load_nasa_csv_safe()`

Attempts to safely load NASA CSV files that may:
- Contain metadata
- Have inconsistent headers
- Include commented lines


In [17]:

    def load_nasa_csv_safe(self, filepath):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            data_start_line = 0
            for i, line in enumerate(lines):
                if any(k in line.lower() for k in ['pl_name', 'hostname']):
                    data_start_line = i
                    break

            if data_start_line == 0:
                df = pd.read_csv(filepath, comment='#', low_memory=False)
            else:
                df = pd.read_csv(filepath, skiprows=data_start_line, low_memory=False)

            df.columns = [c.strip() for c in df.columns]
            self.data_source = "NASA Exoplanet Archive CSV"
            return df
        except:
            return self.load_csv_manually(filepath)


### 2.3 `load_csv_manually()`

Fallback loader using Python's CSV reader when pandas fails.


In [18]:
    def load_csv_manually(self, filepath):
        data, headers = [], []
        with open(filepath, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                if any('pl_name' in str(c).lower() for c in row):
                    headers = row
                    break
            for row in reader:
                if len(row) < len(headers):
                    row += [''] * (len(headers) - len(row))
                data.append(row)
        return pd.DataFrame(data, columns=headers)


### 2.4 `load_local_data()`

Tries multiple strategies to load CSV from disk safely.


In [19]:
    def load_local_data(self, filepath):
        if not os.path.exists(filepath):
            return None
        df = self.load_nasa_csv_safe(filepath)
        if df is not None and not df.empty:
            df = self.standardize_column_names(df)
            df = self.filter_for_required_columns(df)
            return df
        return None


### 2.5 `standardize_column_names()`

Maps different NASA column names into a consistent internal schema.


In [20]:
def standardize_column_names(self, df):
        df.columns = [str(c).lower().replace(' ', '_').replace('[','').replace(']','') for c in df.columns]
        return df


### 2.6 `filter_for_required_columns()`

Ensures core scientific features exist; fills missing ones with NaN.


In [21]:
    def filter_for_required_columns(self, df):
        required = ['pl_name','hostname','pl_rade','pl_bmasse','pl_orbper',
                    'pl_orbsmax','pl_eqt','st_teff','st_rad','st_mass','sy_dist',
                    'disc_year','discoverymethod']
        for c in required:
            if c not in df.columns:
                df[c] = np.nan
        return df


### 2.7 `validate_schema()`

Checks if minimum viable scientific data exists.


In [22]:
    def validate_schema(self, df):
        for c in ['pl_name','pl_rade','pl_bmasse']:
            if c not in df.columns:
                return False
        return True


### 2.8 `clean_data()`

Removes invalid rows, converts numerics, computes planet density.


In [23]:
    def clean_data(self, df):
        df = df[df['pl_name'].notna()].copy()
        for c in ['pl_rade','pl_bmasse','pl_orbper','pl_orbsmax','pl_eqt','st_teff','st_rad','st_mass','sy_dist']:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors='coerce')
        df['pl_dens'] = df['pl_bmasse'] / (df['pl_rade'] ** 3)
        return df


### 2.9 `save_to_database()`

Stores cleaned data into SQLite for reuse.


In [24]:
    def save_to_database(self, df):
        conn = sqlite3.connect(self.db_path)
        df.to_sql('exoplanets_raw', conn, if_exists='replace', index=False)
        conn.close()


### 2.10 `save_to_csv()`

Exports cleaned data to CSV files.


In [25]:
    def save_to_csv(self, df):
        df.to_csv('exoplanets_processed.csv', index=False)
        df.head(100).to_csv('exoplanets_sample.csv', index=False)


### 2.11 `get_data_summary()`

Prints human-readable statistics.


In [26]:
    def get_data_summary(self, df):
        print('Total planets:', len(df))
        print('Columns:', len(df.columns))
        print('Radius range:', df['pl_rade'].min(), '‚Üí', df['pl_rade'].max())

In [27]:
print(dir(ExoplanetDataCollector))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'clean_data', 'filter_for_required_columns', 'get_data_summary', 'load_csv_manually', 'load_local_data', 'load_nasa_csv_safe', 'save_to_csv', 'save_to_database', 'standardize_column_names', 'validate_schema']


## 3Ô∏è‚É£ Run Module 1 Pipeline

In [28]:
collector = ExoplanetDataCollector()

csv_path = "A:\Github\Habitability_of_Exoplanets\exoplanet.csv"  

df = collector.load_local_data(csv_path)

if df is None or df.empty:
    print("Failed to load CSV or CSV is empty")
else:
    df = collector.clean_data(df)

    if collector.validate_schema(df):
        collector.save_to_database(df)
        collector.save_to_csv(df)
        collector.get_data_summary(df)
    else:
        print("Schema validation failed")


Total planets: 39119
Columns: 94
Radius range: 0.27 ‚Üí 4282.98



## ‚úÖ Module 1 Completed

Outputs:
- exoplanet_data.db
- exoplanets_processed.csv
- exoplanets_sample.csv

‚û°Ô∏è Next: Module 2 ‚Äì Cleaning & Feature Engineering
