In [2]:
# -*- coding: utf-8 -*-
"""
Consolidated SpaceX Falcon 9 Landing Prediction Code.

This script combines steps from multiple notebooks:
1. Web Scraping from Wikipedia
2. API Data Collection from SpaceX API
3. Data Wrangling and Target Variable Creation
4. Basic EDA and Visualization (Matplotlib/Seaborn)
5. Feature Engineering (One-Hot Encoding)
6. Machine Learning Pipeline (Standardization, Train/Test Split, Model Training/Tuning, Evaluation)
7. Folium Map Visualization
"""

import sys
import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
# Corrected import for older scikit-learn versions
from sklearn.metrics import confusion_matrix, jaccard_similarity_score, f1_score
import folium
from folium.plugins import MarkerCluster, MousePosition
from folium.features import DivIcon
from math import sin, cos, sqrt, atan2, radians
import io # Required for loading data from URL in some environments

# Suppress potential warnings (optional)
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Helper Functions ---

# From Web Scraping Notebook
def date_time(table_cells):
    """
    This function returns the data and time from the HTML table cell
    Input: the element of a table data cell extracts extra row
    """
    return [data_time.strip() for data_time in list(table_cells.strings)][0:2]

def booster_version(table_cells):
    """
    This function returns the booster version from the HTML table cell
    Input: the element of a table data cell extracts extra row
    """
    out = ''.join([booster_version for i, booster_version in enumerate(table_cells.strings) if i % 2 == 0][0:-1])
    return out

def landing_status(table_cells):
    """
    This function returns the landing status from the HTML table cell
    Input: the element of a table data cell extracts extra row
    """
    out = [i for i in table_cells.strings][0]
    return out

def get_mass(table_cells):
    mass = unicodedata.normalize("NFKD", table_cells.text).strip()
    if mass:
        mass_find = mass.find("kg") # Use a different variable name
        if mass_find != -1:
             new_mass = mass[0:mass_find + 2]
        else:
             new_mass = 0 # Handle cases where 'kg' is not found
    else:
        new_mass = 0
    # Attempt to convert to float, handle potential errors
    try:
        # Extract numeric part before 'kg'
        numeric_part = new_mass.replace('kg', '').replace(',', '').strip()
        return float(numeric_part) if numeric_part else 0.0
    except:
        return 0.0 # Return 0 if conversion fails


def extract_column_from_header(row):
    """
    This function returns the column name from the HTML table header cell
    Input: the element of a table header cell extracts extra row
    """
    if row.br:
        row.br.extract()
    if row.a:
        row.a.extract()
    if row.sup:
        row.sup.extract()

    colunm_name = ' '.join(row.contents)

    # Filter the digit and empty names
    if not(colunm_name.strip().isdigit()):
        colunm_name = colunm_name.strip()
        return colunm_name

# From API Data Collection Notebook
def getBoosterVersion(data, BoosterVersionList):
    for x in data['rocket']:
        if x:
            try:
                response = requests.get("https://api.spacexdata.com/v4/rockets/" + str(x))
                response.raise_for_status() # Raise an exception for bad status codes
                response_json = response.json()
                BoosterVersionList.append(response_json.get('name', None)) # Use .get for safety
            except requests.exceptions.RequestException as e:
                print(f"API request failed for rocket {x}: {e}")
                BoosterVersionList.append(None)
            except ValueError: # Includes JSONDecodeError
                print(f"Failed to decode JSON for rocket {x}")
                BoosterVersionList.append(None)
        else:
            BoosterVersionList.append(None)

def getLaunchSite(data, LongitudeList, LatitudeList, LaunchSiteList):
    for x in data['launchpad']:
        if x:
            try:
                response = requests.get("https://api.spacexdata.com/v4/launchpads/" + str(x))
                response.raise_for_status()
                response_json = response.json()
                LongitudeList.append(response_json.get('longitude', None))
                LatitudeList.append(response_json.get('latitude', None))
                LaunchSiteList.append(response_json.get('name', None))
            except requests.exceptions.RequestException as e:
                print(f"API request failed for launchpad {x}: {e}")
                LongitudeList.append(None)
                LatitudeList.append(None)
                LaunchSiteList.append(None)
            except ValueError:
                print(f"Failed to decode JSON for launchpad {x}")
                LongitudeList.append(None)
                LatitudeList.append(None)
                LaunchSiteList.append(None)
        else:
            LongitudeList.append(None)
            LatitudeList.append(None)
            LaunchSiteList.append(None)

def getPayloadData(data, PayloadMassList, OrbitList):
    for load in data['payloads']:
        if load:
            try:
                response = requests.get("https://api.spacexdata.com/v4/payloads/" + load)
                response.raise_for_status()
                response_json = response.json()
                PayloadMassList.append(response_json.get('mass_kg', None))
                OrbitList.append(response_json.get('orbit', None))
            except requests.exceptions.RequestException as e:
                print(f"API request failed for payload {load}: {e}")
                PayloadMassList.append(None)
                OrbitList.append(None)
            except ValueError:
                print(f"Failed to decode JSON for payload {load}")
                PayloadMassList.append(None)
                OrbitList.append(None)
        else:
             PayloadMassList.append(None)
             OrbitList.append(None)

def getCoreData(data, OutcomeList, FlightsList, GridFinsList, ReusedList, LegsList, LandingPadList, BlockList, ReusedCountList, SerialList):
    for core in data['cores']:
        if core.get('core') is not None: # Use .get for safety
            try:
                response = requests.get("https://api.spacexdata.com/v4/cores/" + core['core'])
                response.raise_for_status()
                response_json = response.json()
                BlockList.append(response_json.get('block', None))
                ReusedCountList.append(response_json.get('reuse_count', None))
                SerialList.append(response_json.get('serial', None))
            except requests.exceptions.RequestException as e:
                print(f"API request failed for core {core.get('core')}: {e}")
                BlockList.append(None)
                ReusedCountList.append(None)
                SerialList.append(None)
            except ValueError:
                 print(f"Failed to decode JSON for core {core.get('core')}")
                 BlockList.append(None)
                 ReusedCountList.append(None)
                 SerialList.append(None)
        else:
            BlockList.append(None)
            ReusedCountList.append(None)
            SerialList.append(None)

        # Handle landing outcome string creation more robustly
        landing_success = core.get('landing_success')
        landing_type = core.get('landing_type')
        outcome_str = f"{landing_success} {landing_type}" if landing_success is not None and landing_type is not None else "None None"
        OutcomeList.append(outcome_str)

        FlightsList.append(core.get('flight', None))
        GridFinsList.append(core.get('gridfins', None))
        ReusedList.append(core.get('reused', None))
        LegsList.append(core.get('legs', None))
        LandingPadList.append(core.get('landpad', None))

# From Machine Learning Notebook
def plot_confusion_matrix(y, y_predict, title='Confusion Matrix'):
    """this function plots the confusion matrix"""
    cm = confusion_matrix(y, y_predict)
    plt.figure(figsize=(6, 5)) # Added figure creation
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax=ax, fmt='g', cmap='Blues') # Added fmt and cmap
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    ax.xaxis.set_ticklabels(['did not land', 'landed'])
    ax.yaxis.set_ticklabels(['did not land', 'landed'])
    plt.tight_layout() # Added tight layout
    # plt.show() # Display plot immediately (optional, depends on environment)

# From Folium Notebook
def calculate_distance(lat1, lon1, lat2, lon2):
    # approximate radius of earth in km
    R = 6373.0

    lat1_rad = radians(lat1)
    lon1_rad = radians(lon1)
    lat2_rad = radians(lat2)
    lon2_rad = radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = sin(dlat / 2)**2 + cos(lat1_rad) * cos(lat2_rad) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance

# --- 1. Web Scraping ---
print("--- 1. Starting Web Scraping ---")
static_url = "https://en.wikipedia.org/w/index.php?title=List_of_Falcon_9_and_Falcon_Heavy_launches&oldid=1027686922"
try:
    wiki_response = requests.get(static_url)
    wiki_response.raise_for_status()
    soup = BeautifulSoup(wiki_response.text, 'html.parser')
    print(f"Wikipedia page title: {soup.title.string if soup.title else 'No Title Found'}")

    html_tables = soup.find_all('table')
    # The target table is usually the 3rd one with this specific class
    first_launch_table = None
    for table in html_tables:
        # Check if the table has the specific classes
        table_classes = table.get('class', [])
        if 'wikitable' in table_classes and 'plainrowheaders' in table_classes and 'collapsible' in table_classes:
             # Heuristic: Check for expected header content more robustly
             th_texts = [th.get_text(strip=True) for th in table.find_all('th', scope='col')]
             # Check for a few key headers that are likely to be present
             if 'Flight No.' in th_texts and 'Launch site' in th_texts and 'Payload' in th_texts and 'Boosterlanding' in ''.join(th_texts).replace('<br/>',''):
                 first_launch_table = table
                 break # Found the likely table

    if first_launch_table:
        print("Found the target launch table.")
        column_names = []
        # Find the first 'tr' which usually contains the headers
        header_row = first_launch_table.find('tr')
        if header_row:
            for element in header_row.find_all('th'):
                name = extract_column_from_header(element)
                if name is not None and len(name) > 0:
                    column_names.append(name)
        print(f"Extracted column names: {column_names}")

        # Initialize dictionary for scraped data
        # Use a standard set of expected columns based on later processing steps
        expected_columns = [
            'Flight No.', 'Launch site', 'Payload', 'Payload mass', 'Orbit',
            'Customer', 'Launch outcome', 'Version Booster', 'Booster landing',
            'Date', 'Time'
        ]
        scraped_launch_dict = {name: [] for name in expected_columns}


        extracted_row = 0
        # Iterate through table rows, skipping the header row if necessary
        data_rows = first_launch_table.find_all("tr")
        if data_rows and data_rows[0].find_all('th'): # Check if first row is header
            data_rows = data_rows[1:] # Skip header row

        for rows in data_rows:
            # Check if the first element is a 'th' (like flight number) or 'td'
            first_cell = rows.find(['th', 'td'])
            if not first_cell: continue # Skip empty rows

            flight_number_text = first_cell.get_text(strip=True)
            flag = flight_number_text.isdigit()

            # Get all 'td' elements in the row
            row_cells = rows.find_all('td')

            # If it's a numbered row and has enough cells
            if flag and len(row_cells) >= 9:
                extracted_row += 1
                try:
                    # Flight Number value
                    scraped_launch_dict['Flight No.'].append(flight_number_text)

                    # Date and Time values
                    datatimelist = date_time(row_cells[0])
                    date = datatimelist[0].strip(',') if len(datatimelist) > 0 else None
                    time = datatimelist[1] if len(datatimelist) > 1 else None
                    scraped_launch_dict['Date'].append(date)
                    scraped_launch_dict['Time'].append(time)

                    # Booster version
                    bv = booster_version(row_cells[1])
                    if not bv and row_cells[1].find('a'): # Check if 'a' tag exists
                        bv = row_cells[1].find('a').string
                    scraped_launch_dict['Version Booster'].append(bv)

                    # Launch Site
                    launch_site = row_cells[2].find('a').string if row_cells[2].find('a') else None
                    scraped_launch_dict['Launch site'].append(launch_site)

                    # Payload
                    payload = row_cells[3].find('a').string if row_cells[3].find('a') else None
                    scraped_launch_dict['Payload'].append(payload)

                    # Payload Mass
                    payload_mass = get_mass(row_cells[4])
                    scraped_launch_dict['Payload mass'].append(payload_mass)

                    # Orbit
                    orbit = row_cells[5].find('a').string if row_cells[5].find('a') else None
                    scraped_launch_dict['Orbit'].append(orbit)

                    # Customer - Handle potential missing 'a' tag
                    customer_cell = row_cells[6]
                    customer_link = customer_cell.find('a')
                    customer = customer_link.string if customer_link else customer_cell.get_text(strip=True) # Fallback to cell text
                    scraped_launch_dict['Customer'].append(customer if customer else None)


                    # Launch outcome
                    launch_outcome = list(row_cells[7].strings)[0].strip() if list(row_cells[7].strings) else None
                    scraped_launch_dict['Launch outcome'].append(launch_outcome)

                    # Booster landing
                    booster_landing = landing_status(row_cells[8])
                    scraped_launch_dict['Booster landing'].append(booster_landing)

                except IndexError as ie:
                     print(f"Index error parsing row {extracted_row} (Flight No. {flight_number_text}): {ie}. Row cells: {len(row_cells)}")
                     # Append None to maintain structure if error occurs
                     for key in scraped_launch_dict:
                         if len(scraped_launch_dict[key]) < extracted_row:
                             scraped_launch_dict[key].append(None)
                except Exception as e:
                    print(f"General error parsing row {extracted_row} (Flight No. {flight_number_text}): {e}")
                    # Append None to maintain structure if error occurs
                    for key in scraped_launch_dict:
                        if len(scraped_launch_dict[key]) < extracted_row:
                            scraped_launch_dict[key].append(None)

        # Check if data was extracted
        if extracted_row > 0:
             # Ensure all lists have the same length before creating DataFrame
             max_len = max(len(lst) for lst in scraped_launch_dict.values())
             for key in scraped_launch_dict:
                 if len(scraped_launch_dict[key]) < max_len:
                     scraped_launch_dict[key].extend([None] * (max_len - len(scraped_launch_dict[key])))

             df_scraped = pd.DataFrame(scraped_launch_dict)
             print("Wikipedia Scraped Data Head:")
             print(df_scraped.head())
             # df_scraped.to_csv('spacex_web_scraped.csv', index=False) # Optional save
        else:
             print("No data extracted from Wikipedia table.")
             df_scraped = pd.DataFrame() # Create empty df if no data

    else:
        print("Target Wikipedia table not found.")
        df_scraped = pd.DataFrame() # Create empty df if no table

except requests.exceptions.RequestException as e:
    print(f"Failed to fetch Wikipedia page: {e}")
    df_scraped = pd.DataFrame() # Create empty df on failure
except Exception as e:
    print(f"An error occurred during Wikipedia scraping: {e}")
    df_scraped = pd.DataFrame() # Create empty df on failure

print("--- Finished Web Scraping ---")


# --- 2. API Data Collection ---
print("\n--- 2. Starting API Data Collection ---")
spacex_api_url = "https://api.spacexdata.com/v4/launches/past"
try:
    api_response = requests.get(spacex_api_url)
    api_response.raise_for_status()
    print(f"API request status: {api_response.status_code}")
    api_data_raw = api_response.json()
    api_data = pd.json_normalize(api_data_raw)

    # Basic cleaning as per notebook
    api_data = api_data[['rocket', 'payloads', 'launchpad', 'cores', 'flight_number', 'date_utc']]
    # Handle potential errors if 'cores' or 'payloads' are not lists or are empty
    api_data = api_data[api_data['cores'].apply(lambda x: isinstance(x, list) and len(x) == 1)]
    api_data = api_data[api_data['payloads'].apply(lambda x: isinstance(x, list) and len(x) == 1)]
    api_data['cores'] = api_data['cores'].map(lambda x: x[0] if x else None) # Extract first item safely
    api_data['payloads'] = api_data['payloads'].map(lambda x: x[0] if x else None) # Extract first item safely
    api_data['date'] = pd.to_datetime(api_data['date_utc']).dt.date
    # Filter date as in notebook - Make sure 'date' column exists and is correct type
    api_data = api_data[api_data['date'] <= datetime.date(2020, 11, 13)]

    # Initialize lists for API data
    BoosterVersion = []
    PayloadMass = []
    Orbit = []
    LaunchSite = []
    Outcome = []
    Flights = []
    GridFins = []
    Reused = []
    Legs = []
    LandingPad = []
    Block = []
    ReusedCount = []
    Serial = []
    Longitude = []
    Latitude = []

    # Call helper functions to populate lists
    getBoosterVersion(api_data, BoosterVersion)
    getLaunchSite(api_data, Longitude, Latitude, LaunchSite)
    getPayloadData(api_data, PayloadMass, Orbit)
    getCoreData(api_data, Outcome, Flights, GridFins, Reused, Legs, LandingPad, Block, ReusedCount, Serial)

    # Construct DataFrame from API lists
    api_launch_dict = {'FlightNumber': list(api_data['flight_number']),
                       'Date': list(api_data['date']),
                       'BoosterVersion': BoosterVersion,
                       'PayloadMass': PayloadMass,
                       'Orbit': Orbit,
                       'LaunchSite': LaunchSite,
                       'Outcome': Outcome,
                       'Flights': Flights,
                       'GridFins': GridFins,
                       'Reused': Reused,
                       'Legs': Legs,
                       'LandingPad': LandingPad,
                       'Block': Block,
                       'ReusedCount': ReusedCount,
                       'Serial': Serial,
                       'Longitude': Longitude,
                       'Latitude': Latitude}

    data_falcon9 = pd.DataFrame(api_launch_dict)
    print("API Data Head (before wrangling):")
    print(data_falcon9.head())

except requests.exceptions.RequestException as e:
    print(f"Failed to fetch data from SpaceX API: {e}")
    data_falcon9 = pd.DataFrame() # Create empty df on failure
except Exception as e:
    print(f"An error occurred during API data processing: {e}")
    data_falcon9 = pd.DataFrame() # Create empty df on failure

print("--- Finished API Data Collection ---")


# --- 3. Data Wrangling (using API data) ---
print("\n--- 3. Starting Data Wrangling ---")
if not data_falcon9.empty:
    # Task 3: Dealing with Missing Values
    print(f"Missing PayloadMass before imputation: {data_falcon9['PayloadMass'].isnull().sum()}")
    payload_mass_mean = data_falcon9['PayloadMass'].mean()
    data_falcon9['PayloadMass'].fillna(payload_mass_mean, inplace=True) # Use fillna instead of replace
    print(f"Missing PayloadMass after imputation: {data_falcon9['PayloadMass'].isnull().sum()}")

    # Task 4: Create Landing Outcome Label
    landing_outcomes = data_falcon9['Outcome'].value_counts()
    print("\nLanding Outcomes Counts:")
    print(landing_outcomes)
    # Define bad outcomes based on notebook logic
    bad_outcomes = {'None None', 'False Ocean', 'False ASDS', 'False RTLS', 'None ASDS'}
    landing_class = [0 if outcome in bad_outcomes else 1 for outcome in data_falcon9['Outcome']]
    data_falcon9['Class'] = landing_class
    print("\nDataFrame Head with Class label:")
    print(data_falcon9[['Outcome', 'Class']].head())
    print(f"\nSuccess Rate (Class mean): {data_falcon9['Class'].mean()}")
    # data_falcon9.to_csv('dataset_part_2.csv', index=False) # Optional save
else:
    print("Skipping Data Wrangling as API data loading failed.")

print("--- Finished Data Wrangling ---")


# --- 4. EDA and Visualization (using API data) ---
print("\n--- 4. Starting EDA and Visualization ---")
if not data_falcon9.empty:
    plt.figure(figsize=(12, 6))
    sns.catplot(y="PayloadMass", x="FlightNumber", hue="Class", data=data_falcon9, aspect=3, kind='strip') # Use strip plot
    plt.xlabel("Flight Number", fontsize=15)
    plt.ylabel("Payload Mass (kg)", fontsize=15)
    plt.title("Flight Number vs. Payload Mass (Colored by Landing Success)", fontsize=18)
    plt.savefig("eda_flight_payload.png") # Save plot
    plt.close() # Close plot to prevent display issues in some environments
    print("Saved plot: eda_flight_payload.png")

    plt.figure(figsize=(12, 6))
    sns.catplot(x='FlightNumber', y='LaunchSite', hue='Class', data=data_falcon9, aspect=3, kind='strip')
    plt.xlabel('Flight Number', fontsize=15)
    plt.ylabel('Launch Site', fontsize=15)
    plt.title("Flight Number vs. Launch Site (Colored by Landing Success)", fontsize=18)
    plt.savefig("eda_flight_launchsite.png")
    plt.close()
    print("Saved plot: eda_flight_launchsite.png")

    plt.figure(figsize=(12, 6))
    sns.catplot(x='PayloadMass', y='LaunchSite', hue='Class', data=data_falcon9, aspect=3, kind='strip')
    plt.xlabel('Payload Mass (kg)', fontsize=15)
    plt.ylabel('Launch Site', fontsize=15)
    plt.title("Payload Mass vs. Launch Site (Colored by Landing Success)", fontsize=18)
    plt.savefig("eda_payload_launchsite.png")
    plt.close()
    print("Saved plot: eda_payload_launchsite.png")

    # Success rate by Orbit
    plt.figure(figsize=(10, 6))
    orbit_success_rate = data_falcon9.groupby('Orbit')['Class'].mean().reset_index()
    sns.barplot(x='Orbit', y='Class', data=orbit_success_rate)
    plt.xlabel('Orbit Type', fontsize=15)
    plt.ylabel('Success Rate', fontsize=15)
    plt.title('Launch Success Rate by Orbit Type', fontsize=18)
    plt.xticks(rotation=45, ha='right') # Rotate labels if needed
    plt.tight_layout()
    plt.savefig("eda_orbit_success_rate.png")
    plt.close()
    print("Saved plot: eda_orbit_success_rate.png")

    # Flight Number vs Orbit
    plt.figure(figsize=(12, 6))
    sns.catplot(x='FlightNumber', y='Orbit', hue='Class', data=data_falcon9, aspect=3, kind='strip')
    plt.xlabel('Flight Number', fontsize=15)
    plt.ylabel('Orbit', fontsize=15)
    plt.title('Flight Number vs. Orbit (Colored by Landing Success)', fontsize=18)
    plt.savefig("eda_flight_orbit.png")
    plt.close()
    print("Saved plot: eda_flight_orbit.png")

    # Payload vs Orbit
    plt.figure(figsize=(12, 6))
    sns.catplot(x='PayloadMass', y='Orbit', hue='Class', data=data_falcon9, aspect=3, kind='strip')
    plt.xlabel('Payload Mass (kg)', fontsize=15)
    plt.ylabel('Orbit', fontsize=15)
    plt.title('Payload Mass vs. Orbit (Colored by Landing Success)', fontsize=18)
    plt.savefig("eda_payload_orbit.png")
    plt.close()
    print("Saved plot: eda_payload_orbit.png")

    # Yearly success rate trend
    # Extract year (ensure Date column is datetime or string YYYY-MM-DD)
    try:
        # Attempt conversion if not already datetime
        if not pd.api.types.is_datetime64_any_dtype(data_falcon9['Date']):
             data_falcon9['Date'] = pd.to_datetime(data_falcon9['Date'])
        data_falcon9['Year'] = data_falcon9['Date'].dt.year
        yearly_success = data_falcon9.groupby('Year')['Class'].mean().reset_index()

        plt.figure(figsize=(10, 6))
        sns.lineplot(x='Year', y='Class', data=yearly_success, marker='o') # Added marker
        plt.xlabel('Year', fontsize=15)
        plt.ylabel('Average Success Rate', fontsize=15)
        plt.title('Launch Success Rate Trend Over Years', fontsize=18)
        plt.grid(True)
        plt.ylim(0, 1.05) # Set y-axis limits
        plt.xticks(yearly_success['Year'].unique()) # Ensure all years are shown
        plt.tight_layout()
        plt.savefig("eda_yearly_success_trend.png")
        plt.close()
        print("Saved plot: eda_yearly_success_trend.png")
    except Exception as e:
        print(f"Could not plot yearly trend: {e}")

else:
    print("Skipping EDA and Visualization as data loading failed.")

print("--- Finished EDA and Visualization ---")


# --- 5. Feature Engineering (One-Hot Encoding) ---
print("\n--- 5. Starting Feature Engineering ---")
if not data_falcon9.empty:
    # Select features for modeling, excluding Date, Outcome, BoosterVersion, Year
    features = data_falcon9[['FlightNumber', 'PayloadMass', 'Orbit', 'LaunchSite', 'Flights', 'GridFins', 'Reused', 'Legs', 'LandingPad', 'Block', 'ReusedCount', 'Serial']]
    categorical_cols = ['Orbit', 'LaunchSite', 'LandingPad', 'Serial', 'GridFins', 'Reused', 'Legs'] # Include boolean cols for get_dummies

    # Convert boolean columns to string before get_dummies if they exist
    for col in ['GridFins', 'Reused', 'Legs']:
        if col in features.columns:
            features[col] = features[col].astype(str)

    features_one_hot = pd.get_dummies(features, columns=categorical_cols)
    print("Feature Engineering Head (One-Hot Encoded):")
    print(features_one_hot.head())
    # features_one_hot.to_csv('dataset_part_3.csv', index=False) # Optional save
else:
    print("Skipping Feature Engineering as data loading failed.")
    features_one_hot = pd.DataFrame() # Define as empty for ML section check

print("--- Finished Feature Engineering ---")


# --- 6. Machine Learning Prediction ---
print("\n--- 6. Starting Machine Learning ---")
if not features_one_hot.empty and 'Class' in data_falcon9.columns:
    # Task 1: Create Y array
    Y = data_falcon9['Class'].to_numpy()

    # Task 2: Standardize X
    X = features_one_hot.astype(float) # Ensure float type
    transform = preprocessing.StandardScaler()
    X = transform.fit_transform(X)
    print(f"Features shape after standardization: {X.shape}")

    # Task 3: Train/Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)
    print(f"Train shapes: X={X_train.shape}, Y={Y_train.shape}")
    print(f"Test shapes: X={X_test.shape}, Y={Y_test.shape}")

    # --- Model Training and Evaluation ---
    models = {}
    best_scores = {}
    test_accuracies = {}
    predictions = {}

    # Task 4 & 5: Logistic Regression
    print("\nTraining Logistic Regression...")
    parameters_lr = {'C': [0.01, 0.1, 1], 'penalty': ['l2'], 'solver': ['lbfgs']}
    lr = LogisticRegression(random_state=1) # Added random_state for reproducibility
    logreg_cv = GridSearchCV(lr, parameters_lr, cv=10)
    logreg_cv.fit(X_train, Y_train)
    models['LogReg'] = logreg_cv
    best_scores['LogReg'] = logreg_cv.best_score_
    test_accuracies['LogReg'] = logreg_cv.score(X_test, Y_test)
    predictions['LogReg'] = logreg_cv.predict(X_test)
    print(f"LogReg Best Params: {logreg_cv.best_params_}")
    print(f"LogReg CV Accuracy: {best_scores['LogReg']:.4f}")
    print(f"LogReg Test Accuracy: {test_accuracies['LogReg']:.4f}")
    plot_confusion_matrix(Y_test, predictions['LogReg'], title='LogReg Confusion Matrix')
    plt.savefig("cm_logreg.png")
    plt.close()
    print("Saved plot: cm_logreg.png")


    # Task 6 & 7: Support Vector Machine (SVM)
    print("\nTraining SVM...")
    parameters_svm = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'), # Removed duplicate 'rbf'
                      'C': np.logspace(-3, 3, 5),
                      'gamma': np.logspace(-3, 3, 5)}
    svm = SVC(random_state=1) # Added random_state
    svm_cv = GridSearchCV(svm, parameters_svm, cv=10)
    svm_cv.fit(X_train, Y_train)
    models['SVM'] = svm_cv
    best_scores['SVM'] = svm_cv.best_score_
    test_accuracies['SVM'] = svm_cv.score(X_test, Y_test)
    predictions['SVM'] = svm_cv.predict(X_test)
    print(f"SVM Best Params: {svm_cv.best_params_}")
    print(f"SVM CV Accuracy: {best_scores['SVM']:.4f}")
    print(f"SVM Test Accuracy: {test_accuracies['SVM']:.4f}")
    plot_confusion_matrix(Y_test, predictions['SVM'], title='SVM Confusion Matrix')
    plt.savefig("cm_svm.png")
    plt.close()
    print("Saved plot: cm_svm.png")

    # Task 8 & 9: Decision Tree
    print("\nTraining Decision Tree...")
    parameters_tree = {'criterion': ['gini', 'entropy'],
                       'splitter': ['best', 'random'],
                       'max_depth': [2 * n for n in range(1, 10)],
                       'max_features': ['auto', 'sqrt'], # 'auto' is deprecated, often equivalent to 'sqrt' or None
                       'min_samples_leaf': [1, 2, 4],
                       'min_samples_split': [2, 5, 10]}
    tree = DecisionTreeClassifier(random_state=1) # Added random_state
    tree_cv = GridSearchCV(tree, parameters_tree, cv=10)
    tree_cv.fit(X_train, Y_train)
    models['Tree'] = tree_cv
    best_scores['Tree'] = tree_cv.best_score_
    test_accuracies['Tree'] = tree_cv.score(X_test, Y_test)
    predictions['Tree'] = tree_cv.predict(X_test)
    print(f"Tree Best Params: {tree_cv.best_params_}")
    print(f"Tree CV Accuracy: {best_scores['Tree']:.4f}")
    print(f"Tree Test Accuracy: {test_accuracies['Tree']:.4f}")
    plot_confusion_matrix(Y_test, predictions['Tree'], title='Decision Tree Confusion Matrix')
    plt.savefig("cm_tree.png")
    plt.close()
    print("Saved plot: cm_tree.png")

    # Task 10 & 11: K-Nearest Neighbors (KNN)
    print("\nTraining KNN...")
    parameters_knn = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                      'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                      'p': [1, 2]}
    knn = KNeighborsClassifier()
    knn_cv = GridSearchCV(knn, parameters_knn, cv=10)
    knn_cv.fit(X_train, Y_train)
    models['KNN'] = knn_cv
    best_scores['KNN'] = knn_cv.best_score_
    test_accuracies['KNN'] = knn_cv.score(X_test, Y_test)
    predictions['KNN'] = knn_cv.predict(X_test)
    print(f"KNN Best Params: {knn_cv.best_params_}")
    print(f"KNN CV Accuracy: {best_scores['KNN']:.4f}")
    print(f"KNN Test Accuracy: {test_accuracies['KNN']:.4f}")
    plot_confusion_matrix(Y_test, predictions['KNN'], title='KNN Confusion Matrix')
    plt.savefig("cm_knn.png")
    plt.close()
    print("Saved plot: cm_knn.png")

    # Task 12: Find Best Method
    print("\n--- Model Comparison (Test Set) ---")
    # Use jaccard_similarity_score for older sklearn versions
    jaccard_scores_test = [jaccard_similarity_score(Y_test, yhat) for yhat in predictions.values()]
    f1_scores_test = [f1_score(Y_test, yhat, average='binary') for yhat in predictions.values()]
    accuracy_test = list(test_accuracies.values())
    model_names = list(models.keys())

    scores_test_df = pd.DataFrame(
        np.array([jaccard_scores_test, f1_scores_test, accuracy_test]),
        index=['Jaccard_Score', 'F1_Score', 'Accuracy'],
        columns=model_names
    )
    print("\nTest Set Scores:")
    print(scores_test_df)

    # Compare based on F1-score (often good for imbalanced classes) or Accuracy
    best_f1_model = model_names[np.argmax(f1_scores_test)]
    best_acc_model = model_names[np.argmax(accuracy_test)]
    print(f"\nBest model based on Test F1-Score: {best_f1_model} ({max(f1_scores_test):.4f})")
    print(f"Best model based on Test Accuracy: {best_acc_model} ({max(accuracy_test):.4f})")

    # Also show scores on the whole dataset as done in the notebook
    print("\n--- Model Comparison (Whole Dataset) ---")
    # Use jaccard_similarity_score for older sklearn versions
    jaccard_scores_all = [jaccard_similarity_score(Y, cv.predict(X)) for cv in models.values()]
    f1_scores_all = [f1_score(Y, cv.predict(X), average='binary') for cv in models.values()]
    accuracy_all = [cv.score(X, Y) for cv in models.values()]

    scores_all_df = pd.DataFrame(
        np.array([jaccard_scores_all, f1_scores_all, accuracy_all]),
        index=['Jaccard_Score', 'F1_Score', 'Accuracy'],
        columns=model_names
    )
    print("\nWhole Dataset Scores:")
    print(scores_all_df)
    best_f1_model_all = model_names[np.argmax(f1_scores_all)]
    best_acc_model_all = model_names[np.argmax(accuracy_all)]
    print(f"\nBest model based on Whole Dataset F1-Score: {best_f1_model_all} ({max(f1_scores_all):.4f})")
    print(f"Best model based on Whole Dataset Accuracy: {best_acc_model_all} ({max(accuracy_all):.4f})")

else:
    print("Skipping Machine Learning as data preparation failed.")

print("--- Finished Machine Learning ---")


# --- 7. Folium Map Visualization ---
print("\n--- 7. Starting Folium Map Visualization ---")
# Use the data_falcon9 DataFrame created earlier which includes Lat, Long, LaunchSite, Class
if not data_falcon9.empty and all(col in data_falcon9.columns for col in ['Lat', 'Long', 'LaunchSite', 'Class']):
    # Get unique launch sites coordinates
    # Drop rows with NaN Lat/Long before grouping
    launch_sites_df_valid = data_falcon9.dropna(subset=['Lat', 'Long'])
    launch_sites_df = launch_sites_df_valid.groupby(['LaunchSite'], as_index=False).first()[['LaunchSite', 'Lat', 'Long']]

    # Initial map centered roughly around US launch sites
    if not launch_sites_df.empty:
        center_lat = launch_sites_df['Lat'].mean()
        center_long = launch_sites_df['Long'].mean()
    else:
        center_lat = 29.55 # Default if no sites found
        center_long = -95.08

    site_map = folium.Map(location=[center_lat, center_long], zoom_start=4)

    # Add circles and labels for each launch site
    for launch_site, site_lat, site_long in zip(launch_sites_df['LaunchSite'], launch_sites_df['Lat'], launch_sites_df['Long']):
        site_coordinate = [site_lat, site_long]
        circle = folium.Circle(site_coordinate, radius=1000, color='#d35400', fill=True).add_child(folium.Popup(launch_site))
        marker = folium.map.Marker(
            site_coordinate,
            icon=DivIcon(
                icon_size=(20, 20),
                icon_anchor=(0, 0),
                html='<div style="font-size: 12; color:#d35400;"><b>%s</b></div>' % launch_site,
            )
        )
        site_map.add_child(circle)
        site_map.add_child(marker)

    # Add success/fail markers with clustering
    marker_cluster = MarkerCluster().add_to(site_map)
    # Ensure marker_color exists, handle potential NaNs in Lat/Long for iteration
    if 'marker_color' not in data_falcon9.columns:
         data_falcon9['marker_color'] = data_falcon9['Class'].apply(lambda x: 'green' if x == 1 else 'red')

    for index, record in data_falcon9.dropna(subset=['Lat', 'Long']).iterrows(): # Iterate over valid rows
        site_coordinate = [record['Lat'], record['Long']]
        marker = folium.Marker(
            location=site_coordinate,
            icon=folium.Icon(color='white', icon_color=record['marker_color']),
            popup=record['LaunchSite'] # Add popup for site name
        )
        marker_cluster.add_child(marker)

    # Add Mouse Position control
    formatter = "function(num) {return L.Util.formatNum(num, 5);};"
    mouse_position = MousePosition(
        position='topright',
        separator=' Long: ',
        empty_string='NaN',
        lng_first=False,
        num_digits=20,
        prefix='Lat:',
        lat_formatter=formatter,
        lng_formatter=formatter,
    )
    site_map.add_child(mouse_position)

    # Add distance lines (example for KSC LC-39A)
    try:
        ksc_row = launch_sites_df[launch_sites_df['LaunchSite'] == 'KSC LC-39A']
        if not ksc_row.empty:
            ksc_coords = ksc_row[['Lat', 'Long']].iloc[0].tolist()
            # Define some points of interest near KSC (approximate coordinates)
            coastline_ksc = [28.573, -80.568] # Approx East coastline
            railway_ksc = [28.573, -80.800] # Approx railway line west of KSC
            highway_ksc = [28.573, -80.853] # Approx US-1
            city_ksc = [28.612, -80.808] # Titusville

            points_of_interest = {
                "Coastline": coastline_ksc,
                "Railway": railway_ksc,
                "Highway": highway_ksc,
                "City (Titusville)": city_ksc
            }
            colors = ['blue', 'green', 'purple', 'orange']
            html_colors = ['#007bff','#28a745','#6f42c1','#fd7e14']

            for i, (name, coords) in enumerate(points_of_interest.items()):
                distance = calculate_distance(ksc_coords[0], ksc_coords[1], coords[0], coords[1])
                # Distance Marker
                folium.Marker(
                    coords,
                    icon=DivIcon(
                        icon_size=(150,30), # Adjusted size
                        icon_anchor=(0,0),
                        html=f'<div style="font-size: 12px; color:{html_colors[i]};"><b>{name} ({distance:.2f} km)</b></div>',
                    )
                ).add_to(site_map)
                # Distance Line
                folium.PolyLine([ksc_coords, coords], color=colors[i]).add_to(site_map)
        else:
            print("Could not find KSC LC-39A coordinates to draw distance lines.")

    except Exception as e:
        print(f"Error adding distance lines: {e}")

    # Save map to HTML
    site_map.save("launch_sites_analysis_map.html")
    print("Saved Folium map to launch_sites_analysis_map.html")

else:
    print("Skipping Folium Map Visualization as data loading/wrangling failed.")

print("--- Finished Folium Map Visualization ---")

print("\n--- Project Script Finished ---")
# Note: The Dash application code (spacex_app.py) needs to be run separately as a web server.

--- 1. Starting Web Scraping ---
Wikipedia page title: List of Falcon 9 and Falcon Heavy launches - Wikipedia
Target Wikipedia table not found.
--- Finished Web Scraping ---

--- 2. Starting API Data Collection ---
API request status: 200
API Data Head (before wrangling):
   FlightNumber        Date BoosterVersion  PayloadMass Orbit  \
0             1  2006-03-24       Falcon 1         20.0   LEO   
1             2  2007-03-21       Falcon 1          NaN   LEO   
2             4  2008-09-28       Falcon 1        165.0   LEO   
3             5  2009-07-13       Falcon 1        200.0   LEO   
4             6  2010-06-04       Falcon 9          NaN   LEO   

        LaunchSite    Outcome  Flights  GridFins  Reused   Legs LandingPad  \
0  Kwajalein Atoll  None None        1     False   False  False       None   
1  Kwajalein Atoll  None None        1     False   False  False       None   
2  Kwajalein Atoll  None None        1     False   False  False       None   
3  Kwajalein Atoll  None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_folds = np.zeros(n_samples, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Depre

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>