## Step 1: Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

## Step 2: Load the data (replace the file path with your local file path)

In [2]:
df = pd.read_csv('SpaceX_Falcon9.csv')

## Step 3: Data Preprocessing

In [3]:
# Convert 'Date' to datetime (if required)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Optionally, create a new column for the number of days since the first launch
df['DaysSinceLaunch'] = (df['Date'] - df['Date'].min()).dt.days

# Drop the 'Date' column after converting it to a numeric feature
df_encoded = df.drop('Date', axis=1)

# One-hot encode categorical columns (e.g., 'Orbit', 'LaunchSite', etc.)
categorical_columns = ['Orbit', 'LaunchSite', 'BoosterVersion', 'LandingPad', 'Serial']
df_encoded = pd.get_dummies(df_encoded, columns=categorical_columns, drop_first=True)

## Step 4: Check the target variable distribution

In [4]:
y = df_encoded['Outcome']  # Target
print("Original target distribution:")
print(y.value_counts())


Original target distribution:
Outcome
True ASDS      41
None None      19
True RTLS      14
False ASDS      6
True Ocean      5
False Ocean     2
None ASDS       2
False RTLS      1
Name: count, dtype: int64


## Step 5: Define features and target

In [5]:
X = df_encoded.drop('Outcome', axis=1)  # Features
y = df_encoded['Outcome']  # Target

## Step 6: Check if there are both classes in the target variable

In [6]:
if len(y.value_counts()) == 1:
    print("The dataset contains only one class. Handling imbalance using SMOTE...")
    # If only one class, apply SMOTE to balance the dataset
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    print("Resampled target class distribution:")
    print(pd.Series(y_res).value_counts())
else:
    print("The dataset contains both classes. Proceeding without resampling.")
    # If both classes exist, continue without SMOTE
    X_res, y_res = X, y



The dataset contains both classes. Proceeding without resampling.


## Step 7: Impute missing values for numerical columns

In [7]:
imputer = SimpleImputer(strategy='median')  # Impute missing values using the median (for numerical columns)
X_imputed = imputer.fit_transform(X_res)

# Impute missing values for target (if necessary)
y_imputed = SimpleImputer(strategy='most_frequent').fit_transform(y_res.values.reshape(-1, 1))

## Step 8: Split data into training and testing sets

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y_imputed, test_size=0.2, random_state=42)


## Step 9: Train models on resampled data (if SMOTE was applied)


In [9]:

# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)



  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return fit_method(estimator, *args, **kwargs)


## Step 10: Evaluate models


In [10]:
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg))

print("Decision Tree:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

print("Random Forest:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Logistic Regression:
Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

  False ASDS       0.00      0.00      0.00         1
 False Ocean       0.00      0.00      0.00         1
   None None       0.50      1.00      0.67         2
   True ASDS       0.64      1.00      0.78         7
  True Ocean       0.00      0.00      0.00         2
   True RTLS       1.00      0.60      0.75         5

    accuracy                           0.67        18
   macro avg       0.36      0.43      0.37        18
weighted avg       0.58      0.67      0.58        18

Decision Tree:
Accuracy: 0.5555555555555556
              precision    recall  f1-score   support

  False ASDS       0.00      0.00      0.00         1
 False Ocean       0.00      0.00      0.00         1
   None None       0.67      1.00      0.80         2
   True ASDS       0.55      0.86      0.67         7
  True Ocean       0.00      0.00      0.00         2
   True RTLS       1.00      0.40     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
