In [1]:
# Import dependencies:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.metrics import balanced_accuracy_score
#from imblearn.metrics import classification_report_imbalanced
#from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# Load the data:
file_path = Path("../data/Cleaned_NASA_Exoplanets.csv")
exoplanet_df = pd.read_csv(file_path)

# Drop the null columns where all values are null:
#exoplanet_df = exoplanet_df.dropna(axis='columns', how='all')

# Drop the null rows:
#exoplanet_df = exoplanet_df.dropna()

exoplanet_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,rowid,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,default_flag,sy_snum,...,sy_kepmagerr2,rowupdate,pl_pubdate,releasedate,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec
0,1,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,1,2,...,,5/14/2014,2008-01,5/14/2014,2,1,2,0,0,0
1,2,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,0,2,...,,7/23/2014,2011-08,7/23/2014,2,1,2,0,0,0
2,3,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,1,1,...,,9/4/2018,2017-03,9/6/2018,0,1,1,0,0,0
3,4,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,0,1,...,,4/25/2018,2009-10,5/14/2014,0,1,1,0,0,0
4,5,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,0,1,...,,4/25/2018,2011-08,7/23/2014,0,1,1,0,0,0


In [3]:
# Count distinct values in "soltype" column:
exoplanet_df['soltype'].value_counts()

Published Confirmed                               16151
Kepler Project Candidate (q1_q17_dr25_sup_koi)     2669
Kepler Project Candidate (q1_q16_koi)              2661
Kepler Project Candidate (q1_q17_dr25_koi)         2652
Kepler Project Candidate (q1_q17_dr24_koi)         2641
Kepler Project Candidate (q1_q12_koi)              2625
Kepler Project Candidate (q1_q8_koi)               2272
Published Candidate                                 771
TESS Project Candidate                              184
Name: soltype, dtype: int64

In [4]:
# Create a new dataframe by selecting desired columns:
new_exoplanet_df = exoplanet_df[["sy_snum", 
                                 "sy_pnum", 
                                 "pl_orbper", "pl_rade", "soltype", "pl_eqt", 
                                 "st_teff", "st_rad", "st_mass", "st_logg"]]


print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(32626, 10)


Unnamed: 0,sy_snum,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
0,2,1,326.03,,Published Confirmed,,4742.0,19.0,2.7,2.31
1,2,1,,,Published Confirmed,,,,2.6,
2,1,1,516.21997,,Published Confirmed,,4213.0,29.79,2.78,1.93
3,1,1,516.22,,Published Confirmed,,4340.0,24.08,1.8,1.6
4,1,1,,,Published Confirmed,,,,1.7,


In [5]:
# Dropna from new dataframe:
new_exoplanet_df= new_exoplanet_df.dropna()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(14132, 10)


Unnamed: 0,sy_snum,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
100,2,5,0.7365437,2.08,Published Confirmed,1958.0,5234.0,0.94,0.91,4.45
161,1,1,41.6855,2.23,Published Confirmed,546.0,5766.0,0.93,0.96,4.5
190,1,1,8040000.0,20.849,Published Confirmed,1800.0,2320.0,0.24,0.01,3.85
193,1,1,402000000.0,12.442,Published Confirmed,434.0,3406.0,0.39,0.37,4.83
196,1,1,1.508969,17.385,Published Confirmed,1915.0,5950.0,1.13,0.95,4.31


In [6]:
# Drop duplicates:
new_exoplanet_df = new_exoplanet_df.drop_duplicates()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(14131, 10)


Unnamed: 0,sy_snum,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
100,2,5,0.7365437,2.08,Published Confirmed,1958.0,5234.0,0.94,0.91,4.45
161,1,1,41.6855,2.23,Published Confirmed,546.0,5766.0,0.93,0.96,4.5
190,1,1,8040000.0,20.849,Published Confirmed,1800.0,2320.0,0.24,0.01,3.85
193,1,1,402000000.0,12.442,Published Confirmed,434.0,3406.0,0.39,0.37,4.83
196,1,1,1.508969,17.385,Published Confirmed,1915.0,5950.0,1.13,0.95,4.31


In [7]:
# Count distinct values in "soltype" column
new_exoplanet_df['soltype'].value_counts()

Kepler Project Candidate (q1_q16_koi)         2659
Kepler Project Candidate (q1_q17_dr25_koi)    2652
Kepler Project Candidate (q1_q17_dr24_koi)    2640
Kepler Project Candidate (q1_q12_koi)         2623
Kepler Project Candidate (q1_q8_koi)          2272
Published Confirmed                           1283
Published Candidate                              2
Name: soltype, dtype: int64

In [8]:
# Drop planets with more than 1 star:
new_exoplanet_df.drop(new_exoplanet_df.index[new_exoplanet_df['sy_snum'] > 1], inplace=True)
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(13434, 10)


Unnamed: 0,sy_snum,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
161,1,1,41.6855,2.23,Published Confirmed,546.0,5766.0,0.93,0.96,4.5
190,1,1,8040000.0,20.849,Published Confirmed,1800.0,2320.0,0.24,0.01,3.85
193,1,1,402000000.0,12.442,Published Confirmed,434.0,3406.0,0.39,0.37,4.83
196,1,1,1.508969,17.385,Published Confirmed,1915.0,5950.0,1.13,0.95,4.31
197,1,1,1.508956,16.7,Published Confirmed,1898.0,5950.0,1.11,0.95,4.25


In [9]:
# Drop the "sy_snum" column as it now only has 1 unique value:
clean_exoplanet_df = new_exoplanet_df.drop(columns=['sy_snum'])
print(clean_exoplanet_df.shape)
clean_exoplanet_df.head()

(13434, 9)


Unnamed: 0,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
161,1,41.6855,2.23,Published Confirmed,546.0,5766.0,0.93,0.96,4.5
190,1,8040000.0,20.849,Published Confirmed,1800.0,2320.0,0.24,0.01,3.85
193,1,402000000.0,12.442,Published Confirmed,434.0,3406.0,0.39,0.37,4.83
196,1,1.508969,17.385,Published Confirmed,1915.0,5950.0,1.13,0.95,4.31
197,1,1.508956,16.7,Published Confirmed,1898.0,5950.0,1.11,0.95,4.25


In [10]:
#
clean_exoplanet_df['soltype'].value_counts()

Kepler Project Candidate (q1_q16_koi)         2559
Kepler Project Candidate (q1_q17_dr25_koi)    2552
Kepler Project Candidate (q1_q17_dr24_koi)    2542
Kepler Project Candidate (q1_q12_koi)         2523
Kepler Project Candidate (q1_q8_koi)          2183
Published Confirmed                           1073
Published Candidate                              2
Name: soltype, dtype: int64

In [11]:
# Replace values in 'soltype' with 1 or 0:

clean_exoplanet_df['soltype'].replace('Published Confirmed', 1, inplace=True)
clean_exoplanet_df[clean_exoplanet_df['soltype'] != 1] = 0 
clean_exoplanet_df

Unnamed: 0,sy_pnum,pl_orbper,pl_rade,soltype,pl_eqt,st_teff,st_rad,st_mass,st_logg
161,1,4.168550e+01,2.230,1,546.0,5766.0,0.93,0.96,4.50
190,1,8.040000e+06,20.849,1,1800.0,2320.0,0.24,0.01,3.85
193,1,4.020000e+08,12.442,1,434.0,3406.0,0.39,0.37,4.83
196,1,1.508969e+00,17.385,1,1915.0,5950.0,1.13,0.95,4.31
197,1,1.508956e+00,16.700,1,1898.0,5950.0,1.11,0.95,4.25
...,...,...,...,...,...,...,...,...,...
32510,1,4.187757e+00,12.207,1,1203.0,5370.0,1.06,0.91,4.34
32512,1,3.765001e+00,23.203,1,1577.0,6720.0,1.93,1.47,4.04
32514,1,2.864142e+00,15.390,1,1743.0,6250.0,1.48,1.41,4.25
32582,3,6.267900e+00,2.042,1,1170.0,6037.0,1.10,1.09,4.42


In [12]:
clean_exoplanet_df['soltype'].value_counts()

0    12361
1     1073
Name: soltype, dtype: int64

In [13]:
clean_exoplanet_df['soltype'] = clean_exoplanet_df['soltype'].astype(str).astype(int)

In [14]:
clean_exoplanet_df.columns = ['Number of Planets', 
                              'Planet Orbit', 'Planet Radius', 'Planet Status', 'Equalibrium Temp',
                              'Star Temp', 'Star Radius', 'Star Mass', 'Star Gravity']

clean_exoplanet_df.head()

Unnamed: 0,Number of Planets,Planet Orbit,Planet Radius,Planet Status,Equalibrium Temp,Star Temp,Star Radius,Star Mass,Star Gravity
161,1,41.6855,2.23,1,546.0,5766.0,0.93,0.96,4.5
190,1,8040000.0,20.849,1,1800.0,2320.0,0.24,0.01,3.85
193,1,402000000.0,12.442,1,434.0,3406.0,0.39,0.37,4.83
196,1,1.508969,17.385,1,1915.0,5950.0,1.13,0.95,4.31
197,1,1.508956,16.7,1,1898.0,5950.0,1.11,0.95,4.25


In [15]:
# Export the Dataframe as a new CSV file without the index.
#clean_exoplanet_df.to_csv("clean_exoplanet_df.csv", index=False)
clean_exoplanet_df.dtypes

Number of Planets      int64
Planet Orbit         float64
Planet Radius        float64
Planet Status          int32
Equalibrium Temp     float64
Star Temp            float64
Star Radius          float64
Star Mass            float64
Star Gravity         float64
dtype: object

# Mockup/ Draft Model
# Split the Data into Training and Testing

### Create a Supervised Machine Learning model, use Logistic Regression // classification 1 or 0.

In [16]:
# Create our features:
X = clean_exoplanet_df.drop("Planet Status", axis=1)

# Create our target:
y = clean_exoplanet_df["Planet Status"]

In [17]:
# Split into Train and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=.33)

In [18]:
# Train the model:
lm = LogisticRegression()
lm.fit(X_train, y_train)

LogisticRegression()

In [19]:
# Validate the model:
from sklearn.metrics import classification_report

predict = lm.predict(X_test)
predict

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4053
           1       1.00      1.00      1.00       381

    accuracy                           1.00      4434
   macro avg       1.00      1.00      1.00      4434
weighted avg       1.00      1.00      1.00      4434

