In [11]:
# Initial imports:

import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [12]:
# Read the exoplanet catalogue from the CSV that sits next to this notebook.

file_path = "exoplanets.csv"
exoplanet_df = pd.read_csv(filepath_or_buffer=file_path)

# Report (rows, columns), then preview the first five records.
print(exoplanet_df.shape)
exoplanet_df.head(5)

(5063, 313)


Unnamed: 0,rowid,pl_name,hostname,pl_letter,hd_name,hip_name,tic_id,gaia_id,sy_snum,sy_pnum,...,sy_kepmag,sy_kepmagerr1,sy_kepmagerr2,sy_kepmag_reflink,pl_nnotes,st_nphot,st_nrvc,st_nspec,pl_nespec,pl_ntranspec
0,1,11 Com b,11 Com,b,HD 107383,HIP 60202,TIC 72437047,Gaia DR2 3946945413106333696,2,1,...,,,,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,2.0,1,2,0,0,0
1,2,11 UMi b,11 UMi,b,HD 136726,HIP 74793,TIC 230061010,Gaia DR2 1696798367260229376,1,1,...,,,,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,0.0,1,1,0,0,0
2,3,14 And b,14 And,b,HD 221345,HIP 116076,TIC 333225860,Gaia DR2 1920113512486282240,1,1,...,,,,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,0.0,1,1,0,0,0
3,4,14 Her b,14 Her,b,HD 145675,HIP 79248,TIC 219483057,Gaia DR2 1385293808145621504,1,2,...,,,,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,0.0,1,4,1,0,0
4,5,16 Cyg B b,16 Cyg B,b,HD 186427,HIP 96901,TIC 27533327,Gaia DR2 2135550755683407232,3,1,...,6.095,,,<a refstr=STASSUN_ET_AL__2019 href=https://ui....,,1,4,3,0,0


In [13]:
# Narrow the raw catalogue down to the planet, system, discovery and
# host-star columns that the rest of the analysis works with.

selected_columns = [
    "pl_name", "hostname", "pl_letter",
    "sy_snum", "sy_mnum",
    "discoverymethod", "disc_instrument",
    "pl_orbper", "pl_rade",
    "st_spectype", "st_teff", "st_rad", "st_mass", "st_logg", "st_age",
]
new_exoplanet_df = exoplanet_df[selected_columns]

### Open question: might it be more useful to focus on the star information,
### since we are trying to determine which systems are more likely to have planets?
### We could also determine the most common type of planet discovered, or
### create bins/groups by radius / density / orbit time / location of orbit.

# Row counts remaining after dropna(), depending on which columns are selected:
#   - with "st_rotp" included: fewer than 300 planets; without it: just over 1000
#   - without "st_dens": 1136
#   - without "st_vsin": 1336

print(new_exoplanet_df.shape)

new_exoplanet_df.head(5)

(5063, 15)


Unnamed: 0,pl_name,hostname,pl_letter,sy_snum,sy_mnum,discoverymethod,disc_instrument,pl_orbper,pl_rade,st_spectype,st_teff,st_rad,st_mass,st_logg,st_age
0,11 Com b,11 Com,b,2,0,Radial Velocity,Coude Echelle Spectrograph,326.03,12.1,G8 III,4742.0,19.0,2.7,2.31,
1,11 UMi b,11 UMi,b,1,0,Radial Velocity,Coude Echelle Spectrograph,516.21997,12.3,K4 III,4213.0,29.79,2.78,1.93,1.56
2,14 And b,14 And,b,1,0,Radial Velocity,HIDES Echelle Spectrograph,185.84,12.9,K0 III,4813.0,11.0,2.2,2.63,4.5
3,14 Her b,14 Her,b,1,0,Radial Velocity,HIRES Spectrometer,1773.40002,12.9,K0 V,5338.0,0.93,0.9,4.45,3.9
4,16 Cyg B b,16 Cyg B,b,3,0,Radial Velocity,Multiple Instruments,798.5,13.5,G3 V,5750.0,1.13,1.08,4.36,7.4


In [14]:
# Discard every row that is missing a value in any of the selected columns.

new_exoplanet_df = new_exoplanet_df.dropna(axis="index", how="any")

# Show how many rows survived, then preview the result.
print(new_exoplanet_df.shape)

new_exoplanet_df.head(5)

(1336, 15)


Unnamed: 0,pl_name,hostname,pl_letter,sy_snum,sy_mnum,discoverymethod,disc_instrument,pl_orbper,pl_rade,st_spectype,st_teff,st_rad,st_mass,st_logg,st_age
1,11 UMi b,11 UMi,b,1,0,Radial Velocity,Coude Echelle Spectrograph,516.21997,12.3,K4 III,4213.0,29.79,2.78,1.93,1.56
2,14 And b,14 And,b,1,0,Radial Velocity,HIDES Echelle Spectrograph,185.84,12.9,K0 III,4813.0,11.0,2.2,2.63,4.5
3,14 Her b,14 Her,b,1,0,Radial Velocity,HIRES Spectrometer,1773.40002,12.9,K0 V,5338.0,0.93,0.9,4.45,3.9
4,16 Cyg B b,16 Cyg B,b,3,0,Radial Velocity,Multiple Instruments,798.5,13.5,G3 V,5750.0,1.13,1.08,4.36,7.4
5,17 Sco b,17 Sco,b,1,0,Radial Velocity,Hamilton Echelle Spectrograph,578.38,12.9,K3 III,4157.0,25.92,1.22,1.7,5.13


In [15]:
# Count the distinct values per column; a column with a single distinct
# value carries no information and is a candidate for removal.

unique_counts = new_exoplanet_df.nunique()
print(unique_counts)

pl_name            1336
hostname            966
pl_letter             7
sy_snum               4
sy_mnum               1
discoverymethod       6
disc_instrument      50
pl_orbper          1333
pl_rade             644
st_spectype         214
st_teff             807
st_rad              313
st_mass             192
st_logg             218
st_age              423
dtype: int64


In [16]:
# Drop planets with more than 1 star (keep rows where sy_snum is not > 1).
#
# The filter builds a new frame from a boolean mask instead of calling
# drop(..., inplace=True). In-place mutation silently changes a frame that
# earlier cells have already displayed, and because this frame was derived
# from a selection of another frame, pandas may emit SettingWithCopyWarning.
# Row labels are preserved, so the result matches the in-place drop.

single_star_mask = ~(new_exoplanet_df["sy_snum"] > 1)
new_exoplanet_df = new_exoplanet_df.loc[single_star_mask].copy()

print(new_exoplanet_df.shape)

new_exoplanet_df.head()

(1099, 15)


Unnamed: 0,pl_name,hostname,pl_letter,sy_snum,sy_mnum,discoverymethod,disc_instrument,pl_orbper,pl_rade,st_spectype,st_teff,st_rad,st_mass,st_logg,st_age
1,11 UMi b,11 UMi,b,1,0,Radial Velocity,Coude Echelle Spectrograph,516.21997,12.3,K4 III,4213.0,29.79,2.78,1.93,1.56
2,14 And b,14 And,b,1,0,Radial Velocity,HIDES Echelle Spectrograph,185.84,12.9,K0 III,4813.0,11.0,2.2,2.63,4.5
3,14 Her b,14 Her,b,1,0,Radial Velocity,HIRES Spectrometer,1773.40002,12.9,K0 V,5338.0,0.93,0.9,4.45,3.9
5,17 Sco b,17 Sco,b,1,0,Radial Velocity,Hamilton Echelle Spectrograph,578.38,12.9,K3 III,4157.0,25.92,1.22,1.7,5.13
8,24 Boo b,24 Boo,b,1,0,Radial Velocity,HIDES Echelle Spectrograph,30.3506,13.9,G3 IV,4893.0,10.64,0.99,2.42,6.92


In [17]:
# "sy_snum" and "sy_mnum" each hold a single distinct value at this point,
# so remove both columns and keep the result under a new name.

columns_to_remove = ["sy_snum", "sy_mnum"]
clean_exoplanet_df = new_exoplanet_df.drop(columns_to_remove, axis="columns")

print(clean_exoplanet_df.shape)
clean_exoplanet_df.head(5)

(1099, 13)


Unnamed: 0,pl_name,hostname,pl_letter,discoverymethod,disc_instrument,pl_orbper,pl_rade,st_spectype,st_teff,st_rad,st_mass,st_logg,st_age
1,11 UMi b,11 UMi,b,Radial Velocity,Coude Echelle Spectrograph,516.21997,12.3,K4 III,4213.0,29.79,2.78,1.93,1.56
2,14 And b,14 And,b,Radial Velocity,HIDES Echelle Spectrograph,185.84,12.9,K0 III,4813.0,11.0,2.2,2.63,4.5
3,14 Her b,14 Her,b,Radial Velocity,HIRES Spectrometer,1773.40002,12.9,K0 V,5338.0,0.93,0.9,4.45,3.9
5,17 Sco b,17 Sco,b,Radial Velocity,Hamilton Echelle Spectrograph,578.38,12.9,K3 III,4157.0,25.92,1.22,1.7,5.13
8,24 Boo b,24 Boo,b,Radial Velocity,HIDES Echelle Spectrograph,30.3506,13.9,G3 IV,4893.0,10.64,0.99,2.42,6.92


In [18]:
# Re-count the distinct values per column on the cleaned frame.

clean_unique_counts = clean_exoplanet_df.nunique()
print(clean_unique_counts)

pl_name            1099
hostname            777
pl_letter             7
discoverymethod       6
disc_instrument      45
pl_orbper          1097
pl_rade             558
st_spectype         195
st_teff             680
st_rad              289
st_mass             183
st_logg             204
st_age              377
dtype: int64


In [19]:
# Show the dtype of each remaining column (object vs. numeric).
column_dtypes = clean_exoplanet_df.dtypes
column_dtypes

pl_name             object
hostname            object
pl_letter           object
discoverymethod     object
disc_instrument     object
pl_orbper          float64
pl_rade            float64
st_spectype         object
st_teff            float64
st_rad             float64
st_mass            float64
st_logg            float64
st_age             float64
dtype: object

In [20]:
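# Column reference (units as given in the NASA Exoplanet Archive column definitions):
#
# pl_name            Planet name
# hostname           Host star name
# pl_letter          Planet letter (assigned in order of discovery; when several planets
#                    are discovered at once, the one closest to the star comes first)
# discoverymethod    Discovery method
# disc_instrument    Discovery instrument
# pl_orbper          Planet orbital period (days)
# pl_rade            Planet radius (Earth radii)
# st_spectype        Star spectral type
# st_teff            Star effective temperature (K)
# st_rad             Star radius (solar radii)
# st_mass            Star mass (solar masses)
# st_logg            Star surface gravity (log10 of cm/s^2)
# st_age             Star age (Gyr)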