In [1]:
# Import dependencies:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.metrics import balanced_accuracy_score
#from imblearn.metrics import classification_report_imbalanced
#from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# Load the data:
file_path = Path("../data/CLEAN_KEPLER_DATASET.csv")
exoplanet_df = pd.read_csv(file_path)


print(exoplanet_df.shape)
exoplanet_df.head()

(9564, 141)


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,8/16/2018,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,8/16/2018,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,8/16/2018,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [3]:
# Count distinct values in "koi_disposition" column:
exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4839
CONFIRMED         2669
CANDIDATE         2056
Name: koi_disposition, dtype: int64

In [4]:
# Create a new dataframe by selecting desired columns:
new_exoplanet_df = exoplanet_df[["koi_count", "koi_ror",
                                 "koi_period", "koi_prad", "koi_disposition", "koi_teq", 
                                 "koi_steff", "koi_srad", "koi_smass", "koi_slogg"]]


print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9564, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_disposition,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg
0,2,0.022344,9.488036,2.26,CONFIRMED,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,CONFIRMED,443.0,5455.0,0.927,0.919,4.467
2,1,0.154046,19.89914,14.6,CANDIDATE,638.0,5853.0,0.868,0.961,4.544
3,1,0.387394,1.736952,33.46,FALSE POSITIVE,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,CONFIRMED,1406.0,6031.0,1.046,1.095,4.438


In [5]:
# Dropna from new dataframe:
new_exoplanet_df= new_exoplanet_df.dropna()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9201, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_disposition,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg
0,2,0.022344,9.488036,2.26,CONFIRMED,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,CONFIRMED,443.0,5455.0,0.927,0.919,4.467
2,1,0.154046,19.89914,14.6,CANDIDATE,638.0,5853.0,0.868,0.961,4.544
3,1,0.387394,1.736952,33.46,FALSE POSITIVE,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,CONFIRMED,1406.0,6031.0,1.046,1.095,4.438


In [6]:
# Drop duplicates:
clean_exoplanet_df = new_exoplanet_df.drop_duplicates()
print(clean_exoplanet_df.shape)
clean_exoplanet_df.head()

(9201, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_disposition,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg
0,2,0.022344,9.488036,2.26,CONFIRMED,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,CONFIRMED,443.0,5455.0,0.927,0.919,4.467
2,1,0.154046,19.89914,14.6,CANDIDATE,638.0,5853.0,0.868,0.961,4.544
3,1,0.387394,1.736952,33.46,FALSE POSITIVE,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,CONFIRMED,1406.0,6031.0,1.046,1.095,4.438


In [7]:
# Count distinct values in "koi_disposition" column:
clean_exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4582
CONFIRMED         2668
CANDIDATE         1951
Name: koi_disposition, dtype: int64

In [8]:
# Give columns proper names:

clean_exoplanet_df.columns = ['Number of Planets', 'Planet-Star Radius Ratio',
                              'Orbit Period (Days)', 'Planet Radius (Earth)', 'Status', 'Equalibrium Temp (K)',
                              'Star Temp (K)', 'Star Radius', 'Star Mass', 'Star Surface Gravity']

clean_exoplanet_df.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Status,Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity
0,2,0.022344,9.488036,2.26,CONFIRMED,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,CONFIRMED,443.0,5455.0,0.927,0.919,4.467
2,1,0.154046,19.89914,14.6,CANDIDATE,638.0,5853.0,0.868,0.961,4.544
3,1,0.387394,1.736952,33.46,FALSE POSITIVE,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,CONFIRMED,1406.0,6031.0,1.046,1.095,4.438


In [9]:
# Replace values in 'Status' with 0 , 1 , or 2:
nm_value_df = clean_exoplanet_df.replace({'CONFIRMED':1, 'FALSE POSITIVE':0, 'CANDIDATE':2})
nm_value_df.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Status,Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity
0,2,0.022344,9.488036,2.26,1,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,1,443.0,5455.0,0.927,0.919,4.467
2,1,0.154046,19.89914,14.6,2,638.0,5853.0,0.868,0.961,4.544
3,1,0.387394,1.736952,33.46,0,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,1,1406.0,6031.0,1.046,1.095,4.438


In [10]:
# Check values in "Status" column:
nm_value_df['Status'].value_counts()

0    4582
1    2668
2    1951
Name: Status, dtype: int64

In [11]:
# Check data types in columns:
nm_value_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Status                        int64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
dtype: object

In [12]:
# Create a DF for Verified Planets
verified_planets_df = nm_value_df.drop(nm_value_df.index[nm_value_df['Status'] == 2])
print(verified_planets_df.shape)
verified_planets_df.head()

(7250, 10)


Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Status,Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity
0,2,0.022344,9.488036,2.26,1,793.0,5455.0,0.927,0.919,4.467
1,2,0.027954,54.418383,2.83,1,443.0,5455.0,0.927,0.919,4.467
3,1,0.387394,1.736952,33.46,0,1395.0,5805.0,0.791,0.836,4.564
4,1,0.024064,2.525592,2.75,1,1406.0,6031.0,1.046,1.095,4.438
5,3,0.036779,11.094321,3.9,1,835.0,6046.0,0.972,1.053,4.486


In [13]:
# Check values in "Status" column:
verified_planets_df['Status'].value_counts()

0    4582
1    2668
Name: Status, dtype: int64

In [14]:
# Create a DF for Unverified Planets
unverified_planets_df = nm_value_df.drop(nm_value_df.index[nm_value_df['Status'].isin([1, 0])])
print(unverified_planets_df.shape)
unverified_planets_df.head()

(1951, 10)


Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Status,Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity
2,1,0.154046,19.89914,14.6,2,638.0,5853.0,0.868,0.961,4.544
37,1,0.103379,4.959319,12.21,2,1103.0,5712.0,1.082,0.976,4.359
58,1,0.088069,40.419504,7.51,2,467.0,5446.0,0.781,0.714,4.507
62,2,0.232818,7.240661,19.45,2,734.0,5005.0,0.765,0.85,4.595
63,3,0.004612,3.435916,0.55,2,1272.0,5779.0,1.087,0.941,4.339


In [15]:
# Check values in "Status" column:
unverified_planets_df['Status'].value_counts()

2    1951
Name: Status, dtype: int64

In [16]:
# Check data types in columns:
verified_planets_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Status                        int64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
dtype: object

In [17]:
# Check data types in columns:
unverified_planets_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Status                        int64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
dtype: object

# Mockup/ Draft Model
# Split the Data into Training and Testing

### Create a Supervised Machine Learning model, use Logistic Regression // classification 1 or 0.

In [18]:
# Create our features:
X = verified_planets_df.drop("Status", axis=1)

# Create our target:
y = verified_planets_df["Status"]

In [37]:
# Split into Train and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y,  test_size= 0.33)

In [38]:
# Train the model:
lm = LogisticRegression()
lm.fit(X_train, y_train)

LogisticRegression()

In [39]:
# Validate the model:
from sklearn.metrics import classification_report

predict = lm.predict(X_test)

# Print all results:
with np.printoptions(threshold=np.inf):
    print(predict)

[0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1
 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0
 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 1 0 1 0
 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 1
 0 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 1
 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0
 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 0
 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 1 0 0 0 1 1 1 0 0
 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 1
 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0
 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0
 1 0 0 1 0 1 0 1 0 0 0 0 

In [41]:
# DF for output:
t1 = pd.DataFrame({"Prediction": predict, "Actual": y_test})
t1.head(10)

Unnamed: 0,Prediction,Actual
4503,0,0
1690,1,1
1817,1,1
7532,0,0
2553,0,0
5011,0,0
4450,0,0
1133,0,0
1950,1,1
2631,1,0


In [42]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.90      0.86      0.88      1512
           1       0.77      0.83      0.80       881

    accuracy                           0.85      2393
   macro avg       0.83      0.84      0.84      2393
weighted avg       0.85      0.85      0.85      2393

