In [1]:
# Import dependencies:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.metrics import balanced_accuracy_score
#from imblearn.metrics import classification_report_imbalanced
#from imblearn.ensemble import EasyEnsembleClassifier

In [2]:
# Load the data:
file_path = Path("../data/CLEAN_KEPLER_DATASET.csv")
exoplanet_df = pd.read_csv(file_path)


print(exoplanet_df.shape)
exoplanet_df.head()

(9564, 141)


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,8/16/2018,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,8/16/2018,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,8/16/2018,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [3]:
# Count distinct values in "koi_disposition" column:
exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4839
CONFIRMED         2669
CANDIDATE         2056
Name: koi_disposition, dtype: int64

In [4]:
# Create a new dataframe by selecting desired columns:
new_exoplanet_df = exoplanet_df[["koi_count", "koi_ror",
                                 "koi_period", "koi_prad", "koi_teq", 
                                 "koi_steff", "koi_srad", "koi_smass", "koi_slogg", "koi_disposition"]]


print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9564, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg,koi_disposition
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,CONFIRMED
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,CONFIRMED
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,CANDIDATE
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,FALSE POSITIVE
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,CONFIRMED


In [5]:
# Dropna from new dataframe:
new_exoplanet_df= new_exoplanet_df.dropna()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

(9201, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg,koi_disposition
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,CONFIRMED
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,CONFIRMED
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,CANDIDATE
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,FALSE POSITIVE
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,CONFIRMED


In [6]:
# Drop duplicates:
clean_exoplanet_df = new_exoplanet_df.drop_duplicates()
print(clean_exoplanet_df.shape)
clean_exoplanet_df.head()

(9201, 10)


Unnamed: 0,koi_count,koi_ror,koi_period,koi_prad,koi_teq,koi_steff,koi_srad,koi_smass,koi_slogg,koi_disposition
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,CONFIRMED
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,CONFIRMED
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,CANDIDATE
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,FALSE POSITIVE
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,CONFIRMED


In [7]:
# Count distinct values in "koi_disposition" column:
clean_exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4582
CONFIRMED         2668
CANDIDATE         1951
Name: koi_disposition, dtype: int64

In [8]:
# Give columns proper names:

clean_exoplanet_df.columns = ['Number of Planets', 'Planet-Star Radius Ratio',
                              'Orbit Period (Days)', 'Planet Radius (Earth)', 'Equalibrium Temp (K)',
                              'Star Temp (K)', 'Star Radius', 'Star Mass', 'Star Surface Gravity', 'Status']

clean_exoplanet_df.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Status
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,CONFIRMED
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,CONFIRMED
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,CANDIDATE
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,FALSE POSITIVE
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,CONFIRMED


In [9]:
# Replace values in 'Status' with 0 , 1 , or 2:
nm_value_df = clean_exoplanet_df.replace({'CONFIRMED':1, 'FALSE POSITIVE':0, 'CANDIDATE':2})
nm_value_df.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Status
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,1
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,1
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,2
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,0
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,1


In [10]:
# Check values in "Status" column:
nm_value_df['Status'].value_counts()

0    4582
1    2668
2    1951
Name: Status, dtype: int64

In [11]:
# Check data types in columns:
nm_value_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
Status                        int64
dtype: object

In [12]:
# Create a DF for Verified Planets
verified_planets_df = nm_value_df.drop(nm_value_df.index[nm_value_df['Status'] == 2])
print(verified_planets_df.shape)
verified_planets_df.head()

(7250, 10)


Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Status
0,2,0.022344,9.488036,2.26,793.0,5455.0,0.927,0.919,4.467,1
1,2,0.027954,54.418383,2.83,443.0,5455.0,0.927,0.919,4.467,1
3,1,0.387394,1.736952,33.46,1395.0,5805.0,0.791,0.836,4.564,0
4,1,0.024064,2.525592,2.75,1406.0,6031.0,1.046,1.095,4.438,1
5,3,0.036779,11.094321,3.9,835.0,6046.0,0.972,1.053,4.486,1


In [13]:
# Check values in "Status" column:
verified_planets_df['Status'].value_counts()

0    4582
1    2668
Name: Status, dtype: int64

In [14]:
# Create a DF for Unverified Planets
unverified_planets_df = nm_value_df.drop(nm_value_df.index[nm_value_df['Status'].isin([1, 0])])
print(unverified_planets_df.shape)
unverified_planets_df.head()

(1951, 10)


Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Status
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,2
37,1,0.103379,4.959319,12.21,1103.0,5712.0,1.082,0.976,4.359,2
58,1,0.088069,40.419504,7.51,467.0,5446.0,0.781,0.714,4.507,2
62,2,0.232818,7.240661,19.45,734.0,5005.0,0.765,0.85,4.595,2
63,3,0.004612,3.435916,0.55,1272.0,5779.0,1.087,0.941,4.339,2


In [15]:
# Check values in "Status" column:
unverified_planets_df['Status'].value_counts()

2    1951
Name: Status, dtype: int64

In [16]:
# Check data types in columns:
verified_planets_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
Status                        int64
dtype: object

In [17]:
# Check data types in columns:
unverified_planets_df.dtypes

Number of Planets             int64
Planet-Star Radius Ratio    float64
Orbit Period (Days)         float64
Planet Radius (Earth)       float64
Equalibrium Temp (K)        float64
Star Temp (K)               float64
Star Radius                 float64
Star Mass                   float64
Star Surface Gravity        float64
Status                        int64
dtype: object

# Mockup/ Draft Model
# Split the Data into Training and Testing

### Create a Supervised Machine Learning model, use Logistic Regression // classification 1 or 0.

In [18]:
# Create our features:
X = verified_planets_df.drop("Status", axis=1)

# Create our target:
y = verified_planets_df["Status"]

In [45]:
# Split into Train and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify = y,  test_size= 0.5)

In [46]:
# Train the model:
lm = LogisticRegression()
lm.fit(X_train, y_train)

LogisticRegression()

In [47]:
# Validate the model:
from sklearn.metrics import classification_report

predict = lm.predict(X_test)

# Print all results:
with np.printoptions(threshold=np.inf):
    print(predict)

[0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0
 1 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0
 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1
 0 1 1 0 0 1 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 1 0
 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 0 0 1 0 1 0 1 1 0 1
 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0
 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0
 1 1 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0
 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 0 1 1 0 0 0 0
 1 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 0 1
 1 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0
 1 0 1 0 1 0 0 1 0 0 0 1 

In [48]:
# DF for output:
t1 = pd.DataFrame({"Prediction": predict, "Actual": y_test})
t1.head(15)

Unnamed: 0,Prediction,Actual
5672,0,0
2244,0,0
5996,1,1
2982,1,1
217,1,1
3243,1,1
3472,1,0
5209,0,0
5971,0,0
4199,0,0


In [49]:
# Print the confusion matrix:

from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, predict)
print(matrix)

[[1979  312]
 [ 246 1088]]


In [50]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      2291
           1       0.78      0.82      0.80      1334

    accuracy                           0.85      3625
   macro avg       0.83      0.84      0.84      3625
weighted avg       0.85      0.85      0.85      3625



In [51]:
unverified_planets_df.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Status
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,2
37,1,0.103379,4.959319,12.21,1103.0,5712.0,1.082,0.976,4.359,2
58,1,0.088069,40.419504,7.51,467.0,5446.0,0.781,0.714,4.507,2
62,2,0.232818,7.240661,19.45,734.0,5005.0,0.765,0.85,4.595,2
63,3,0.004612,3.435916,0.55,1272.0,5779.0,1.087,0.941,4.339,2


In [52]:

X_new = unverified_planets_df.drop("Status", axis=1)

#y_new = unverified_planets_df["Status"]

X_new.head()

Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544
37,1,0.103379,4.959319,12.21,1103.0,5712.0,1.082,0.976,4.359
58,1,0.088069,40.419504,7.51,467.0,5446.0,0.781,0.714,4.507
62,2,0.232818,7.240661,19.45,734.0,5005.0,0.765,0.85,4.595
63,3,0.004612,3.435916,0.55,1272.0,5779.0,1.087,0.941,4.339


In [53]:
unvpl = lm.predict(X_new)

In [54]:
#count_is_p = (unvpl == 1).sum()
#count_is_p

In [55]:
#count_not_p = (unvpl == 0).sum()
#count_not_p

In [56]:
X_new['Prediction'] = unvpl.tolist()
print(X_new.shape)
X_new.head(15)

(1951, 10)


Unnamed: 0,Number of Planets,Planet-Star Radius Ratio,Orbit Period (Days),Planet Radius (Earth),Equalibrium Temp (K),Star Temp (K),Star Radius,Star Mass,Star Surface Gravity,Prediction
2,1,0.154046,19.89914,14.6,638.0,5853.0,0.868,0.961,4.544,0
37,1,0.103379,4.959319,12.21,1103.0,5712.0,1.082,0.976,4.359,0
58,1,0.088069,40.419504,7.51,467.0,5446.0,0.781,0.714,4.507,1
62,2,0.232818,7.240661,19.45,734.0,5005.0,0.765,0.85,4.595,0
63,3,0.004612,3.435916,0.55,1272.0,5779.0,1.087,0.941,4.339,1
84,1,0.084708,10.181584,7.73,812.0,5988.0,0.836,0.885,4.541,0
92,1,0.137595,19.620347,13.6,643.0,5710.0,0.905,0.928,4.492,0
112,1,0.080046,34.843986,7.61,511.0,5509.0,0.871,0.84,4.482,1
118,1,0.333559,4.6409,30.09,1155.0,6463.0,0.826,0.786,4.5,0
123,1,0.673134,1.028437,72.77,1947.0,6228.0,0.99,0.959,4.428,0


In [57]:
X_new['Prediction'].value_counts()

1    1065
0     886
Name: Prediction, dtype: int64