# Exoplanet Classifier

This is a Random Forest binary classifier that identifies exoplanets. I made this for the NASA Space Apps Challenge to learn more about ML.

### Table of Contents:
1. [Get and Clean Data](#get-and-clean-data)
2. [Split Data into Training and Test Sets](#split-data)
3. [Train Model](#train-model)
4. [Summary Statistics](#summary-statistics)

## Get and Clean Data

In [7]:
# get data
import pandas as pd

data = pd.read_csv("data.csv", comment="#")
data

Unnamed: 0,kepid,kepoi_name,kepler_name,koi_disposition,koi_pdisposition,koi_score,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
0,10797460,K00752.01,Kepler-227 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
1,10797460,K00752.02,Kepler-227 c,CONFIRMED,CANDIDATE,0.969,0,0,0,0,...,-81.0,4.467,0.064,-0.096,0.927,0.105,-0.061,291.93423,48.141651,15.347
2,10811496,K00753.01,,CANDIDATE,CANDIDATE,0.000,0,0,0,0,...,-176.0,4.544,0.044,-0.176,0.868,0.233,-0.078,297.00482,48.134129,15.436
3,10848459,K00754.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,0,0,...,-174.0,4.564,0.053,-0.168,0.791,0.201,-0.067,285.53461,48.285210,15.597
4,10854555,K00755.01,Kepler-664 b,CONFIRMED,CANDIDATE,1.000,0,0,0,0,...,-211.0,4.438,0.070,-0.210,1.046,0.334,-0.133,288.75488,48.226200,15.509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,10090151,K07985.01,,FALSE POSITIVE,FALSE POSITIVE,0.000,0,1,1,0,...,-166.0,4.529,0.035,-0.196,0.903,0.237,-0.079,297.18875,47.093819,14.082
9560,10128825,K07986.01,,CANDIDATE,CANDIDATE,0.497,0,0,0,0,...,-220.0,4.444,0.056,-0.224,1.031,0.341,-0.114,286.50937,47.163219,14.757
9561,10147276,K07987.01,,FALSE POSITIVE,FALSE POSITIVE,0.021,0,0,1,0,...,-236.0,4.447,0.056,-0.224,1.041,0.341,-0.114,294.16489,47.176281,15.385
9562,10155286,K07988.01,,CANDIDATE,CANDIDATE,0.092,0,0,0,0,...,-128.0,2.992,0.030,-0.027,7.824,0.223,-1.896,296.76288,47.145142,10.998


In [None]:
# clean data
data["target"] = data["koi_disposition"].map({"CONFIRMED": 1, "CANDIDATE": 1, "FALSE POSITIVE": 0})
features = ["koi_period", "koi_impact", "koi_duration", "koi_depth", "koi_prad", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad", "koi_fpflag_nt", "koi_fpflag_co", "koi_fpflag_ss", "koi_fpflag_ec"]

y = data["target"]
x = data[features]

# average if value n/a
x.fillna(x.median())

x


Unnamed: 0,koi_period,koi_impact,koi_duration,koi_depth,koi_prad,koi_model_snr,koi_steff,koi_slogg,koi_srad,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ss,koi_fpflag_ec
0,9.488036,0.146,2.95750,615.8,2.26,35.8,5455.0,4.467,0.927,0,0,0,0
1,54.418383,0.586,4.50700,874.8,2.83,25.8,5455.0,4.467,0.927,0,0,0,0
2,19.899140,0.969,1.78220,10829.0,14.60,76.3,5853.0,4.544,0.868,0,0,0,0
3,1.736952,1.276,2.40641,8079.2,33.46,505.6,5805.0,4.564,0.791,0,0,1,0
4,2.525592,0.701,1.65450,603.3,2.75,40.9,6031.0,4.438,1.046,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9559,0.527699,1.252,3.22210,1579.2,29.35,453.3,5638.0,4.529,0.903,0,1,1,0
9560,1.739849,0.043,3.11400,48.5,0.72,10.6,6119.0,4.444,1.031,0,0,0,0
9561,0.681402,0.147,0.86500,103.6,1.07,12.3,6173.0,4.447,1.041,0,1,0,0
9562,333.486169,0.214,3.19900,639.1,19.30,14.0,4989.0,2.992,7.824,0,0,0,0


## Split Data

In [142]:
# split data into training and testing datasets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1, stratify = y)

In [143]:
x_train

Unnamed: 0,koi_period,koi_impact,koi_duration,koi_depth,koi_prad,koi_model_snr,koi_steff,koi_slogg,koi_srad,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ss,koi_fpflag_ec
4480,2.150523,0.0303,3.44700,115.5,1.17,11.0,5780.0,4.438,1.000,0,1,0,1
4603,1.381257,0.3510,1.12500,92.3,1.10,11.0,5861.0,4.385,1.131,1,0,0,0
8971,3.093832,0.3560,1.57600,373.7,1.32,9.7,5327.0,4.646,0.681,0,0,0,0
8938,7.448376,0.3500,4.23892,152670.0,34.37,1837.9,5636.0,4.489,0.867,0,0,1,0
9361,185.830130,0.0370,10.14000,72.6,1.01,8.1,6137.0,4.286,1.187,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6960,1.807743,0.9062,2.63700,592.6,5.29,15.1,5164.0,3.880,1.779,0,0,1,1
6725,77.395279,0.1748,5.26000,437.0,6.16,7.9,5158.0,3.541,2.995,1,0,0,0
8920,6.208918,0.6977,2.29200,1664.0,9.20,8.4,5548.0,3.797,2.113,0,1,0,0
7816,13.153841,0.3900,11.56000,104.6,1.03,10.6,5443.0,4.360,1.008,0,0,0,0


In [28]:
x_test

Unnamed: 0,koi_period,koi_impact,koi_duration,koi_depth,koi_prad,koi_model_snr,koi_steff,koi_slogg,koi_srad,koi_fpflag_nt,koi_fpflag_co,koi_fpflag_ss,koi_fpflag_ec
9066,0.933697,0.5700,4.4440,81.4,0.78,10.8,5619.0,4.558,0.838,0,0,0,1
3891,1.691194,0.9030,9.4790,2801.0,6.36,26.7,5780.0,4.438,1.000,1,1,0,0
4905,57.059999,0.1940,7.4630,66168.0,22.31,751.5,5931.0,4.545,0.863,0,0,1,0
5835,1.891299,0.6990,5.2940,82.0,0.86,29.8,5806.0,4.507,0.893,0,1,0,1
218,11.523067,0.0000,3.8757,507.3,3.13,142.0,5644.0,4.162,1.411,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1362,1.907811,0.4100,1.8707,120.5,0.91,40.9,5435.0,4.578,0.824,0,0,0,0
503,8.360616,0.2100,7.5385,28066.0,40.35,425.6,5463.0,3.790,2.445,0,0,1,0
5823,2.696336,1.1410,5.2380,238.6,15.78,26.5,6106.0,4.496,0.923,0,1,1,1
9149,67.412998,0.5109,9.1000,126.1,1.57,6.2,6525.0,4.248,1.359,0,0,0,0


## Train Model

In [None]:
# number crunch
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 250, random_state = 1)
rf.fit(x_train, y_train)

y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)

## Summary Statistics

In [141]:
# display data
rf_results = pd.DataFrame([pd.Series(y_test).map({ 1: "Yes", 0: "No"}).reset_index(drop = True), pd.Series(y_test_pred).map({ 1: "Yes", 0: "No"})]).transpose()
rf_results.columns = ["Actual", "Predicted"]

print("Is exoplanet?")
rf_results

Is exoplanet?


Unnamed: 0,Actual,Predicted
0,No,No
1,No,No
2,Yes,Yes
3,No,No
4,Yes,Yes
...,...,...
1908,Yes,Yes
1909,Yes,Yes
1910,No,No
1911,Yes,Yes


In [81]:
# final statistics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, log_loss

rf_results = pd.DataFrame(["Random Forest Classifier", roc_auc_score(y_test, y_test_pred), accuracy_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)]).transpose()
rf_results.columns = ["Method", "ROC_AUC", "Accuracy", "F-Score"]

rf_results

Unnamed: 0,Method,ROC_AUC,Accuracy,F-Score
0,Random Forest Classifier,0.99161,0.991636,0.991516
