### This notebook uses gradient boosting (XGBoost) to predict whether an object observed by the Kepler Space Observatory is an exoplanet candidate, based on the characteristics recorded for it

Dataset from:

https://www.kaggle.com/nasa/kepler-exoplanet-search-results

Data dictionary:

https://exoplanetarchive.ipac.caltech.edu/docs/API_kepcandidate_columns.html

### Install the necessary packages

In [None]:
# Optional setup: uncomment the lines below to install/upgrade the packages this
# notebook imports (pandas, scikit-learn, xgboost) when they are missing.
# !pip3 install pandas --upgrade
# !pip3 install scikit-learn --upgrade
# !pip3 install xgboost --upgrade

In [None]:
import pandas as pd

### Import Kepler Exoplanet dataset

In [None]:
# Read the raw Kepler table from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='cumulative.csv')
df.head(n=5)

### Imputation of missing values

In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Imputer that replaces every NaN with the constant 0.
# The former `verbose=0` argument is intentionally omitted: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises a TypeError.
# Leaving it out keeps the same behaviour on older releases (0 was the default).
# NOTE(review): recent scikit-learn releases may drop columns that are entirely
# NaN unless `keep_empty_features=True` is passed — confirm against the
# installed version, since a later cell reuses `df.columns` for the output.
impute_zeros = SimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value=0,
    copy=True
)

In [None]:
# Fit the constant-fill imputer on the full raw frame.
impute_zeros.fit(df)

In [None]:
# Apply the fitted imputer to the raw frame and rebuild a DataFrame that carries
# the original column names (the transform result is passed to from_records).
df_imputed = pd.DataFrame.from_records(
    impute_zeros.transform(df),
    columns=df.columns,
)

### Removing columns

In [None]:
# Discard the columns that are neither model features nor the target
# ('koi_pdisposition' is kept and used as the target further down).
df_rmcolumns = df_imputed.drop(
    columns=[
        'rowid',
        'kepid',
        'kepoi_name',
        'kepler_name',
        'koi_tce_delivname',
        'koi_disposition',
    ]
)

In [None]:
# Display the remaining column names so the result of the drop can be checked.
df_rmcolumns.columns

In [None]:
# Model inputs, in the exact column order used for training and for the exported
# evaluation file. Most quantities are listed together with their *_err1/*_err2
# companion columns, one quantity per line.
features = [
    'koi_score',
    'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec',
    'koi_period', 'koi_period_err1', 'koi_period_err2',
    'koi_time0bk', 'koi_time0bk_err1', 'koi_time0bk_err2',
    'koi_impact', 'koi_impact_err1', 'koi_impact_err2',
    'koi_duration', 'koi_duration_err1', 'koi_duration_err2',
    'koi_depth', 'koi_depth_err1', 'koi_depth_err2',
    'koi_prad', 'koi_prad_err1', 'koi_prad_err2',
    'koi_teq', 'koi_teq_err1', 'koi_teq_err2',
    'koi_insol', 'koi_insol_err1', 'koi_insol_err2',
    'koi_model_snr', 'koi_tce_plnt_num',
    'koi_steff', 'koi_steff_err1', 'koi_steff_err2',
    'koi_slogg', 'koi_slogg_err1', 'koi_slogg_err2',
    'koi_srad', 'koi_srad_err1', 'koi_srad_err2',
    'ra', 'dec', 'koi_kepmag',
]

# Kept as a one-element list so that `y` is a single-column DataFrame.
target = ['koi_pdisposition']

X = df_rmcolumns[features]
y = df_rmcolumns[target]


### Splitting the data set into 60% for training, 30% for testing, and 10% for evaluation.

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 60% of the rows for training and hold out the remaining 40%.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.40, random_state=42
)

# Stage 2: split the 40% hold-out 75/25, giving 30% of all rows for testing
# and 10% for evaluation.
X_test, X_eval, y_test, y_eval = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=42
)

### Saving the file for evaluation

In [None]:
# Write the evaluation features (feature columns only, no index column) to disk.
X_eval.to_csv('evaluate.csv', index=False)

In [None]:
# Unpadded alternative, kept for reference:
# y_eval.to_csv('answer_sheet.csv', index=False, header=False)

# Left-justify every label to a width of 14 characters, then write the labels
# without index or header row.
padded_answers = y_eval.apply(lambda labels: labels.str.ljust(14))
padded_answers.to_csv('ANSWER_SHEET', index=False, header=False)

### Training the model

In [None]:
from xgboost import XGBClassifier

# Hyperparameters for the gradient-boosted classifier.
xgb_params = dict(
    n_estimators=900,
    learning_rate=0.01,
    max_depth=5,
)

# y_train is a single-column DataFrame, so it is flattened to a 1-D array
# before fitting.
# NOTE(review): the target column holds string labels; newer XGBoost releases
# may require integer-encoded classes — confirm against the installed version.
model = XGBClassifier(**xgb_params).fit(X_train, y_train.values.ravel())

In [None]:
# Predict labels for the test split with the fitted model.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test-split labels predicted correctly, reported as a percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {}%".format(100 * round(test_accuracy, 10)))

### Cross-validation

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# 5-fold cross-validated accuracy over the full feature/target set.
# `y` is a single-column DataFrame; it is flattened to a 1-D array, exactly as
# the training cell does for `y_train`, so the estimator receives targets of
# shape (n_samples,) and no column-vector conversion warnings are raised.
scores = cross_val_score(model, X, y.values.ravel(), cv=5, scoring='accuracy')

# Report the mean accuracy with a +/- two-standard-deviation spread across folds.
print("Accuracy: %0.10f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Saving the model

In [None]:
from pickle import dump

# Serialise the fitted model to 'model.dat'. The context manager guarantees the
# file handle is flushed and closed even if pickling raises.
with open('model.dat', 'wb') as model_file:
    dump(model, model_file)