In [10]:
# Import dependencies:
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


#from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.metrics import balanced_accuracy_score
#from imblearn.metrics import classification_report_imbalanced
#from imblearn.ensemble import EasyEnsembleClassifier

In [11]:
# Load the data:
file_path = Path("../data/CLEAN_KEPLER_DATASET.csv")
exoplanet_df = pd.read_csv(file_path)

# Drop the null columns where all values are null:
#exoplanet_df = exoplanet_df.dropna(axis='columns', how='all')

# Drop the null rows:
#exoplanet_df = exoplanet_df.dropna()

print(exoplanet_df.shape)
exoplanet_df.head()

(9564, 141)


Unnamed: 0,rowid,kepid,kepoi_name,kepler_name,koi_disposition,koi_vet_stat,koi_vet_date,koi_pdisposition,koi_score,koi_fpflag_nt,...,koi_dicco_mdec,koi_dicco_mdec_err,koi_dicco_msky,koi_dicco_msky_err,koi_dikco_mra,koi_dikco_mra_err,koi_dikco_mdec,koi_dikco_mdec_err,koi_dikco_msky,koi_dikco_msky_err
0,1,10797460,K00752.01,Kepler-227 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,0.2,0.16,0.2,0.17,0.08,0.13,0.31,0.17,0.32,0.16
1,2,10797460,K00752.02,Kepler-227 c,CONFIRMED,Done,8/16/2018,CANDIDATE,0.969,0,...,0.0,0.48,0.39,0.36,0.49,0.34,0.12,0.73,0.5,0.45
2,3,10811496,K00753.01,,CANDIDATE,Done,8/16/2018,CANDIDATE,0.0,0,...,-0.034,0.07,0.042,0.072,0.002,0.071,-0.027,0.074,0.027,0.074
3,4,10848459,K00754.01,,FALSE POSITIVE,Done,8/16/2018,FALSE POSITIVE,0.0,0,...,0.147,0.078,0.289,0.079,-0.257,0.072,0.099,0.077,0.276,0.076
4,5,10854555,K00755.01,Kepler-664 b,CONFIRMED,Done,8/16/2018,CANDIDATE,1.0,0,...,-0.09,0.18,0.1,0.14,0.07,0.18,0.02,0.16,0.07,0.2


In [7]:

#exoplanet_df['sy_pnum'].value_counts()

In [8]:
# Count distinct values in "soltype" column:
exoplanet_df['koi_disposition'].value_counts()

FALSE POSITIVE    4839
CONFIRMED         2669
CANDIDATE         2056
Name: koi_disposition, dtype: int64

In [9]:
# Create a new dataframe by selecting desired columns:
new_exoplanet_df = exoplanet_df[["sy_snum", "sy_pnum",
                                 "pl_orbper", "koi_prad", "koi_disposition", "koi_teq", 
                                 "st_teff", "st_rad", "st_mass", "st_logg"]]


print(new_exoplanet_df.shape)
new_exoplanet_df.head()

KeyError: "['sy_snum', 'sy_pnum', 'pl_orbper', 'pl_rade', 'pl_eqt', 'st_teff', 'st_rad', 'st_mass', 'st_logg'] not in index"

In [None]:
# Dropna from new dataframe:
new_exoplanet_df= new_exoplanet_df.dropna()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

In [None]:
# Drop duplicates:
new_exoplanet_df = new_exoplanet_df.drop_duplicates()
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

In [None]:
# Count distinct values in "soltype" column
new_exoplanet_df['soltype'].value_counts()

In [None]:
# Drop planets with more than 1 star:
new_exoplanet_df.drop(new_exoplanet_df.index[new_exoplanet_df['sy_snum'] > 1], inplace=True)
print(new_exoplanet_df.shape)
new_exoplanet_df.head()

In [None]:
# Drop the "sy_snum" column as it now only has 1 unique value:
clean_exoplanet_df = new_exoplanet_df.drop(columns=['sy_snum'])
print(clean_exoplanet_df.shape)
clean_exoplanet_df.head()

In [None]:
#
clean_exoplanet_df['soltype'].value_counts()

In [None]:
# Replace values in 'soltype' with 1 or 0:

clean_exoplanet_df['soltype'].replace('Published Confirmed', 1, inplace=True)
clean_exoplanet_df[clean_exoplanet_df['soltype'] != 1] = 0 
clean_exoplanet_df

In [None]:
clean_exoplanet_df['soltype'].value_counts()

In [None]:
clean_exoplanet_df['soltype'] = clean_exoplanet_df['soltype'].astype(str).astype(int)

In [None]:
clean_exoplanet_df.columns = ['Number of Planets',
                              'Planet Orbit', 'Planet Radius', 'Planet Status', 'Equalibrium Temp',
                              'Star Temp', 'Star Radius', 'Star Mass', 'Star Gravity']

clean_exoplanet_df.head()

In [None]:
#clean_exoplanet_df['Number of Planets'].value_counts()

In [None]:
# Export the Dataframe as a new CSV file without the index.
#clean_exoplanet_df.to_csv("clean_exoplanet_df.csv", index=False)
clean_exoplanet_df.dtypes

# Mockup/ Draft Model
# Split the Data into Training and Testing

### Create a Supervised Machine Learning model, use Logistic Regression // classification 1 or 0.

In [None]:
# Create our features:
X = clean_exoplanet_df.drop("Planet Status", axis=1)

# Create our target:
y = clean_exoplanet_df["Planet Status"]

In [None]:
# Split into Train and Test Sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size= 0.33)

In [None]:
# Train the model:
lm = LogisticRegression()
lm.fit(X_train, y_train)

In [None]:
# Validate the model:
from sklearn.metrics import classification_report

predict = lm.predict(X_test)

with np.printoptions(threshold=np.inf):
    print(predict)

In [None]:
#predictions = lm.predict(X_test)
t1 = pd.DataFrame({"Prediction": predict, "Actual": y_test})
t1
#t1.nunique().value_counts()


In [None]:
#t1['Actual'].value_counts()

In [None]:
print(classification_report(y_test, predict))