In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Loading Data

In [None]:
# Location of the exoplanet CSV, relative to the directory the notebook runs from.
filename = '../data/exoplanet_data.csv'
# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=filename)
# Preview the first rows to confirm the load worked.
df.head()

## Cleaning Data

In [None]:
# Condensed form of the exploratory clean-up worked out in the KNN file:
# first discard columns that are entirely empty, then discard every row
# that still contains a missing value.
df = (
    df
    .dropna(axis='columns', how='all')
    .dropna()
)
df.head()

## Selecting Features

In [None]:
# Output: the disposition label the model will learn to predict.
y = df['koi_disposition']
# Features: every remaining column of the cleaned frame.
X = df.drop('koi_disposition', axis='columns')

## Train-Test-Split

In [None]:
# Hold out 25% of the rows for testing (train_test_split's default, spelled out here).
# Fixing random_state=42 seeds the shuffle, so every run yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Scaling & Categorizing

In [None]:
# importing scaling, encoding and categorizing materials
# from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each feature's min/max from the training split only, so nothing about
# the test split leaks into the scaling, then apply that same mapping to both.
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Collapse the disposition label into a binary target:
# 1 for CANDIDATE, 0 for every other label (CONFIRMED included).
# Comparing against the label directly, rather than one-hot encoding each
# split with pd.get_dummies and picking a column, keeps the encoding identical
# for train and test, cannot fail when a split happens to lack a label, and
# always yields integers (get_dummies returns booleans on pandas >= 2.0).
y_train_dummied = (y_train == 'CANDIDATE').astype(int)
y_test_dummied = (y_test == 'CANDIDATE').astype(int)

# sanity check: should list only the two class values, 0 and 1
y_train_dummied.unique()

## Training the Model

In [None]:
# Create the logistic-regression classifier with its default settings;
# this only constructs the estimator, it is not fitted here.
model = LogisticRegression()

In [None]:
# Train the classifier on the scaled training features and their encoded labels.
model.fit(X=X_train_scaled, y=y_train_dummied)

In [None]:
# Mean accuracy on each split; a large gap between the two hints at overfitting.
training_accuracy = model.score(X_train_scaled, y_train_dummied)
testing_accuracy = model.score(X_test_scaled, y_test_dummied)
print(f'Training Score: {training_accuracy}')
print(f'Testing Score: {testing_accuracy}')
      

In [None]:
# Predicted class for every row of the scaled test split, kept for later scoring.
predictions = model.predict(X=X_test_scaled)

In [None]:
# Per-class precision / recall / F1 on the test split.
from sklearn.metrics import classification_report

report = classification_report(
    y_true=y_test_dummied,
    y_pred=predictions,
    target_names=["Non Candidate", "Candidate"],
)
print(report)