In [15]:
# Import dependencies
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
import joblib

In [26]:
# Load the Kepler exoplanet table and clean it for modelling.
#
# The data directory defaults to the original author's location but can be
# overridden with the KEPLER_DATA_DIR environment variable so the notebook
# can run on other machines.
# NOTE: os.chdir is kept on purpose -- later cells write their outputs
# (the evaluation CSV and the saved model) relative to the working directory.
DATA_DIR = os.environ.get(
    "KEPLER_DATA_DIR",
    r"C:\Users\raamt\Desktop\Kepler Exoplanet Search\Resource",
)
os.chdir(DATA_DIR)
data = pd.read_csv("Exoplanet_data.csv")

# Drop columns that are entirely null
data = data.dropna(axis='columns', how='all')

# Drop every row that still contains a null value
data = data.dropna()

# Convert int64 columns to float64 with a single astype call
# (the frame is copied once instead of once per integer column).
int64_to_float = {
    column: 'float64'
    for column, dtype in data.dtypes.items()
    if dtype == 'int64'
}
data = data.astype(int64_to_float)
data.sample(5)

Unnamed: 0,koi_disposition,koi_fpflag_nt,koi_fpflag_ss,koi_fpflag_co,koi_fpflag_ec,koi_period,koi_period_err1,koi_period_err2,koi_time0bk,koi_time0bk_err1,...,koi_steff_err2,koi_slogg,koi_slogg_err1,koi_slogg_err2,koi_srad,koi_srad_err1,koi_srad_err2,ra,dec,koi_kepmag
9265,FALSE POSITIVE,0.0,0.0,1.0,0.0,20.004714,0.00147,-0.00147,139.0639,0.0165,...,-159.0,4.534,0.043,-0.119,0.848,0.138,-0.074,293.7041,45.892509,14.437
1526,CONFIRMED,0.0,0.0,0.0,0.0,3.292768,1.2e-05,-1.2e-05,133.00741,0.00312,...,-104.0,4.108,0.195,-0.09,1.472,0.229,-0.344,290.06049,47.16396,12.727
5155,CANDIDATE,0.0,0.0,0.0,0.0,2.508116,1.5e-05,-1.5e-05,133.98497,0.00524,...,-46.0,2.435,0.117,-0.143,8.769,3.319,-1.106,292.95416,41.934799,15.357
5950,FALSE POSITIVE,0.0,1.0,0.0,0.0,1.01836,1e-06,-1e-06,131.67768,0.00182,...,-237.0,4.144,0.273,-0.168,1.463,0.419,-0.419,292.1795,39.238621,14.299
5979,FALSE POSITIVE,1.0,0.0,0.0,0.0,14.364154,0.000267,-0.000267,144.2631,0.0162,...,-166.0,4.364,0.13,-0.238,1.033,0.355,-0.161,294.57065,39.331402,14.016


In [21]:
# Target is koi_disposition; the classes seen in this dataset are
# CONFIRMED / FALSE POSITIVE / CANDIDATE. Every other column is a feature.
y = data["koi_disposition"]
X = data.drop(columns=["koi_disposition"])

# Stratified train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [22]:
# Fit the min-max scaler on the training features only, then apply the
# same fitted scaler to both the training and the testing features.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

In [32]:
# Baseline model: logistic regression on the scaled training features.
LogReg = LogisticRegression(solver='newton-cg', multi_class='auto').fit(
    X_train_scaled, y_train
)

# Accuracy on each split, expressed as a percentage rounded to 3 decimals.
LogReg_training_score = round(100 * LogReg.score(X_train_scaled, y_train), 3)
base_accuracy = round(100 * LogReg.score(X_test_scaled, y_test), 3)

print(
    f"Training Data Score: {LogReg_training_score} %",
    f"Testing Data Score: {base_accuracy} %",
    sep="\n",
)

Training Data Score: 85.499 %
Testing Data Score: 84.401 %


In [28]:
# Rank features with RFECV (recursive feature elimination with
# cross-validation), removing one feature per step over 5 folds.
feature_names = list(X.columns)
selector = RFECV(
    estimator=LogReg,
    step=1,
    cv=5,
)
_ = selector.fit(X=X_train_scaled, y=y_train)

In [29]:
# Pair every feature name with its RFECV ranking, ordered by ranking and
# then by name, and show the result as a table indexed by feature.
preSelected_features = sorted(zip(selector.ranking_, feature_names))
ranked_features = pd.DataFrame(
    preSelected_features,
    columns=['Ranking', 'Feature'],
).set_index('Feature')
ranked_features

Unnamed: 0_level_0,Ranking
Feature,Unnamed: 1_level_1
dec,1
koi_depth,1
koi_duration,1
koi_duration_err1,1
koi_duration_err2,1
koi_fpflag_co,1
koi_fpflag_ec,1
koi_fpflag_nt,1
koi_fpflag_ss,1
koi_impact,1


In [33]:
# Keep every feature whose RFECV ranking is at most MAX_FEATURE_RANKING.
# The threshold lives in one named constant so the comment and the code
# cannot drift apart (it was previously written as both "> 16" and "< 17").
# NOTE(review): RFECV gives rank 1 to the features it selects, so a
# threshold above 1 also keeps features it eliminated -- confirm that
# 16 is the intended cut-off.
MAX_FEATURE_RANKING = 16
selected_features = [
    feature
    for ranking, feature in preSelected_features
    if ranking <= MAX_FEATURE_RANKING
]

In [34]:
# Restrict the train/test frames to the selected features; every model
# from here on is trained on this reduced feature set.
X_train_select, X_test_select = X_train[selected_features], X_test[selected_features]

# Re-fit the scaler on the reduced training frame.
# NOTE: this rebinds X_scaler, X_train_scaled and X_test_scaled, so cells
# that run after this one see the selected-feature versions of those names.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train_select)
X_train_scaled = X_scaler.transform(X_train_select)
X_test_scaled = X_scaler.transform(X_test_select)

# Train a fresh logistic regression on the selected features.
model_2 = LogisticRegression(solver='newton-cg', multi_class='auto').fit(
    X_train_scaled, y_train
)

# Accuracy on each split, expressed as a percentage rounded to 3 decimals.
model_2_training_score = round(100 * model_2.score(X_train_scaled, y_train), 3)
select_features_accuracy = round(100 * model_2.score(X_test_scaled, y_test), 3)

print(
    f"Training Data Score: {model_2_training_score} %",
    f"Testing Data Score: {select_features_accuracy} %",
    sep="\n",
)

Training Data Score: 85.499 %
Testing Data Score: 84.401 %


In [39]:
# Hyper-parameter search space: ten values of C spaced logarithmically
# between 10**0 and 10**4, with the L2 penalty.
param_grid = {
    'C': np.logspace(0, 4, 10),
    'penalty': ['l2'],
}

# Estimator to tune, wrapped in a 5-fold cross-validated grid search.
model_3 = LogisticRegression(solver='newton-cg', multi_class='auto')
grid = GridSearchCV(
    estimator=model_3,
    param_grid=param_grid,
    cv=5,
    verbose=0,
)

# Run the search on the scaled training data.
_ = grid.fit(X_train_scaled, y_train)

In [38]:
# Pull the best hyper-parameters found by the grid search.
C, penalty = grid.best_params_['C'], grid.best_params_['penalty']

# Fit a new logistic regression with those hyper-parameters.
tuned_model = LogisticRegression(
    solver='newton-cg',
    multi_class='auto',
    penalty=penalty,
    C=C,
)
tuned_model.fit(X_train_scaled, y_train)

# Accuracy on each split, expressed as a percentage rounded to 3 decimals.
model_3_training_score = round(100 * tuned_model.score(X_train_scaled, y_train), 3)
tuned_accuracy = round(100 * tuned_model.score(X_test_scaled, y_test), 3)

print(
    f"Training Data Score: {model_3_training_score} %",
    f"Testing Data Score: {tuned_accuracy} %",
    sep="\n",
)

Training Data Score: 88.808 %
Testing Data Score: 88.518 %


In [41]:
# Predict on the held-out test set with the tuned model.
predictions = tuned_model.predict(X_test_scaled)
classifications = list(y_test.unique())

# Put actual and predicted labels side by side. The original row labels
# carried over from y_test are discarded in favour of a fresh 0..n-1 index.
prediction_actual = dict(Actual=y_test, Prediction=predictions)
PA_df = pd.DataFrame(prediction_actual).reset_index(drop=True)
PA_df.head(15)

Unnamed: 0,Actual,Prediction
0,FALSE POSITIVE,FALSE POSITIVE
1,CONFIRMED,CONFIRMED
2,FALSE POSITIVE,FALSE POSITIVE
3,FALSE POSITIVE,FALSE POSITIVE
4,CONFIRMED,CONFIRMED
5,FALSE POSITIVE,FALSE POSITIVE
6,FALSE POSITIVE,FALSE POSITIVE
7,CONFIRMED,CONFIRMED
8,CONFIRMED,CONFIRMED
9,CANDIDATE,CANDIDATE


In [43]:
# Summarise the test accuracy of the three models in one table, write it
# to a CSV file in the working directory, and display it.
evaluations = {
    '': ['Base Model', 'Select Features Model', 'Tuned Model'],
    'Accuracy': [
        f"{score}%"
        for score in (base_accuracy, select_features_accuracy, tuned_accuracy)
    ],
}

evaluations_df = pd.DataFrame(evaluations).set_index('')
evaluations_df.to_csv('LogisticRegression_eval.csv')

evaluations_df

Unnamed: 0,Accuracy
,
Base Model,84.401%
Select Features Model,84.401%
Tuned Model,88.518%


In [45]:
# Persist the tuned model with joblib; the relative file name resolves
# against the current working directory.
filename = 'OtherModel_LogisticRegression.sav'
_ = joblib.dump(value=tuned_model, filename=filename)