In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Loading Data

In [None]:
# Location of the exoplanet CSV, relative to the notebook's working directory.
filename = '../data/exoplanet_data.csv'

# Read the raw table and preview its first five rows as the cell output.
raw_planets_df = pd.read_csv(filepath_or_buffer=filename)
raw_planets_df.head(n=5)

## Cleaning Data

In [None]:
# Distinct values of the target column, i.e. the classes the model must predict.
raw_planets_df['koi_disposition'].unique()

In [None]:
# Column count of the raw table (this still includes the label column).
len(raw_planets_df.columns)

In [None]:
# Drop only the columns in which every value is missing; columns with at
# least one real value are kept.
less_raw_planets_df = raw_planets_df.dropna(
    axis='columns',
    how='all',
)

# Column count after the drop, for comparison with the raw count.
len(less_raw_planets_df.columns)

In [None]:
# Keep only fully populated rows: any row with at least one missing value
# is discarded (dropna's default behaviour along the row axis).
planets_df = less_raw_planets_df.dropna(axis=0)
planets_df.head(n=5)

In [None]:
# Optional inspection cell: lists every column name (features and label),
# one per line. Nothing later depends on it, so it can be skipped.
for column_name in list(planets_df):
    print(column_name)

The dataset already contains the columns I need, so no extra feature engineering is required.
I will use `koi_disposition` as the label.

### Selecting Features

In [None]:
# Feature matrix: every column except the label.
selected_features = planets_df.drop(['koi_disposition'], axis=1)

In [None]:
# Target vector: the disposition label for each remaining row.
y = planets_df.loc[:, 'koi_disposition']

## Train-Test-Split

In [None]:
# Split features and labels into train and test sets using scikit-learn's
# default proportions. Fixing random_state seeds the shuffle, so the same
# rows land in the same split on every run of this cell.
X_train, X_test, y_train, y_test = train_test_split(
    selected_features,
    y,
    random_state=42,
    shuffle=True,
)

## Scaling X's

In [None]:
# importing scaling, encoding and categorizing materials
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [None]:
# Learn the per-feature min/max from the training split only, so that no
# information from the test split leaks into the scaling.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Encoding y's: map each string label to an integer code.
# The encoder is fit on the training labels only; transform() on y_test will
# raise a ValueError if the test split contains a label absent from y_train.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

# Categorizing y's: one-hot encode the integer codes.
# num_classes is pinned to the number of classes the encoder learned. Without
# it, to_categorical infers the width from the largest code present, so the
# train and test matrices could end up with different numbers of columns if
# one split happens to lack the highest-numbered class.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=num_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=num_classes)

## Training Model

In [None]:
# borrowed the meat of this cell of code from a in class example w21-d2-e5
# Sweep a range of k values, fit one KNN model per k, and record its accuracy
# on the training and test splits so the two curves can be compared.
train_scores = []
test_scores = []

begin = 1
end = 50
middle = 2  # step between successive k values; starting at 1 this tries only odd k

# Build the range once so the loop and both plot calls share the same x-values.
k_values = range(begin, end, middle)

# testing out different models with different values of k
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit and score on the 1-D integer-encoded labels, not the one-hot matrix.
    # With a 2-D one-hot target scikit-learn treats the task as multilabel:
    # each column is voted on independently (so a row can be predicted as
    # "no class at all") and score() reports subset accuracy rather than
    # ordinary multiclass accuracy.
    knn.fit(X_train_scaled, encoded_y_train)
    train_score = knn.score(X_train_scaled, encoded_y_train)
    test_score = knn.score(X_test_scaled, encoded_y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# then plotting them to find where they level off
# Both curves are drawn, so they are labelled and the y-axis is named neutrally.
plt.plot(k_values, train_scores, marker='o', label="train")
plt.plot(k_values, test_scores, marker="x", label="test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# after choosing my level of k,
# I build my model and test its accuracy on the test scores
knn = KNeighborsClassifier(n_neighbors=15)
# Train and evaluate on the 1-D integer-encoded labels. Passing the one-hot
# matrix instead would make scikit-learn treat this as a multilabel problem
# and report subset accuracy rather than plain multiclass accuracy.
knn.fit(X_train_scaled, encoded_y_train)
print('k=15 Test Acc: %.3f' % knn.score(X_test_scaled, encoded_y_test))