In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
pd.options.mode.chained_assignment = None

In [2]:
# Load the train/test splits; the first CSV column is used as the row index.
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

# Columns that are kept out of the feature matrix: the Y columns plus a few
# miscellaneous columns (presumably identifiers/bookkeeping -- confirm against
# the dataset documentation).
Y_columns = ['koi_disposition', 'koi_pdisposition', 'koi_score']
misc_columns = ['kepid', 'kepoi_name', 'kepler_name', 'koi_tce_delivname']
non_feature_columns = Y_columns + misc_columns

train_X = train_df.drop(columns=non_feature_columns)
# Explicit copy: columns of this frame are overwritten later (label encoding),
# so it must be an independent frame rather than a selection of train_df.
train_Y = train_df[non_feature_columns].copy()

test_X = test_df.drop(columns=non_feature_columns)
# Same reasoning as for train_Y.
test_Y = test_df[non_feature_columns].copy()

In [3]:
# Label-encode every object-dtype column of the Y frames. Each encoder is fit
# on the train and test values together, so both splits share one mapping.
# The fitted encoders are kept in `les`, keyed by column name.
les = {}
Y = pd.concat([train_Y, test_Y])
for col, dtype in Y.dtypes.items():
    if dtype != 'object':
        continue  # only object columns are encoded
    encoder = les[col] = LabelEncoder()
    encoder.fit(Y[col])
    train_Y[col] = encoder.transform(train_Y[col])
    test_Y[col] = encoder.transform(test_Y[col])

### PCA

In [4]:
# Fit a PCA with all components, only to inspect the singular values.
pca = PCA().fit(train_X)

# Number of components whose singular value is more than 1% of the first one.
count = sum(
    1
    for s in pca.singular_values_
    if s / pca.singular_values_[0] > 0.01
)

# Refit with that many components and project both splits onto them.
pca_trans = PCA(n_components=count).fit(train_X)
trans_train_X = pca_trans.transform(train_X)
trans_test_X = pca_trans.transform(test_X)
print(count)

4


In [5]:
# Print every singular value relative to the first one, one per line.
for ratio in pca.singular_values_ / pca.singular_values_[0]:
    print(ratio)

1.0
0.37258464653720563
0.11525590394899117
0.06369390705845548
0.003937521975756601
0.0030938744477864264
0.002745615177389313
0.0018206059996874271
0.0015351272795401855
0.0003828646357052927
0.0003043074615647158
0.00017377962283244227
0.00012722143147526796
9.69638877383647e-05
4.967380765021818e-05
4.1999033105870064e-05
2.4923067169129242e-05
2.097955691357579e-05
1.607633104227409e-05
1.3785447770786843e-05
5.040022270275944e-06
3.1235082582259057e-06
2.8211103367709467e-06
2.3880735124608076e-06
1.9380300823707857e-06
1.8638722539147122e-06
1.7356892245824735e-06
1.440219541249027e-06
1.3462717018573192e-06
1.1057498268864805e-06
1.0604167795667362e-06
8.766701949831546e-07
3.865120058778914e-07
2.253968743853631e-07
1.0644558978175047e-07
2.20229044118726e-08
6.729891365778897e-17
6.729891365778897e-17
6.729891365778897e-17
6.729891365778897e-17


### Conclusion

At first glance, PCA looks like a viable way to reduce the features to a much smaller dimension: only the first four singular values exceed 1% of the largest one, which suggests that most components carry very little of the variance.

### KOI Score - Disposition Score | With PCA

In [6]:
# Decision tree regressor on the PCA-reduced features, target: koi_score.
# random_state is fixed because the tree builder is randomized; without a
# seed the scores below change from run to run and cannot be reproduced.
m = DecisionTreeRegressor(random_state=0)
cvs = cross_val_score(m, trans_train_X, train_Y['koi_score'], cv=5)
m.fit(trans_train_X, train_Y['koi_score'])
score = m.score(trans_test_X, test_Y['koi_score'])
# The "Cross Validation Score" line reports the lowest of the 5 fold scores;
# "Score" is the score on the held-out test split.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: -0.1885149144241418
Score: -0.07327757838979743


In [7]:
# k-nearest-neighbors regressor on the PCA-reduced features, target: koi_score.
m = KNeighborsRegressor()
cvs = cross_val_score(
    estimator=m,
    X=trans_train_X,
    y=train_Y['koi_score'],
    cv=5,
)
# Fit on the full training split, then evaluate on the test split.
score = m.fit(trans_train_X, train_Y['koi_score']).score(
    trans_test_X, test_Y['koi_score']
)
# The first printed value is the lowest of the 5 fold scores.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.2787715146863131
Score: 0.2867274295892964


In [8]:
# Random forest regressor on the PCA-reduced features, target: koi_score.
# random_state is fixed because the forest is randomized; without a seed the
# scores below change from run to run and cannot be reproduced.
m = RandomForestRegressor(random_state=0)
cvs = cross_val_score(m, trans_train_X, train_Y['koi_score'], cv=5)
m.fit(trans_train_X, train_Y['koi_score'])
score = m.score(trans_test_X, test_Y['koi_score'])
# The "Cross Validation Score" line reports the lowest of the 5 fold scores;
# "Score" is the score on the held-out test split.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.36707725043203965
Score: 0.3961136896596815


### Conclusion

Using PCA hurts every regressor's score (compare with the results without PCA below), presumably because every feature carries information that is important in determining whether a potential exoplanet is indeed an exoplanet.

### KOI Score - Disposition Score | Without PCA

In [9]:
# Decision tree regressor on the original (non-PCA) features, target: koi_score.
# random_state is fixed because the tree builder is randomized; without a
# seed the scores below change from run to run and cannot be reproduced.
m = DecisionTreeRegressor(random_state=0)
cvs = cross_val_score(m, train_X, train_Y['koi_score'], cv=5)
m.fit(train_X, train_Y['koi_score'])
score = m.score(test_X, test_Y['koi_score'])
# The "Cross Validation Score" line reports the lowest of the 5 fold scores;
# "Score" is the score on the held-out test split.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.8954566034912395
Score: 0.8982989627431021


In [10]:
# k-nearest-neighbors regressor on the original (non-PCA) features,
# target: koi_score.
m = KNeighborsRegressor()
cvs = cross_val_score(
    estimator=m,
    X=train_X,
    y=train_Y['koi_score'],
    cv=5,
)
# Fit on the full training split, then evaluate on the test split.
score = m.fit(train_X, train_Y['koi_score']).score(
    test_X, test_Y['koi_score']
)
# The first printed value is the lowest of the 5 fold scores.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.3433131477492969
Score: 0.379687033794619


In [11]:
# Random forest regressor on the original (non-PCA) features, target: koi_score.
# random_state is fixed because the forest is randomized; without a seed the
# scores below change from run to run and cannot be reproduced.
m = RandomForestRegressor(random_state=0)
cvs = cross_val_score(m, train_X, train_Y['koi_score'], cv=5)
m.fit(train_X, train_Y['koi_score'])
score = m.score(test_X, test_Y['koi_score'])
# The "Cross Validation Score" line reports the lowest of the 5 fold scores;
# "Score" is the score on the held-out test split.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.9433921490761643
Score: 0.9559849071249745


### Conclusion

The random forest regressor, like the classifier, is unsurprisingly the best regressor for predicting the KOI score. It makes sense that the decision tree regressor is the second best, because the KOI score is an amalgamation of predictors that relates directly to classification through a tree-like structure. Lastly, like the classifier, the k-nearest-neighbors regressor was expected to perform poorly, because there are so many dimensions that it's hard to get a meaningful notion of "nearest."