In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
# Turn off pandas' chained-assignment check for the whole session, so
# SettingWithCopyWarning is never emitted when a column is assigned on a
# frame that was derived from another frame.
pd.set_option('mode.chained_assignment', None)

In [2]:
# Load the pre-split data; the first CSV column becomes the row index.
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

# Columns that are kept out of the feature matrices.
# Y_columns: candidate targets ('koi_score' is the one the regressors are fit on).
# misc_columns: additional non-feature columns (presumably identifiers/metadata;
# confirm against the dataset description).
Y_columns = ['koi_disposition', 'koi_pdisposition', 'koi_score']
misc_columns = ['kepid', 'kepoi_name', 'kepler_name', 'koi_tce_delivname']
non_feature_columns = Y_columns + misc_columns

train_X = train_df.drop(columns=non_feature_columns)
# Explicit copies: a later cell assigns label-encoded columns into train_Y and
# test_Y, so they must be independent frames rather than selections of
# train_df / test_df (writing into a selection is what triggers
# SettingWithCopyWarning).
train_Y = train_df[non_feature_columns].copy()

test_X = test_df.drop(columns=non_feature_columns)
test_Y = test_df[non_feature_columns].copy()

In [3]:
# Label-encode every object-dtype column of the Y frames in place.
# One encoder per column is fitted on train and test values together, so both
# splits share the same string -> integer mapping; the fitted encoders are
# kept in `les`, keyed by column name.
les = {}
Y = pd.concat([train_Y, test_Y])
for col, dtype in Y.dtypes.items():
    if dtype == 'object':
        les[col] = LabelEncoder().fit(Y[col])
        # Overwrite the original string columns with their integer codes.
        train_Y[col] = les[col].transform(train_Y[col])
        test_Y[col] = les[col].transform(test_Y[col])

### PCA

In [4]:
# First pass: an unrestricted PCA on the training features, used only to
# inspect the singular-value spectrum.
pca = PCA().fit(train_X)
# Keep as many components as there are singular values exceeding 1% of the
# first singular value.
count = sum(1 for s in pca.singular_values_ if s/pca.singular_values_[0] > 0.01)
# Second pass: refit with that component count on the training split only,
# then project both splits with the same fitted transform.
pca_trans = PCA(n_components=count).fit(train_X)
trans_train_X = pca_trans.transform(train_X)
trans_test_X = pca_trans.transform(test_X)
print(count)

4


### KOI Score - Disposition Score | With PCA

In [5]:
# Decision tree on the PCA-projected features.
# random_state is pinned so that Restart & Run All reproduces the same tree;
# with the default (None) the fit draws fresh randomness each time and the
# printed scores drift between runs. The recorded output of this cell came
# from an unseeded run, so re-executing will print different numbers once.
m = DecisionTreeRegressor(random_state=42)
# cross_val_score evaluates clones of m on 5 folds of the training split;
# m itself is still unfitted afterwards.
cvs = cross_val_score(m, trans_train_X, train_Y['koi_score'], cv=5)
# Fit on the full training split and score on the held-out test split.
m.fit(trans_train_X, train_Y['koi_score'])
score = m.score(trans_test_X, test_Y['koi_score'])
# cvs.min() is the worst of the five fold scores, not their mean.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: -0.14515204460771103
Score: -0.09048631057804091


In [6]:
# k-nearest-neighbours regressor (library-default settings) on the
# PCA-projected features.
m = KNeighborsRegressor()
# Five-fold cross-validation over the training split; cross_val_score works
# on clones, so m is still unfitted after this line.
cvs = cross_val_score(estimator=m, X=trans_train_X, y=train_Y['koi_score'], cv=5)
# Fit on the full training split, then score on the held-out test split.
score = m.fit(trans_train_X, train_Y['koi_score']).score(trans_test_X, test_Y['koi_score'])
# Reports the lowest of the five fold scores next to the test score.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.2787715146863131
Score: 0.2867274295892964


In [7]:
# Random forest on the PCA-projected features.
# random_state is pinned so that Restart & Run All reproduces the same forest;
# with the default (None) the bootstrap samples and split candidates are drawn
# afresh on every fit and the printed scores drift between runs. The recorded
# output of this cell came from an unseeded run, so re-executing will print
# different numbers once.
m = RandomForestRegressor(random_state=42)
# cross_val_score evaluates clones of m on 5 folds of the training split;
# m itself is still unfitted afterwards.
cvs = cross_val_score(m, trans_train_X, train_Y['koi_score'], cv=5)
# Fit on the full training split and score on the held-out test split.
m.fit(trans_train_X, train_Y['koi_score'])
score = m.score(trans_test_X, test_Y['koi_score'])
# cvs.min() is the worst of the five fold scores, not their mean.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.3548258961353795
Score: 0.3876047228576994


### KOI Score - Disposition Score | Without PCA

In [8]:
# Decision tree on the raw (un-projected) features.
# random_state is pinned so that Restart & Run All reproduces the same tree;
# with the default (None) the fit draws fresh randomness each time and the
# printed scores drift between runs. The recorded output of this cell came
# from an unseeded run, so re-executing will print different numbers once.
m = DecisionTreeRegressor(random_state=42)
# cross_val_score evaluates clones of m on 5 folds of the training split;
# m itself is still unfitted afterwards.
cvs = cross_val_score(m, train_X, train_Y['koi_score'], cv=5)
# Fit on the full training split and score on the held-out test split.
m.fit(train_X, train_Y['koi_score'])
score = m.score(test_X, test_Y['koi_score'])
# cvs.min() is the worst of the five fold scores, not their mean.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.8890250672698857
Score: 0.8971120004510029


In [9]:
# k-nearest-neighbours regressor (library-default settings) on the raw
# (un-projected) features.
m = KNeighborsRegressor()
# Five-fold cross-validation over the training split; cross_val_score works
# on clones, so m is still unfitted after this line.
cvs = cross_val_score(estimator=m, X=train_X, y=train_Y['koi_score'], cv=5)
# Fit on the full training split, then score on the held-out test split.
score = m.fit(train_X, train_Y['koi_score']).score(test_X, test_Y['koi_score'])
# Reports the lowest of the five fold scores next to the test score.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.3433131477492969
Score: 0.379687033794619


In [10]:
# Random forest on the raw (un-projected) features.
# random_state is pinned so that Restart & Run All reproduces the same forest;
# with the default (None) the bootstrap samples and split candidates are drawn
# afresh on every fit and the printed scores drift between runs. The recorded
# output of this cell came from an unseeded run, so re-executing will print
# different numbers once.
m = RandomForestRegressor(random_state=42)
# cross_val_score evaluates clones of m on 5 folds of the training split;
# m itself is still unfitted afterwards.
cvs = cross_val_score(m, train_X, train_Y['koi_score'], cv=5)
# Fit on the full training split and score on the held-out test split.
m.fit(train_X, train_Y['koi_score'])
score = m.score(test_X, test_Y['koi_score'])
# cvs.min() is the worst of the five fold scores, not their mean.
print(f'Cross Validation Score: {cvs.min()}\nScore: {score}')

Cross Validation Score: 0.9432490507274918
Score: 0.9547008290173025
