In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt

test_camera = pd.read_csv("galaxy-star-or-quasar/test_sdss_camera.csv", index_col="id")
train_camera = pd.read_csv("galaxy-star-or-quasar/train_sdss_camera.csv", index_col="id")
test_position = pd.read_csv("galaxy-star-or-quasar/test_sky_position.csv", index_col="id")
train_position = pd.read_csv("galaxy-star-or-quasar/train_sky_position.csv", index_col="id")

# Merging
train_set = train_camera.merge(train_position, on='id', how='left')
test_set = test_camera.merge(test_position, on='id', how='left')

# Dummies for Sky_Zone
sz_dum = pd.get_dummies(train_set['sky_zone'])
sz_dum_t = pd.get_dummies(test_set['sky_zone'])

# Merging dummies to dfs
train_set = train_set.merge(sz_dum, left_on='id', right_on='id')
test_set = test_set.merge(sz_dum_t, left_on='id', right_on='id')

#dropping Sky_Zone
train_set = train_set.drop('sky_zone', axis=1)
test_set = test_set.drop('sky_zone', axis=1)

# Dropping constant variables
train_set = train_set.drop(columns=['rerun'])
test_set = test_set.drop(columns=['rerun'])

In [2]:
# Replacing missing values in u,g,r,i,z variables by the mean of each of these variables. 

train_set['u'] = train_set['u'].fillna(train_set['u'].mean())
train_set['g'] = train_set['g'].fillna(train_set['g'].mean())
train_set['r'] = train_set['r'].fillna(train_set['r'].mean())
train_set['i'] = train_set['i'].fillna(train_set['i'].mean())
train_set['z'] = train_set['z'].fillna(train_set['z'].mean())

test_set['u'] = test_set['u'].fillna(test_set['u'].mean())
test_set['g'] = test_set['g'].fillna(test_set['g'].mean())
test_set['r'] = test_set['r'].fillna(test_set['r'].mean())
test_set['i'] = test_set['i'].fillna(test_set['i'].mean())
test_set['z'] = test_set['z'].fillna(test_set['z'].mean())

In [3]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18000 entries, 1 to 18000
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   u            18000 non-null  float64
 1   g            18000 non-null  float64
 2   r            18000 non-null  float64
 3   i            18000 non-null  float64
 4   z            18000 non-null  float64
 5   run          18000 non-null  int64  
 6   camcol       18000 non-null  int64  
 7   field        18000 non-null  int64  
 8   ra           17140 non-null  float64
 9   dec          18000 non-null  float64
 10  object_type  18000 non-null  object 
 11  A            18000 non-null  uint8  
 12  B            18000 non-null  uint8  
 13  C            18000 non-null  uint8  
 14  D            18000 non-null  uint8  
 15  E            18000 non-null  uint8  
 16  F            18000 non-null  uint8  
 17  G            18000 non-null  uint8  
 18  H            18000 non-null  uint8  
 19  I   

In [4]:
# I've noticed that for each value of 'ra', doing 'dec'*'run'/1000 approaches quite well the value of 'ra'.
# -> Imputing NaNs in the 'ra' column with 'dec'*'run'/1000 assuming linear relationship. 

train_set.ra.fillna(abs(train_set.dec*train_set.run)/1000, inplace=True)
test_set.ra.fillna(abs(test_set.dec*test_set.run)/1000, inplace=True)

In [5]:
train_set.head(5)

Unnamed: 0_level_0,u,g,r,i,z,run,camcol,field,ra,dec,...,A,B,C,D,E,F,G,H,I,J
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,18.45886,17.24552,16.65724,16.27922,16.07007,3712,5,198,214.002267,54.488814,...,0,0,0,0,0,0,0,0,1,0
2,18.50274,17.50821,17.19507,17.07868,17.08054,3705,6,131,220.688333,53.185834,...,0,0,0,0,0,0,0,0,1,0
3,18.69451,17.30333,16.48301,16.0557,15.75763,5323,5,140,249.118754,12.091483,...,0,0,0,0,1,0,0,0,0,0
4,19.34373,18.32701,17.97602,17.82627,17.7577,8108,4,57,330.633293,18.920049,...,0,0,1,0,0,0,0,0,0,0
5,19.33328,18.1379,17.5818,17.21341,16.99397,4187,2,107,3.406978,-0.813704,...,0,0,1,0,0,0,0,0,0,0


In [6]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18000 entries, 1 to 18000
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   u            18000 non-null  float64
 1   g            18000 non-null  float64
 2   r            18000 non-null  float64
 3   i            18000 non-null  float64
 4   z            18000 non-null  float64
 5   run          18000 non-null  int64  
 6   camcol       18000 non-null  int64  
 7   field        18000 non-null  int64  
 8   ra           18000 non-null  float64
 9   dec          18000 non-null  float64
 10  object_type  18000 non-null  object 
 11  A            18000 non-null  uint8  
 12  B            18000 non-null  uint8  
 13  C            18000 non-null  uint8  
 14  D            18000 non-null  uint8  
 15  E            18000 non-null  uint8  
 16  F            18000 non-null  uint8  
 17  G            18000 non-null  uint8  
 18  H            18000 non-null  uint8  
 19  I   

In [7]:
X_train = train_set.drop('object_type', axis=1)
y_train = train_set[['object_type']]
X_test = test_set
y_test = pd.DataFrame()

In [8]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state = 42)

clf.fit(X_train, y_train)

  clf.fit(X_train, y_train)


RandomForestClassifier(random_state=42)

In [9]:
y_pred = clf.predict(X_test)

In [10]:
# My predictions
y_pred

array(['STAR', 'GALAXY', 'GALAXY', ..., 'GALAXY', 'STAR', 'STAR'],
      dtype=object)

### Now, I need my predictions to have the same format as the Kaggle one in order to submit them. 

In [11]:
df = {'id': X_test.index, 'object_type': y_pred}

In [12]:
df = pd.DataFrame(df)

In [13]:
df

Unnamed: 0,id,object_type
0,18001,STAR
1,18002,GALAXY
2,18003,GALAXY
3,18004,GALAXY
4,18005,GALAXY
...,...,...
11995,29996,STAR
11996,29997,GALAXY
11997,29998,GALAXY
11998,29999,STAR


In [14]:
df.to_csv("Stan's predictions.csv", index=False)

### This simple random forest gave me an accuracy of : 0.90850 (~91% accuracy)

You can check my `Dataiku work` which gave me a 94% accuracy even quicker.