## Dividing features according to their inherent usefulness

In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Move into ./Desktop only when that folder exists relative to the current
# working directory. An unconditional `%cd "Desktop"` reports
# "[WinError 2] The system cannot find the file specified" when this cell is
# re-run after the working directory has already been changed.
if os.path.isdir("Desktop"):
    os.chdir("Desktop")
dataset = pd.read_csv('dataset.csv')

# The inner square brackets define a Python list with column names.
# The outer brackets are used to select the data from a pandas DataFrame.
useful_features = dataset[["u", "g", 'r', 'i', 'z', 'redshift']]
semi_features = dataset[['ra', 'dec', 'fiberid', 'mjd']]
non_useful_features = dataset[['camcol', 'rerun', 'field', 'specobjid', 'run', 'plate']]

# Fill missing values with each column's median, then bound every value to
# the range [-1e11, 1e11].
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook.
df = useful_features.fillna(useful_features.median()).clip(-1e11, 1e11)
df.head()

[WinError 2] The system cannot find the file specified: 'Desktop'
C:\Users\Farah\Desktop


Unnamed: 0,u,g,r,i,z,redshift
0,19.47406,17.0424,15.94699,15.50342,15.22531,-9e-06
1,18.6628,17.21449,16.67637,16.48922,16.3915,-5.5e-05
2,19.38298,18.19169,17.47428,17.08732,16.80125,0.123111
3,17.76536,16.60272,16.16116,15.98233,15.90438,-0.000111
4,17.55025,16.26342,16.43869,16.55492,16.61326,0.00059


### We will run a randomized hyperparameter search (`RandomizedSearchCV`) on the following combinations of features:
* Useful features only
* Useful + semi-useful features
* All features

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *

In [5]:
from sklearn.preprocessing import StandardScaler

# Encode the target labels as integers: GALAXY -> 0, STAR -> 1, QSO -> -1.
dataset['class'] = dataset['class'].replace({'GALAXY': 0, 'STAR': 1, 'QSO': -1})

# Features are the imputed frame built earlier; the target is the encoded class.
X, y = df, dataset['class']

# Hold out 20% of the rows for testing, keeping the class proportions
# identical in both parts (stratify) and the split repeatable (random_state).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scaler used as the first pipeline step.
scale = StandardScaler()

# Candidate estimators/models; every estimator that accepts a seed gets the same one.
clf1 = RandomForestClassifier(random_state=42)
clf2 = SVC(random_state=42, probability=True)
clf3 = LogisticRegression(random_state=42)
clf4 = DecisionTreeClassifier(random_state=42)
clf5 = KNeighborsClassifier()

In [6]:
# One search space per candidate model. Keys use the pipeline convention
# "<step name>__<parameter>"; the plain "classifier" key swaps the estimator
# placed in the pipeline's "classifier" step.
# NOTE(review): the class_weight dicts list only labels 0 and 1; no weight is
# given for label -1 -- confirm that leaving it at the default is intended.

# Random forest
param1 = {
    'classifier__n_estimators': [10, 50, 100, 250],
    'classifier__max_depth': [5, 10, 20],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf1],
}

# Support vector classifier: C spans 10**-2 .. 10**2
param2 = {
    'classifier__C': [10**exponent for exponent in range(-2, 3)],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf2],
}

# Logistic regression: same C range as the SVC
param3 = {
    'classifier__C': [10**exponent for exponent in range(-2, 3)],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf3],
}

# Decision tree
param4 = {
    'classifier__max_depth': [5, 10, 25, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf4],
}

# k-nearest neighbours
param5 = {
    'classifier__n_neighbors': [2, 5, 10, 25, 50],
    'classifier': [clf5],
}

In [7]:
# Two-step pipeline: scale the features, then classify. clf1 is only the
# initial occupant of the "classifier" step; each search space replaces it
# through its own "classifier" entry.
pipeline = Pipeline(
    steps=[
        ('scale', scale),
        ('classifier', clf1),
    ]
)

# All five search spaces, handed to the search as a list of dicts.
params = [
    param1,
    param2,
    param3,
    param4,
    param5,
]

In [8]:
%%time
gs = RandomizedSearchCV(pipeline, params, cv=3, n_jobs=-1, error_score='raise').fit(X_train, y_train)

Wall time: 1min 25s


In [9]:
# Mean cross-validated score of the best candidate found by the search.
gs.best_score_

0.9886246871054096

In [10]:
# Parameter setting (including the chosen estimator) of the best candidate.
gs.best_params_

{'classifier__n_estimators': 10,
 'classifier__max_depth': 20,
 'classifier__class_weight': {0: 1, 1: 10},
 'classifier': RandomForestClassifier(class_weight={0: 1, 1: 10}, max_depth=20,
                        n_estimators=10, random_state=42)}

In [12]:
# Predict labels for the held-out test split using the fitted search object.
predict= gs.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions against the true test labels.
accuracy_score(y_test, predict)

0.9905

# Useful + semi-useful features

In [14]:
# Semi-useful columns first, followed by the useful ones.
useful_and_semi_features = dataset[
    ['ra', 'dec', 'fiberid', 'mjd', "u", "g", 'r', 'i', 'z', 'redshift']
]

# Impute missing values with the per-column median, then bound every value
# to the range [-1e11, 1e11].
column_medians = useful_and_semi_features.median()
df = useful_and_semi_features.fillna(column_medians).clip(lower=-1e11, upper=1e11)

In [15]:
# Rebuild the train/test split for the current feature frame, with the same
# 80/20 stratified split and seed as before.
X, y = df, dataset['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
%%time
gs = RandomizedSearchCV(pipeline, params, cv=3, n_jobs=-1, error_score='raise').fit(X_train, y_train)

Wall time: 31.3 s


In [17]:
# Mean cross-validated score of the best candidate found by the search.
gs.best_score_

0.9886246402245481

In [18]:
# Parameter setting (including the chosen estimator) of the best candidate.
gs.best_params_

{'classifier__n_estimators': 50,
 'classifier__max_depth': 10,
 'classifier__class_weight': {0: 1, 1: 5},
 'classifier': RandomForestClassifier(class_weight={0: 1, 1: 5}, max_depth=10, n_estimators=50,
                        random_state=42)}

In [19]:
# Predict labels for the held-out test split, then report their accuracy
# against the true test labels.
predict= gs.predict(X_test)
accuracy_score(y_test, predict)

0.9915

# All features

In [20]:
# Every feature group together: semi-useful, useful, then non-useful columns.
all_features = dataset[
    [
        'ra', 'dec', 'fiberid', 'mjd',
        "u", "g", 'r', 'i', 'z', 'redshift',
        'camcol', 'rerun', 'field', 'specobjid', 'run', 'plate',
    ]
]

# Impute missing values with the per-column median, then bound every value
# to the range [-1e11, 1e11].
# NOTE(review): any column whose values exceed 1e11 in magnitude (possibly
# identifier-like columns such as specobjid) is flattened by this clip -- confirm.
column_medians = all_features.median()
df = all_features.fillna(column_medians).clip(lower=-1e11, upper=1e11)

In [21]:
# Rebuild the train/test split for the current feature frame, with the same
# 80/20 stratified split and seed as before.
X, y = df, dataset['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [22]:
%%time
gs = RandomizedSearchCV(pipeline, params, cv=3, n_jobs=-1, error_score='raise').fit(X_train, y_train)

Wall time: 34.8 s


In [23]:
# Mean cross-validated score of the best candidate found by the search.
gs.best_score_

0.9886247339862712

In [24]:
# Parameter setting (including the chosen estimator) of the best candidate.
gs.best_params_

{'classifier__n_estimators': 100,
 'classifier__max_depth': 20,
 'classifier__class_weight': {0: 1, 1: 25},
 'classifier': RandomForestClassifier(class_weight={0: 1, 1: 25}, max_depth=20,
                        random_state=42)}

In [25]:
# Predict labels for the held-out test split, then report their accuracy
# against the true test labels.
predict= gs.predict(X_test)
accuracy_score(y_test, predict)

0.9905