**Methods/Results**

In [10]:
import pandas as pd
import altair as alt
import numpy as np

url= "https://docs.google.com/spreadsheets/d/e/2PACX-1vROC4kgO6ctTkCjDooBh4Gc_VW7fsUeIgSiPTtcHV0FjFumQclEF8b3ThtxYAJQPyDmRN61OpR4gnpr/pub?output=csv"
pulsar_data = pd.read_csv(url, header= None, names =[
    "integrated_mean",
    "integrated_sd",
    "integrated_xs_kurtosis",
    "integrated_skewness",
    "dmsnr_mean",
    "dmsnr_sd",
    "dmsnr_xs_kurtosis",
    "dmsnr_skewness",
    "class"
],)

pulsar_data["class"]=pulsar_data["class"].replace({
    0: "not pulsar",
    1: "pulsar"
})
pulsar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         17898 non-null  float64
 1   integrated_sd           17898 non-null  float64
 2   integrated_xs_kurtosis  17898 non-null  float64
 3   integrated_skewness     17898 non-null  float64
 4   dmsnr_mean              17898 non-null  float64
 5   dmsnr_sd                17898 non-null  float64
 6   dmsnr_xs_kurtosis       17898 non-null  float64
 7   dmsnr_skewness          17898 non-null  float64
 8   class                   17898 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.2+ MB


*Fig 1: Summary of Original Data from Pulsar Database*

In [11]:
pulsar_data = pulsar_data.drop(columns=["dmsnr_mean", "dmsnr_sd", "dmsnr_xs_kurtosis", "dmsnr_skewness"])
pulsar_data.head(10)

Unnamed: 0,integrated_mean,integrated_sd,integrated_xs_kurtosis,integrated_skewness,class
0,140.5625,55.683782,-0.234571,-0.699648,not pulsar
1,102.507812,58.88243,0.465318,-0.515088,not pulsar
2,103.015625,39.341649,0.323328,1.051164,not pulsar
3,136.75,57.178449,-0.068415,-0.636238,not pulsar
4,88.726562,40.672225,0.600866,1.123492,not pulsar
5,93.570312,46.698114,0.531905,0.416721,not pulsar
6,119.484375,48.765059,0.03146,-0.112168,not pulsar
7,130.382812,39.844056,-0.158323,0.38954,not pulsar
8,107.25,52.627078,0.452688,0.170347,not pulsar
9,107.257812,39.496488,0.465882,1.162877,not pulsar


*Fig 2: First 10 Rows of the Processed Pulsar Data Set*

In [12]:
from sklearn.model_selection import train_test_split
np.random.seed(1)

pulsar_train, pulsar_test = train_test_split(
    pulsar_data, train_size=0.75, stratify=pulsar_data["class"]
)
pulsar_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 2020 to 12740
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         13423 non-null  float64
 1   integrated_sd           13423 non-null  float64
 2   integrated_xs_kurtosis  13423 non-null  float64
 3   integrated_skewness     13423 non-null  float64
 4   class                   13423 non-null  object 
dtypes: float64(4), object(1)
memory usage: 629.2+ KB


*Fig 3: Pulsar Training Data Summary*

**Integrated Data Visualization**

**Building the Classifier**

In [13]:
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

pulsar_preprocessor = make_column_transformer(
 (StandardScaler(), ["integrated_mean", "integrated_sd","integrated_xs_kurtosis", "integrated_skewness"]),
    remainder='passthrough',
    verbose_feature_names_out=False
)

knn = KNeighborsClassifier()
pulsar_pipe = make_pipeline(pulsar_preprocessor, knn)
parameter_grid = {
 "kneighborsclassifier__n_neighbors": range(1, 100, 5),
}

from sklearn.model_selection import GridSearchCV
pulsar_grid = GridSearchCV(
 estimator=pulsar_pipe,
 param_grid=parameter_grid,
 cv=10
)

pulsar_accuracies_grid = pd.DataFrame(
 pulsar_grid.fit(
 pulsar_train[["integrated_mean", "integrated_sd","integrated_xs_kurtosis", "integrated_skewness"]],
 pulsar_train["class"]
 ).cv_results_
)

pulsar_accuracy_vs_k = alt.Chart(pulsar_accuracies_grid).mark_line(point=True).encode(
 x=alt.X("param_kneighborsclassifier__n_neighbors").title("Neighbors"),
 y=alt.Y("mean_test_score")
 .scale(domain=(0.60, 0.90))
 .title("Accuracy estimate")
)
pulsar_accuracy_vs_k

TypeError: 'UndefinedType' object is not callable