**Methods/Results**

In [18]:
import pandas as pd
import altair as alt
import numpy as np

# Published CSV export of the pulsar candidate table.
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vROC4kgO6ctTkCjDooBh4Gc_VW7fsUeIgSiPTtcHV0FjFumQclEF8b3ThtxYAJQPyDmRN61OpR4gnpr/pub?output=csv"

# The file is read with header=None (its first line is treated as data),
# so the column labels are supplied here, in file order.
pulsar_columns = [
    "integrated_mean",
    "integrated_sd",
    "integrated_xs_kurtosis",
    "integrated_skewness",
    "dmsnr_mean",
    "dmsnr_sd",
    "dmsnr_xs_kurtosis",
    "dmsnr_skewness",
    "class",
]
pulsar_data = pd.read_csv(url, header=None, names=pulsar_columns)

# Recode the numeric class codes into readable labels.
class_labels = {0: "not pulsar", 1: "pulsar"}
pulsar_data["class"] = pulsar_data["class"].replace(class_labels)

# Column dtypes and non-null counts of the freshly loaded frame.
pulsar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         17898 non-null  float64
 1   integrated_sd           17898 non-null  float64
 2   integrated_xs_kurtosis  17898 non-null  float64
 3   integrated_skewness     17898 non-null  float64
 4   dmsnr_mean              17898 non-null  float64
 5   dmsnr_sd                17898 non-null  float64
 6   dmsnr_xs_kurtosis       17898 non-null  float64
 7   dmsnr_skewness          17898 non-null  float64
 8   class                   17898 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.2+ MB


*Fig 1: Summary of Original Data from Pulsar Database*

In [19]:
# Keep only the integrated-profile features and the class label.
# `pulsar_data` is reassigned in place, so on a second run the DM-SNR columns
# are already gone; errors="ignore" lets the cell be re-run without a KeyError
# while producing the same frame on the first run.
pulsar_data = pulsar_data.drop(
    columns=["dmsnr_mean", "dmsnr_sd", "dmsnr_xs_kurtosis", "dmsnr_skewness"],
    errors="ignore",
)
pulsar_data.head(10)

Unnamed: 0,integrated_mean,integrated_sd,integrated_xs_kurtosis,integrated_skewness,class
0,140.5625,55.683782,-0.234571,-0.699648,not pulsar
1,102.507812,58.88243,0.465318,-0.515088,not pulsar
2,103.015625,39.341649,0.323328,1.051164,not pulsar
3,136.75,57.178449,-0.068415,-0.636238,not pulsar
4,88.726562,40.672225,0.600866,1.123492,not pulsar
5,93.570312,46.698114,0.531905,0.416721,not pulsar
6,119.484375,48.765059,0.03146,-0.112168,not pulsar
7,130.382812,39.844056,-0.158323,0.38954,not pulsar
8,107.25,52.627078,0.452688,0.170347,not pulsar
9,107.257812,39.496488,0.465882,1.162877,not pulsar


*Fig 2: First 10 Rows of the Processed Pulsar Data Set*

In [20]:
from sklearn.model_selection import train_test_split

np.random.seed(1)

# 75% / 25% train/test split, stratified on the class label so both parts keep
# the same pulsar / not-pulsar proportions.
# random_state pins the shuffle to this call (same seed value as the global
# seed above), so the split no longer depends on whether anything else drew
# from NumPy's global RNG before this line ran.
pulsar_train, pulsar_test = train_test_split(
    pulsar_data,
    train_size=0.75,
    stratify=pulsar_data["class"],
    random_state=1,
)
pulsar_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 2020 to 12740
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         13423 non-null  float64
 1   integrated_sd           13423 non-null  float64
 2   integrated_xs_kurtosis  13423 non-null  float64
 3   integrated_skewness     13423 non-null  float64
 4   class                   13423 non-null  object 
dtypes: float64(4), object(1)
memory usage: 629.2+ KB


*Fig 3: Pulsar Training Data Summary*

**Integrated Data Visualization**

**Building the Classifier**

In [23]:
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Turn off Altair's cap on the number of rows a chart's data may contain.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [24]:
np.random.seed(1)

# Standardise (via StandardScaler) the four integrated-profile features;
# output feature names are left without a transformer prefix.
integrated_features = [
    "integrated_mean",
    "integrated_sd",
    "integrated_xs_kurtosis",
    "integrated_skewness",
]
pulsar_preprocessor = make_column_transformer(
    (StandardScaler(), integrated_features),
    verbose_feature_names_out=False,
)
pulsar_preprocessor

In [25]:
# Baseline model: K-nearest neighbours with scikit-learn's default settings.
knn = KNeighborsClassifier()

# Training predictors (the four integrated-profile features) and the label.
feature_columns = [
    "integrated_mean",
    "integrated_sd",
    "integrated_xs_kurtosis",
    "integrated_skewness",
]
X = pulsar_train[feature_columns]
y = pulsar_train["class"]

# Chain preprocessing and classifier, then fit on the training data.
# fit() returns the pipeline itself, so pulsar_fit is the fitted pipeline.
pulsar_fit = make_pipeline(pulsar_preprocessor, knn)
pulsar_fit.fit(X, y)
pulsar_fit

In [26]:
# 5-fold cross-validation of the baseline KNN pipeline on the training set.
# X and y are the same feature / label selections from pulsar_train that were
# used to fit the baseline model; training-fold scores are kept alongside the
# validation scores.
pulsar_pipe = make_pipeline(pulsar_preprocessor, knn)
cv_results = cross_validate(pulsar_pipe, X, y, cv=5, return_train_score=True)

# One row per fold: fit/score timings plus test and train scores.
pulsar_vfold_score = pd.DataFrame(cv_results)
pulsar_vfold_score


Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.022282,0.151408,0.976909,0.981561
1,0.022044,0.128102,0.976536,0.981095
2,0.024196,0.211643,0.978771,0.981561
3,0.021898,0.131505,0.978763,0.979886
4,0.025683,0.141112,0.974292,0.981935


In [27]:
# Summarise the folds: per-column mean and standard error of the mean.
summary_stats = ["mean", "sem"]
pulsar_metrics = pulsar_vfold_score.aggregate(summary_stats)
pulsar_metrics

Unnamed: 0,fit_time,score_time,test_score,train_score
mean,0.02322,0.152754,0.977054,0.981208
sem,0.000743,0.015272,0.00083,0.000356


In [28]:
# Fresh (unfitted) pipeline used only for hyperparameter tuning.
pulsar_tune_pipe = make_pipeline(pulsar_preprocessor, KNeighborsClassifier())

# Candidate neighbourhood sizes k = 1..20; the key addresses the
# `n_neighbors` parameter of the pipeline's `kneighborsclassifier` step.
param_grid = dict(kneighborsclassifier__n_neighbors=range(1, 21))

In [29]:
# Grid search over every value in param_grid, scored with 4-fold
# cross-validation. Nothing is fitted until .fit() is called.
knn_tune_grid = GridSearchCV(pulsar_tune_pipe, param_grid, cv=4)
knn_tune_grid

In [30]:
# Run the grid search on the training data. fit() returns the search object
# itself, so knn_model_grid refers to the same (now fitted) object.
knn_tune_grid.fit(X, y)
knn_model_grid = knn_tune_grid

# One row per candidate k, with per-split and mean validation scores.
accuracies_grid = pd.DataFrame.from_dict(knn_model_grid.cv_results_)
accuracies_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.030555,0.011809,0.159356,0.011835,1,{'kneighborsclassifier__n_neighbors': 1},0.968117,0.960667,0.968117,0.964232,0.965283,0.003101,20
1,0.037729,0.0238,0.213973,0.035797,2,{'kneighborsclassifier__n_neighbors': 2},0.97497,0.969309,0.979142,0.969896,0.973329,0.004014,19
2,0.027199,0.010294,0.162238,0.009667,3,{'kneighborsclassifier__n_neighbors': 3},0.976162,0.971692,0.98093,0.971088,0.974968,0.003961,18
3,0.023044,0.003752,0.206848,0.078357,4,{'kneighborsclassifier__n_neighbors': 4},0.977354,0.972884,0.980632,0.97228,0.975788,0.003415,17
4,0.020796,0.000414,0.163691,0.006416,5,{'kneighborsclassifier__n_neighbors': 5},0.977056,0.974076,0.981824,0.97377,0.976682,0.003234,15
5,0.024022,0.003142,0.235139,0.079598,6,{'kneighborsclassifier__n_neighbors': 6},0.977354,0.973182,0.981228,0.973174,0.976235,0.003349,16
6,0.020462,0.000535,0.173263,0.00211,7,{'kneighborsclassifier__n_neighbors': 7},0.977354,0.97497,0.982122,0.973472,0.97698,0.003276,14
7,0.020476,8.7e-05,0.1693,0.003708,8,{'kneighborsclassifier__n_neighbors': 8},0.977056,0.975268,0.983313,0.974069,0.977427,0.003561,13
8,0.020797,0.000375,0.171437,0.004931,9,{'kneighborsclassifier__n_neighbors': 9},0.976758,0.975566,0.983909,0.974963,0.977799,0.003586,10
9,0.020617,0.000268,0.166679,0.001393,10,{'kneighborsclassifier__n_neighbors': 10},0.977652,0.975566,0.983313,0.97377,0.977576,0.003586,12


In [34]:
# Mean cross-validated accuracy as a function of the number of neighbours.
neighbors_axis = alt.X("param_kneighborsclassifier__n_neighbors", title="Neighbors")

# The y-axis is restricted to 0.96-0.98 so differences between k values are visible.
accuracy_axis = alt.Y(
    "mean_test_score",
    scale=alt.Scale(domain=(0.96, 0.98)),
    title="Accuracy estimate",
)

accuracy_versus_k_grid = (
    alt.Chart(accuracies_grid)
    .mark_line(point=True)
    .encode(x=neighbors_axis, y=accuracy_axis)
)
accuracy_versus_k_grid