**Methods/Results**

In [18]:
import pandas as pd
import altair as alt
import numpy as np

url= "https://docs.google.com/spreadsheets/d/e/2PACX-1vROC4kgO6ctTkCjDooBh4Gc_VW7fsUeIgSiPTtcHV0FjFumQclEF8b3ThtxYAJQPyDmRN61OpR4gnpr/pub?output=csv"
pulsar_data = pd.read_csv(url, header= None, names =[
    "integrated_mean",
    "integrated_sd",
    "integrated_xs_kurtosis",
    "integrated_skewness",
    "dmsnr_mean",
    "dmsnr_sd",
    "dmsnr_xs_kurtosis",
    "dmsnr_skewness",
    "class"
],)

pulsar_data["class"]=pulsar_data["class"].replace({
    0: "not pulsar",
    1: "pulsar"
})
pulsar_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         17898 non-null  float64
 1   integrated_sd           17898 non-null  float64
 2   integrated_xs_kurtosis  17898 non-null  float64
 3   integrated_skewness     17898 non-null  float64
 4   dmsnr_mean              17898 non-null  float64
 5   dmsnr_sd                17898 non-null  float64
 6   dmsnr_xs_kurtosis       17898 non-null  float64
 7   dmsnr_skewness          17898 non-null  float64
 8   class                   17898 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.2+ MB


*Fig 1: Summary of Original Data from Pulsar Database*

In [19]:
pulsar_data = pulsar_data.drop(columns=["dmsnr_mean", "dmsnr_sd", "dmsnr_xs_kurtosis", "dmsnr_skewness"])
pulsar_data.head(10)

Unnamed: 0,integrated_mean,integrated_sd,integrated_xs_kurtosis,integrated_skewness,class
0,140.5625,55.683782,-0.234571,-0.699648,not pulsar
1,102.507812,58.88243,0.465318,-0.515088,not pulsar
2,103.015625,39.341649,0.323328,1.051164,not pulsar
3,136.75,57.178449,-0.068415,-0.636238,not pulsar
4,88.726562,40.672225,0.600866,1.123492,not pulsar
5,93.570312,46.698114,0.531905,0.416721,not pulsar
6,119.484375,48.765059,0.03146,-0.112168,not pulsar
7,130.382812,39.844056,-0.158323,0.38954,not pulsar
8,107.25,52.627078,0.452688,0.170347,not pulsar
9,107.257812,39.496488,0.465882,1.162877,not pulsar


*Fig 2: First 10 Rows of the Processed Pulsar Data Set*

In [20]:
from sklearn.model_selection import train_test_split
np.random.seed(1)

pulsar_train, pulsar_test = train_test_split(
    pulsar_data, train_size=0.75, stratify=pulsar_data["class"]
)
pulsar_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13423 entries, 2020 to 12740
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   integrated_mean         13423 non-null  float64
 1   integrated_sd           13423 non-null  float64
 2   integrated_xs_kurtosis  13423 non-null  float64
 3   integrated_skewness     13423 non-null  float64
 4   class                   13423 non-null  object 
dtypes: float64(4), object(1)
memory usage: 629.2+ KB


*Fig 3: Pulsar Training Data Summary*

**Integrated Data Visualization**

**Building the Classifier**

In [42]:
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [43]:
np.random.seed(1)

pulsar_preprocessor = make_column_transformer(
    (StandardScaler(), ["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]),
    verbose_feature_names_out=False
)
pulsar_preprocessor

In [44]:
knn = KNeighborsClassifier()

X = pulsar_train[["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]]
y = pulsar_train["class"]

pulsar_fit = make_pipeline(pulsar_preprocessor, knn).fit(X,y)
pulsar_fit

In [26]:
pulsar_pipe = make_pipeline(pulsar_preprocessor, knn)
pulsar_vfold_score = pd.DataFrame(
    cross_validate(
        estimator = pulsar_pipe,
        cv=5,
        X=pulsar_train[["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]],
        y=pulsar_train["class"],
        return_train_score=True
    )
)
pulsar_vfold_score

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.022282,0.151408,0.976909,0.981561
1,0.022044,0.128102,0.976536,0.981095
2,0.024196,0.211643,0.978771,0.981561
3,0.021898,0.131505,0.978763,0.979886
4,0.025683,0.141112,0.974292,0.981935


In [27]:
pulsar_metrics = pulsar_vfold_score.agg(["mean","sem"])
pulsar_metrics

Unnamed: 0,fit_time,score_time,test_score,train_score
mean,0.02322,0.152754,0.977054,0.981208
sem,0.000743,0.015272,0.00083,0.000356


In [28]:
param_grid = {
    "kneighborsclassifier__n_neighbors": range(1,21),
}
pulsar_tune_pipe = make_pipeline(pulsar_preprocessor, KNeighborsClassifier())

In [45]:
knn_tune_grid = GridSearchCV(
    estimator = pulsar_tune_pipe,
    param_grid=param_grid,
    cv=5
)
knn_tune_grid

In [46]:
knn_model_grid = knn_tune_grid.fit(X,y)
accuracies_grid = pd.DataFrame(knn_model_grid.cv_results_)
accuracies_grid

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kneighborsclassifier__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022694,0.000548,0.1154,0.001821,1,{'kneighborsclassifier__n_neighbors': 1},0.96648,0.959404,0.964246,0.967958,0.966095,0.964837,0.002963,20
1,0.021811,0.000413,0.140458,0.035622,2,{'kneighborsclassifier__n_neighbors': 2},0.974674,0.970205,0.976164,0.97541,0.971684,0.973627,0.002289,19
2,0.025372,0.005217,0.15109,0.027508,3,{'kneighborsclassifier__n_neighbors': 3},0.975047,0.974302,0.977281,0.97839,0.972802,0.975564,0.002021,18
3,0.033735,0.023501,0.164807,0.074597,4,{'kneighborsclassifier__n_neighbors': 4},0.976536,0.973929,0.978399,0.977273,0.973547,0.975937,0.001895,17
4,0.024088,0.00296,0.127012,0.00182,5,{'kneighborsclassifier__n_neighbors': 5},0.976909,0.976536,0.978771,0.978763,0.974292,0.977054,0.00166,16
5,0.022532,0.000928,0.167339,0.04893,6,{'kneighborsclassifier__n_neighbors': 6},0.976536,0.975791,0.979143,0.979881,0.975037,0.977278,0.001899,15
6,0.025741,0.003455,0.149703,0.018921,7,{'kneighborsclassifier__n_neighbors': 7},0.977281,0.976536,0.979516,0.980626,0.97541,0.977874,0.001923,10
7,0.031568,0.019769,0.16331,0.033472,8,{'kneighborsclassifier__n_neighbors': 8},0.976164,0.976164,0.979516,0.979881,0.97541,0.977427,0.001879,14
8,0.021563,0.000358,0.13167,0.000613,9,{'kneighborsclassifier__n_neighbors': 9},0.976164,0.975791,0.980633,0.979508,0.975782,0.977576,0.002072,12
9,0.024848,0.005259,0.151087,0.024547,10,{'kneighborsclassifier__n_neighbors': 10},0.976536,0.976164,0.980261,0.980253,0.974665,0.977576,0.002277,13


In [47]:
accuracy_versus_k_grid = alt.Chart(accuracies_grid).mark_line(point=True).encode(
    x=alt.X("param_kneighborsclassifier__n_neighbors", title="Neighbors"),
    y=alt.Y("mean_test_score", scale=alt.Scale(domain=(0.964, 0.9785)), title="Accuracy estimate")
)
accuracy_versus_k_grid

Explanation

Test

In [56]:
knn2 = KNeighborsClassifier(n_neighbors=17)
X = pulsar_train[["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]]
y = pulsar_train["class"]

pulsar_fit2 = make_pipeline(pulsar_preprocessor, knn2).fit(X,y)
pulsar_fit2

In [57]:
pulsar_test_predictions = pulsar_test.assign(
 predicted = pulsar_fit2.predict(pulsar_test[["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]])
)
pulsar_test_predictions[['class']]

Unnamed: 0,class
5642,not pulsar
8983,not pulsar
2409,not pulsar
8402,not pulsar
796,not pulsar
...,...
5443,pulsar
9304,not pulsar
16046,not pulsar
8792,not pulsar


In [63]:
pulsar_acc_1 = pulsar_fit2.score(
    pulsar_test[["integrated_mean", "integrated_sd", "integrated_xs_kurtosis", "integrated_skewness"]],
    pulsar_test["class"]
)
pulsar_acc_1

0.9794413407821229

In [64]:
pd.crosstab(
 pulsar_test_predictions["class"],
 pulsar_test_predictions["predicted"]
)

predicted,not pulsar,pulsar
class,Unnamed: 1_level_1,Unnamed: 2_level_1
not pulsar,4047,18
pulsar,74,336


In [65]:
new_observation = pd.DataFrame({"integrated_mean": [108.92833], "integrated_sd": [38.44983], "integrated_xs_kurtosis": [0.42293], "integrated_skewness": [1.98473],"class":["unknown"]})

In [66]:
prediction = pulsar_fit2.predict(new_observation)
prediction

array(['not pulsar'], dtype=object)