In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Objectives**

From the title of the project, it's obvious that the objective is the correct identification of pulsar stars.

This will be a simple prediction and labelling project, with no statistical calculations involved.

**Libraries**

The first step is to load the libraries necessary to view and analyze the data as well as the ones to create the prediction models.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

**Loading and Analysing the Files**

There are two `csv` files:
+ `pulsar_data_test.csv`, with the testing data;
+ `pulsar_data_train.csv`, with the training data;

**Test data**

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv = "../input/predicting-pulsar-starintermediate/pulsar_data_test.csv"
test_set = pd.read_csv(test_csv)
test_set.head()

**Train data**

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv = "../input/predicting-pulsar-starintermediate/pulsar_data_train.csv"
train_set = pd.read_csv(train_csv)
train_set.head()

**Renaming columns**

In order to be able to easily manipulate the dataframes, the columns will be renamed.

In [None]:
# Short, identifier-friendly column names applied to both frames (same order as the CSV columns).
new_names = [
    "Mean_IP", "STD_IP", "EK_IP", "SK_IP",
    "Mean_DMSNR", "STD_DMSNR", "EK_DMSNR", "SK_DMSNR",
    "target_class",
]
for frame in (test_set, train_set):
    frame.columns = new_names

**Missing Data**

The missing data will affect the accuracy of the prediction models, so it's necessary to handle those missing values.

In [None]:
def missing(dataset, max_rows=5370):
    """Summarise and visualise the missing values of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.
    max_rows : int, default 5370
        Upper bound on the number of randomly sampled rows drawn in the
        missingno matrix.

    Returns
    -------
    tuple
        ``(null_counts, matrix)`` where ``null_counts`` is the per-column
        count of null values and ``matrix`` is whatever ``msno.matrix``
        returns for the sampled rows.
    """
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds (sampling without replacement), so cap the request.
    n_rows = min(len(dataset), max_rows)
    return dataset.isnull().sum(), msno.matrix(dataset.sample(n_rows))

**Test set**

In [None]:
# Per-column null counts plus a missingno matrix for the test set.
missing(test_set)

There are a lot of missing values in three variables (the fourth is the classification, which in this dataset is empty), but according to the graph they cannot simply be removed, or the sample size would be greatly reduced.

**Train set**

In [None]:
# Per-column null counts plus a missingno matrix for the training set.
missing(train_set)

In this set there are a lot more missing values compared to the `test_set`.

Now that we have a sense of how the missing values are distributed, and their dimension, it's important to define a method to tackle this issue.

**Data distribution**

In order to better understand whether all of these variables affect the classification of the stars, two machine learning models will be implemented: one in which all of the variables enter the equation, and one where only the variables considered relevant come into play.

To do this, we will first need to divide the `train_set` into two, depending on the target class.

In the `test_set`, since the classification is not defined, the column `target_class` will be eliminated.

In [None]:
# Split the training data by class label so each class can be explored separately.
not_pulsar = train_set[train_set["target_class"] == 0]
pulsar = train_set[train_set["target_class"] == 1]

# Remove the classification column from the test set.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
test_set = test_set.drop(columns=["target_class"], errors="ignore")

**Calculating Statistics**

To get a better sense of the data that affect the prediction of pulsar stars, a statistical analysis will be performed.

Before checking the influence between variables, it's important to see the spread of the data. This will be done using plots to see the means, medians and standard deviations, and to see if there is a considerable number of outliers.

The graphs will be:
+ histograms
+ box plots

**Means, medians and spread**

In [None]:
# data spread function
class Spread:
    """Summary statistics and distribution plots for a single DataFrame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to describe. A column named ``"target_class"``, when present,
        is treated as the label and excluded from the plots.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    @staticmethod
    def _grid_rows(n_plots, n_cols=3):
        """Return the integer number of subplot rows needed for ``n_plots``."""
        # Ceiling division without floats: plt.subplot requires integer
        # row/column counts.
        return max(1, -(-n_plots // n_cols))

    def _feature_columns(self):
        """Return the column names to plot (everything except the label)."""
        return [name for name in self.dataset.columns if name != "target_class"]

    def sum_stats(self):
        """Print the mean, median and standard deviation of every column."""
        #Each column mean
        print("Average:")
        print(self.dataset.mean())
        print("\n")

        #Each column median
        print("Median:")
        print(self.dataset.median())
        print("\n")

        #Each column standard deviation
        print("Standard Deviation:")
        print(self.dataset.std())
        print("\n")

    #histograms
    def hist(self):
        """Draw one histogram per feature column on a three-column grid."""
        df = self.dataset
        features = self._feature_columns()
        plt.figure(figsize = (20, 15))
        rows = self._grid_rows(len(features))
        for position, column in enumerate(features, start=1):
            plt.subplot(rows, 3, position)
            # Missing values carry no bin information; drop them explicitly.
            plt.hist(df[column].dropna())
            plt.title(column)

    #box plots
    def box(self):
        """Draw side-by-side box plots of all feature columns."""
        plt.figure(figsize = (15, 10))
        # errors="ignore" lets this work on frames that have no label column.
        df = self.dataset.drop(columns=["target_class"], errors="ignore")
        chart = sns.boxplot(data = df)
        chart.set_xticklabels(chart.get_xticklabels(), rotation = "vertical")
        plt.title("Box Plots")

        
#overlapping histograms
def over_hist(df1, df2):
    """Overlay the histograms of two frames, one subplot per feature column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Columns are taken from ``df1`` and looked up by
        name in ``df2``, so both must share the same column names. A column
        named ``"target_class"`` is skipped.

    Notes
    -----
    The legend labels the first frame "Pulsar star" and the second
    "Non pulsar star".
    """
    features = [name for name in df1.columns if name != "target_class"]
    n_cols = 3
    # Integer ceiling division: plt.subplot requires integer grid dimensions.
    n_rows = max(1, -(-len(features) // n_cols))

    plt.figure(figsize = (20, 15))
    for position, column in enumerate(features, start=1):
        plt.subplot(n_rows, n_cols, position)
        plt.hist(df1[column].dropna(), alpha = 0.5)
        plt.hist(df2[column].dropna(), alpha = 0.5)
        plt.title(column)
        plt.legend(["Pulsar star", "Non pulsar star"])

**Pulsar set**

In [None]:
# Summary statistics, histograms and box plots for the rows with target_class == 1.
pulsar_stats = Spread(pulsar)
pulsar_stats.sum_stats()
pulsar_stats.hist()
pulsar_stats.box()

**Non pulsar set**

In [None]:
# Summary statistics, histograms and box plots for the rows with target_class == 0.
nonpulsar_stats = Spread(not_pulsar)
nonpulsar_stats.sum_stats()
nonpulsar_stats.hist()
nonpulsar_stats.box()

In [None]:
# Overlay the per-column histograms of both classes to compare their distributions.
over_hist(pulsar, not_pulsar)

As we can see, all of the variables show a considerable difference between pulsar and non-pulsar stars.

To build the prediction model all variables will be considered, which means that only one supervised machine learning model will be built.

Before building the model there are two important tasks to be performed: filling in the missing values, and normalizing the values.

The latter task will allow for a better model, in the sense that it will equalize the weight of all the variables.

**Filling the missing values**

There are several ways to fill the missing values, but the one that seemed the most adequate for this particular dataset is imputation using k-NN. This method uses machine learning to fill the missing values based on other values with the most similar characteristics.

Although it is the best method, it has a disadvantage: it's sensitive to outliers. Although the columns with the missing values have outliers, these do not disperse far from the core of the values, allowing for a better result.

In [None]:
# Fill missing data with k-NN imputation
def fill_miss(dataset, n):
    """Impute the missing values of ``dataset`` with a k-nearest-neighbours imputer.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric frame containing missing values.
    n : int
        Number of neighbours used by ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``dataset`` with a fresh default index. The remaining
        null count per column is printed as a sanity check.
    """
    imputer = KNNImputer(n_neighbors = n)
    imputed = imputer.fit_transform(dataset)
    # Label the result with the input's own columns rather than those of the
    # global `pulsar` frame, so the function works for any frame. Columns that
    # are entirely missing are discarded by the imputer by default, so they
    # are left out of the labels as well.
    kept_columns = dataset.columns[dataset.notna().any().values]
    df = pd.DataFrame(data=imputed, columns=kept_columns)
    print(df.isnull().sum())
    return df

**Train set**

In [None]:
# Impute the training set's missing values using 3 nearest neighbours.
train_set_filled = fill_miss(train_set, 3)

**Test set**

In [None]:
# Impute the test set with its own 3-neighbour imputer, then confirm no nulls remain.
imputer = KNNImputer(n_neighbors=3)
imputed = imputer.fit_transform(test_set)
test_set_filled = pd.DataFrame(data=imputed, columns=test_set.columns)
test_set_filled.isna().sum()

**Normalizing data**

In order to give the variables similar weight in the prediction model, it's necessary to normalize the values.

Analysing the box plots, it's possible to see that the presence of outliers is quite substantial, meaning that min-max normalization is not the most suitable method. On the other hand, the standard deviations range from 0.33 up to 107, meaning that the weight difference will not be greatly reduced.

After considering these facts, the option is to go for the min-max normalization method.

In [None]:
def minmax(dataset):
    """Rescale every column of ``dataset`` to the [0, 1] range (min-max normalization).

    Each column is scaled with its own minimum and maximum, both computed from
    ``dataset`` itself. If a ``"target_class"`` column is present it is
    restored to its original values after scaling, so labels are never altered.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Numeric frame to normalize.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``dataset``.
    """
    lowest = dataset.min()
    span = dataset.max() - lowest
    # A constant column has zero span; dividing by 1 instead maps it to 0.0
    # rather than producing NaN from a 0/0 division.
    span = span.replace(0, 1)

    # Vectorized equivalent of the per-element (x - min) / (max - min) loop;
    # it does not depend on the frame having a 0..n-1 index.
    new_df = (dataset - lowest) / span

    # Detect the label column by name instead of by a hard-coded column count.
    if "target_class" in dataset.columns:
        new_df["target_class"] = dataset["target_class"]

    return new_df

In [None]:
# Rescale the imputed frames. Each frame is scaled with its own min/max, and
# `train_set` / `test_set` are rebound to the normalized copies from here on.
train_set = minmax(train_set_filled)
test_set = minmax(test_set_filled)

**Machine Learning Model**

We come to the last step of this project, the creation of the prediction model.

The model that best fits this case is the K-Nearest Neighbors.

In [None]:
def predt(training_points, training_labels, k, unlabelled):
    """Fit a k-nearest-neighbours classifier and label new points.

    Parameters
    ----------
    training_points, training_labels
        Features and labels used to fit the classifier.
    k : int
        Number of neighbours.
    unlabelled
        Feature rows to classify.

    Returns
    -------
    The predicted label for each row of ``unlabelled``.
    """
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(training_points, training_labels)
    return model.predict(unlabelled)

def bestk(training_points, training_labels, test_points, test_labels):
    """Search odd neighbour counts for the best-scoring k-NN classifier.

    Fits one ``KNeighborsClassifier`` per odd ``k`` in ``1, 3, ..., 99``
    (capped at the number of training rows), plots accuracy against ``k``,
    then refits with the winning ``k`` and prints accuracy, recall, precision
    and F1 on the supplied evaluation split.

    Parameters
    ----------
    training_points, training_labels
        Features and labels used to fit each candidate classifier.
    test_points, test_labels
        Held-out features and labels used to score each candidate.

    Returns
    -------
    int
        The neighbour count with the highest accuracy (the smallest such
        ``k`` when several tie).
    """
    # n_neighbors may not exceed the number of fitted samples, so cap the
    # search range for small training sets.
    k_values = list(range(1, min(100, len(training_points) + 1), 2))

    accuracies = []
    for k in k_values:
        classifier = KNeighborsClassifier(n_neighbors = k)
        classifier.fit(training_points, training_labels)
        accuracies.append(classifier.score(test_points, test_labels))

    # Plot against the real neighbour counts, not the list positions.
    plt.plot(k_values, accuracies)
    plt.xlabel("k")
    plt.ylabel("Score")
    plt.title("Best k")

    # accuracies.index(...) is a position in the list; translate it back to
    # the neighbour count it was measured with (position i <-> k = 2 * i + 1).
    k_best = k_values[accuracies.index(max(accuracies))]

    classifier = KNeighborsClassifier(n_neighbors = k_best)
    classifier.fit(training_points, training_labels)
    guesses = classifier.predict(test_points)
    accuracy = accuracy_score(test_labels, guesses)
    recall = recall_score(test_labels, guesses)
    precision = precision_score(test_labels, guesses)
    f1 = f1_score(test_labels, guesses)

    print("Score:", max(accuracies))
    print("Recall:", recall)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("F1:", f1)
    print("Best k: {bestk}".format(bestk = k_best))

    return k_best

In [None]:
# Hold out 20% of the labelled data (fixed seed) to score candidate values of k.
feature_columns = [
    "Mean_IP", "STD_IP", "EK_IP", "SK_IP",
    "Mean_DMSNR", "STD_DMSNR", "EK_DMSNR", "SK_DMSNR",
]
train_points, test_points, tr_labels, tt_labels = train_test_split(
    train_set[feature_columns],
    train_set["target_class"],
    test_size=0.2,
    random_state=1,
)

best_k = bestk(train_points, tr_labels, test_points, tt_labels)

Using this model the results obtained are very satisfactory, providing a score of 98%.

The best number of neighbors obtained is 9, which will be used to make the predictions in the `test_set`.

**Classifying test set**

In [None]:
# Predict from the feature columns only: this cell adds `target_class` to
# `test_set`, so passing the whole frame would hand the model an extra column
# (and fail) whenever the cell is run a second time.
# NOTE: despite its name, `classifier` holds the predicted labels.
classifier = predt(train_points, tr_labels, best_k, test_set[train_points.columns])
test_set["target_class"] = classifier
test_set