In [30]:
# This is a cleaned up version of my code to teach/present with
# and/or be used as a template for learning

# This allows you to do absolute paths to files This method returns your current working directory
# This can be joined with the relative path to use absolute paths to files 
# (helpful with buggy pickles but we'll get to that later)
import os
here = os.getcwd()

# I use pandas dataframes to store my data.abs
# Just think of dataframes as a really convenient way to store 2D arrays
# These dataframes give us helpful methods to manipulate data as well

# as pd lets us abbriviate so we don't have to type pandas everytime we use one of its methods
import pandas as pd 

# This opens up the csv file HTRU_2.csv and saves it into a dataframe that we can use to manipulate
# this dataframe can be downloaded from the following link: 
# https://www.kaggle.com/charitarth/pulsar-dataset-htru2
# The folks that created this dataset requested if we use it in work to cite the following:
"""
R. J. Lyon, B. W. Stappers, S. Cooper, J. M. Brooke, J. D. Knowles, Fifty Years of Pulsar
	Candidate Selection: From simple filters to a new principled real-time classification approach
	MNRAS, 2016.
"""
data_frame = pd.read_csv(os.path.join(here, 'HTRU_2.csv'))

# .head() lets us look at the structure of the dataframe and the first 5 rows
data_frame.head() 

Unnamed: 0,140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [31]:
# Notice that there are no column names 
# Pandas just used the first line of data as the column headers. That's no good

# We need to add meaningful headers that explain what each column is representing
data_frame.columns =['Mean of Int. Prof.', 'Stand. Deviation of Int. Prof.', 
                     'Excess Kurtosis of Int. Prof.', 'Skewness of Int. Prof.',
                     'Mean of Curve', ' Stand. Deviation of Curve', 'Excess Kurtosis of Curve',
                     'Skewness of Curve', 'Class']

# Now that we added column headers lets look at the header
data_frame.head()

Unnamed: 0,Mean of Int. Prof.,Stand. Deviation of Int. Prof.,Excess Kurtosis of Int. Prof.,Skewness of Int. Prof.,Mean of Curve,Stand. Deviation of Curve,Excess Kurtosis of Curve,Skewness of Curve,Class
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [32]:
# This looks much better! 

# Now, we need to make sure our dataset is clean so it can make a propper model
# We need to make sure that there are no duplicates and that there is no missing data
# Pandas has handy methods to help us check real quick

# First let's make sure there are no duplicates in our data
# Let's check the shape of the current dataframe
data_frame.shape

(17897, 9)

In [33]:
# This means there are 9 columns and 17,897 Columns
# Now let's drop all duplicate rows and see if the shape changes
data_frame.drop_duplicates()
data_frame.shape

(17897, 9)

In [34]:
# Same shape, that means there were no duplicate rows in the original dataset
# If you have a dataset with duplicates, after running drop_duplicates() 
# the dataframe will be smaller, but you can use the smaller dataframe to train your model

# Now, let's check if there are any fields with missing data

# the isnull() method indicates wheter or not values are missing
# .sum() counts the number of times in the dataframe that isnull() is true
# So basically this line of code counts the number of times there is missing data in our dataframe
data_frame.isnull().sum() 

Mean of Int. Prof.                0
Stand. Deviation of Int. Prof.    0
Excess Kurtosis of Int. Prof.     0
Skewness of Int. Prof.            0
Mean of Curve                     0
 Stand. Deviation of Curve        0
Excess Kurtosis of Curve          0
Skewness of Curve                 0
Class                             0
dtype: int64

In [50]:
# There is no missing data in any of the columns. YAY!
# If you run into a dataset that does have missing data
# you can use the pandas method .dropnull() or dropna()

# Let's look at some other useful statistics on our data

# Let's see how many pulsars vs non-pulsars there are in the dataframe

# To break down what is done in these 2 lines
# data_frame[data_frame.Class == 0] returns
# a dataframe including all of the rows with a class 0
# len simply returns the length of such a dataframe

# a class of 0 is non pulsar and 1 is pulsar
print('Number of Non-Pulsars: ' + str(len(data_frame[data_frame.Class == 0])))
print('Number of Pulsars:     ' + str(len(data_frame[data_frame.Class == 1])))
print('Ratio of Pulsars: ' + str(1639/(16258+1639)))

Number of Non-Pulsars: 16258
Number of Pulsars:     1639
Ratio of Pulsars: 0.09157959434542103


In [20]:
# There is an imbalance here. We need to be carefull that our model
# does not bias non_pulsars too much
# we will likely see more missed predictions of pulsars becasue of this

# Since we know our dataframe is clean data, we can now start splitting it
# up so we can start training Machine Learning Models off of it

# We need to separate our dataframe into input and output
# In our case, columns 1-8 are inputs and the output is the last column (Class)

# So, we need to put the first 8 columns into a dataframe and the last one into another dataframe
# There may be a better way to do this, but my approach was to make 2 copies of the dataframe
# and then drop the unwanted columns

# x will represent the input dataframe and y will represent the output dataframe

# axis 1 represents the columns (obviously axis 0 will represent the rows)
# inplace determines whether or not what is returned is a modified copy or 
# if the operation is done on the original dataframe
# since we want to save a modified copy we set inplace=False
x = data_frame.drop(['Class'], axis=1, inplace=False)

# Now let's do the same thing for the outputs
y = data_frame.drop(['Mean of Int. Prof.', 'Stand. Deviation of Int. Prof.', 
                     'Excess Kurtosis of Int. Prof.', 'Skewness of Int. Prof.',
                     'Mean of Curve', ' Stand. Deviation of Curve', 'Excess Kurtosis of Curve',
                     'Skewness of Curve'], axis=1, inplace=False)
# Let's look at the shape and see if they match our expectations
print(x.shape)
print(y.shape)

(17897, 8)
(17897, 1)


In [18]:
# Let's look at the heads to see what these operations did
x.head()

Unnamed: 0,Mean of Int. Prof.,Stand. Deviation of Int. Prof.,Excess Kurtosis of Int. Prof.,Skewness of Int. Prof.,Mean of Curve,Stand. Deviation of Curve,Excess Kurtosis of Curve,Skewness of Curve
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004


In [19]:
y.head()

Unnamed: 0,Class
0,0
1,0
2,0
3,0
4,0


In [26]:
# As expected now x contains all of the inputs and y contains all of the outputs

# The next step is to separate our data into a training set and a testing set
# this way we can train our model and then test it on new data it hasn't seen

# sklearn has a really nice method that does this for us in 1 line
from sklearn.model_selection import train_test_split

# This method returns 4 datasets 2 inputs and 2 outputs
# The variable test_size determines what percentage of the dataframe
# is used for test. In this case we used 20% for testing
# which leaves 80% for training
# random_state is just the way that it shuffles data before splitting it up
# The documentation said 42 is common so that is the only reason I chose it

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state=42)

# Let's look at the size of the different dataframes to see what the function did
print((x_train.size)//8) # divide by 8 because there are 8 columns (// is integer division in python)
print(y_train.size)
print((x_test.size)//8) # divide by 8 because there are 8 columns
print(y_test.size)

14317
14317
3580
3580


In [49]:
# let's make sure the test and train sets contain a similar ratio of pulsars to non pulsars
non_pulsar_train = len(y_train[y_train.Class == 0])
pulsar_train = len(y_train[y_train.Class == 1])
percent_pusar_train = pulsar_train/(non_pulsar_train+pulsar_train)

non_pulsar_test = len(y_test[y_test.Class == 0])
pulsar_test = len(y_test[y_test.Class == 1])
percent_pusar_test = pulsar_test/(non_pulsar_test+pulsar_test)

print("TRAINING SET STATS:")
print('Number of Non-Pulsars=' + str(non_pulsar_train))
print('Number of Pulsars=' + str(pulsar_train))
print('Ratio of Pulsars=' + str(percent_pusar_train))
print('\nTESTING SET STATS:')
print('Number of Non-Pulsars=' + str(non_pulsar_test))
print('Number of Pulsars=' + str(pulsar_test))
print('Ratio of Pulsars=' + str(percent_pusar_test))

TRAINING SET STATS:
Number of Non-Pulsars=12999
Number of Pulsars=1318
Ratio of Pulsars=0.09205839212125445

TESTING SET STATS:
Number of Non-Pulsars=3259
Number of Pulsars=321
Ratio of Pulsars=0.08966480446927375


In [52]:
# Both have a good distribution very close to each other as well
# as the overall original's distribution

# Now that we have input and output can start training our models

# We will use models from the scikit learn library
# scikit learn also offers helpful metrics that will help us visualize the performance of our model

# acuracy score tells us the percentage of correct predictions our model made
from sklearn.metrics import accuracy_score

# Confusion matrix helps us visullize number of guesses that were right/wrong in each category
from sklearn.metrics import confusion_matrix

# f1 score summarizes the accuracy of true positives/ false positives/ true negatives/ false negatives
# basically a percentage version of the confusion matrix
from sklearn.metrics import f1_score

In [53]:
# there are a ton of different modles we can use from sklearn
# We will only focus on neural networks for this notebook
# The process is pretty much the same for all of them, you just have to 
# reference the online documentation to see what kinds of parameters are availible to you

# import the model from sklearn
# The documentation for MLPClassifier can be found here
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
from sklearn.neural_network import MLPClassifier

# create the model with whatever parameters you want to try
# for now I will just set the number and size of the hidden layers created
# If you want to mess with other parameters, reference the documentation
# This is 3 hidden layers of 10 hidden nodes each
neural_network = MLPClassifier(hidden_layer_sizes=(10,10,10))


# This line trains the model using the input and output of the training sets we separated before
neural_network.fit(x_train, y_train)

# This line uses the model we just created to predict the outputs of the test set
y_predict = neural_network.predict(x_test)

# now we can use the predicted outputs and the actual outputs to see how our model did
# we will look at the performance with the sklearn metrics we imported before
print("Accuracy Score:", end = " ")
print(accuracy_score(y_test, y_predict))
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_predict))
print("F1 Score:", end =" ")
print(f1_score(y_test, y_predict, average=None))

  return f(*args, **kwargs)
Accuracy Score: 0.9782122905027933
Confusion Matrix: 
[[3232   27]
 [  51  270]]
F1 Score: [0.98807704 0.87378641]


In [60]:
# 97% accuracy is pretty good

# we can try other parameters to see if they do better
# It is almost impossible to try all combinations by hand
# there is a helpful method that lets us try different models 

# Grid search
from sklearn.model_selection import GridSearchCV

# Here we are comparing the 4 different values of activation and 3 different values for solver
# Gridsearch will compare all possible combinations of these and return the best
# The parameters are the model you want to use, a list of parameters you want to check, 
# cv and others you can look into
# cv=5 basically splits the test data into 5 different tests and then it takes the average to rank them
grid_search = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10,10,10)),{
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam']
}, cv=5, return_train_score=False)

grid_search.fit(x_train, y_train)


  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as sh

GridSearchCV(cv=5, estimator=MLPClassifier(hidden_layer_sizes=(10, 10, 10)),
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'solver': ['lbfgs', 'sgd', 'adam']})

In [61]:
# Easier to look at results if you save them to a dataframe (so we can manipulate the data with pandas mehtods)
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.380841,0.234044,0.002592,0.000489,identity,lbfgs,"{'activation': 'identity', 'solver': 'lbfgs'}",0.980098,0.976955,0.980091,0.974502,0.979043,0.978138,0.002149,1
1,1.472089,1.280269,0.003067,0.000138,identity,sgd,"{'activation': 'identity', 'solver': 'sgd'}",0.978003,0.967877,0.974852,0.908138,0.973454,0.960465,0.026368,11
2,1.068559,0.42084,0.002968,0.000574,identity,adam,"{'activation': 'identity', 'solver': 'adam'}",0.979399,0.973813,0.972756,0.973105,0.967866,0.973388,0.003669,8
3,2.859383,0.270557,0.003969,0.000112,logistic,lbfgs,"{'activation': 'logistic', 'solver': 'lbfgs'}",0.977654,0.915154,0.976947,0.97066,0.982187,0.96452,0.024955,10
4,1.513965,1.687011,0.003262,0.000573,logistic,sgd,"{'activation': 'logistic', 'solver': 'sgd'}",0.907821,0.907821,0.908138,0.908138,0.907789,0.907942,0.000161,12


In [65]:
# this gives us a bunch of helpful data
# We can sort based on the rank of the different tests so it is in order of best to worst combinations
# by tells us which column value we want to sort by
# axis tells us which we are sorting (we want to sort the rows)
# ascending is self explanitory
grid_search_results = grid_search_results.sort_values(by=['rank_test_score'], axis=0, ascending=True)
grid_search_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.380841,0.234044,0.002592,0.000489,identity,lbfgs,"{'activation': 'identity', 'solver': 'lbfgs'}",0.980098,0.976955,0.980091,0.974502,0.979043,0.978138,0.002149,1
6,3.210226,0.232432,0.004405,0.00047,tanh,lbfgs,"{'activation': 'tanh', 'solver': 'lbfgs'}",0.978352,0.974162,0.976947,0.974502,0.98044,0.976881,0.002362,2
5,4.49372,0.859173,0.003664,0.000376,logistic,adam,"{'activation': 'logistic', 'solver': 'adam'}",0.980447,0.973115,0.979392,0.973454,0.977646,0.976811,0.003017,3
11,1.988195,0.994426,0.002858,0.000461,relu,adam,"{'activation': 'relu', 'solver': 'adam'}",0.97905,0.972416,0.974153,0.974502,0.975899,0.975204,0.00222,4
8,3.262109,0.680183,0.004001,2.8e-05,tanh,adam,"{'activation': 'tanh', 'solver': 'adam'}",0.979749,0.97102,0.974852,0.975201,0.972756,0.974715,0.002936,5


In [68]:
# if you wanted to save this dataframe use the following
# you could use this to do further study on the results
grid_search_results.to_csv('tutorialGridSearch.csv', index=False)

In [69]:
# The last thing I want to look at in this tutorial is the library pickle
# This allows you to save trained models to your hard drive
# you can use this to save the best models that could be used for further use
import pickle

# this is just creating a file to write to and dumping the pickle
with open(os.path.join(here, 'tutorialGridsearch.pkl'), 'wb') as f:
    pickle.dump(neural_network, f)

In [70]:
# to load a pickle do the following
with open(os.path.join(here, 'tutorialGridsearch.pkl'), 'rb') as f:
    pickled_model = pickle.load(f)

In [71]:
# so directly from loading we can use it to predict again
y_predict = pickled_model.predict(x_test)

accuracy_score(y_test, y_predict)

0.9782122905027933