In [None]:
'''Predicting Pulsar Stars'''
#binary classification algorithm

#Load libraries
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, RepeatedStratifiedKFold

#shows visualization in line -> replaces plt.show
%matplotlib inline

In [None]:
#IMPORTING DATA
#reads the pulsar training CSV into a DataFrame (the path is relative to the working directory)
data = pd.read_csv('Datasets/pulsar_star_dataset/pulsar_data_train.csv')


In [None]:
'''DATA TREATMENT'''
#describes the training set shape, null values and data info

print("---------------------------------------------------")
print("Dataset's Shape: ", data.shape)

print("--------------------------------------------------- ")
print("Null Values: ")
print(data.isna().sum())

print("--------------------------------------------------- ")
print("Data Info: ")
#DataFrame.info() prints its own report and returns None,
#so wrapping it in print() would add a stray "None" line to the output
data.info()


In [None]:
#As all columns are relevant datapoints, none are dropped
data.head()


In [None]:
#shows where the NaN values are within the dataset
#white strips represent NaN values in a column

msno.matrix(data)


In [None]:
#drops rows with NaN values
data.dropna(inplace=True)

#prints the data shape to confirm the treated data still has at least 1000 entries
print("---------------------------------------------------")
print("Dataset's Shape: ", data.shape)

print("--------------------------------------------------- ")
print("Null Values: ")
print(data.isna().sum())

print("--------------------------------------------------- ")
print("Data Info: ")
#DataFrame.info() prints its own report and returns None,
#so wrapping it in print() would add a stray "None" line to the output
data.info()


In [None]:
#As all columns are relevant datapoints, none are dropped
data.head()


In [None]:
#shows where the NaN values are within the dataset
#the lack of white strips represents the lack of NaN values

msno.matrix(data)


In [None]:
#As target_class is already binary, there is no need to transform it into categorical values
#gives general information about the data (summary statistics, one row per column)

data.describe().T


In [None]:
#quantifies how many pulsar stars exist in the training set (count of rows per target_class value)
sns.countplot(x=data['target_class'],label="pulsar_star")


In [None]:
#FEATURE SELECTION
#computes the pairwise correlations between the columns of the dataset

data_corr = data.corr()
data_corr.head()


In [None]:
#heatmap analyses the feature correlation
def heatmap(data):
    """Draw a seaborn heatmap of ``data`` on a new matplotlib figure.

    Parameters
    ----------
    data : rectangular dataset accepted by ``sns.heatmap``
        The matrix to plot (in this notebook, a correlation matrix).
    """
    plt.figure()
    #plot the argument, not the global data_corr, so the function shows whatever it is given
    sns.heatmap(data)

heatmap(data_corr)


In [None]:
#checks and eliminates one of each pair of features that have a correlation of .85 or over
#the matrix is recomputed from data so this cell gives the same result when it is re-run
#(data_corr is overwritten with the reduced matrix at the end of the cell)
full_corr = data.corr()
n_columns = full_corr.shape[0]
corr_columns = np.full((n_columns,), True, dtype=bool)

#the target must never be dropped, nor cause a feature to be dropped:
#the model functions use the last column of the frame as the label
is_target = full_corr.columns == 'target_class'

for i in range(n_columns):
    if is_target[i]:
        continue
    for j in range(i+1, n_columns):
        if not is_target[j] and full_corr.iloc[i,j] >= 0.85:
            corr_columns[j] = False

selected_columns = full_corr.columns[corr_columns]
data_f = data[selected_columns]


#To check there are no correlations between features with values over .85
data_corr = data_f.corr()
heatmap(data_corr)


In [None]:
#PCA PLOT
def PCA_Plot(data):
    """Scatter-plot the first two principal components of the features, coloured by class.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column but the last is treated as a feature; the last column
        is treated as the class label.
    """

    #defining variables
    data_X = data.iloc[:,0:-1].values
    #the labels come from the argument (not the global data_f) so they always match data_X
    data_y = pd.DataFrame(data.iloc[:,-1].values, columns=['target_class'])


    #Scale X values to zero mean and unit variance before PCA
    X_std = StandardScaler().fit_transform(data_X)


    #PCA
    #Tripathi, A. (2019) A Complete Guide to Principal Component Analysis – PCA in Machine Learning, Data Science Duniya. 
    #Available at: https://ashutoshtripathi.com/2019/07/11/a-complete-guide-to-principal-component-analysis-pca-in-machine-learning/ (Accessed: 27 April 2021).
    pca = PCA(n_components=2)
    principalComponents = pca.fit_transform(X_std)
    principalDf = pd.DataFrame(data=principalComponents , columns = ['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, data_y], axis = 1)


    #PCA_Plot
    plt.figure()
    plt.xlabel('Principal component 1')
    plt.ylabel('Principal component 2')
    plt.suptitle("Pulsar Stars Prediction")
    labels = ["Not a Pulsar Star","Pulsar Star"]
    #no label= is passed to scatter: the legend below is built from explicit handles and labels
    scatter = plt.scatter(data=finalDf, x="principal component 1", y="principal component 2", c="target_class", cmap='Spectral')
    plt.legend(handles=scatter.legend_elements()[0], labels=labels)


PCA_Plot(data)


In [None]:
def train_test_set(data):
    """Split ``data`` into scaled training and testing arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Every column but the last is treated as a feature; the last column
        is treated as the class label.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with a 70/30 split
        (``random_state=0``). The features are standardised; the labels
        are returned unchanged.
    """

    #defining variables
    data_X = data.iloc[:,0:-1].values
    data_y = data.iloc[:,-1].values

    #split BEFORE scaling so that no statistic of the test rows leaks into training
    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.3, random_state=0)

    #the scaler learns mean/std from the training rows only and is then applied to both sets
    #y_train and y_test are not scaled since their values are already 0 and 1
    X_scaler = StandardScaler().fit(X_train)
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)

    return (X_train, X_test, y_train, y_test)


In [None]:
#accuracy results from models
def accuracy_results(model, X_train, y_train, X_test, y_test, y_test_pred):
    """Print training, testing and cross-validated accuracy of a fitted model.

    Parameters
    ----------
    model : fitted estimator or fitted search object (GridSearchCV / RandomizedSearchCV)
    X_train, y_train : training features and labels
    X_test, y_test : testing features and labels
    y_test_pred : predictions of ``model`` for ``X_test``
    """

    #for a search object, cross-validate the selected estimator only;
    #passing the search itself would repeat the whole hyper-parameter search in every fold
    cv_estimator = getattr(model, 'best_estimator_', model)

    #evaluate a score by cross-validation on the TRAINING data:
    #cross_val_score refits clones of the estimator, so the held-out test set must not be used here
    scores = cross_val_score(cv_estimator, X_train, y_train, cv=5, scoring='accuracy')

    print("--------------------------------------------------- ")
    print("Model training accuracy: ", round(model.score(X_train, y_train), 5))
    print("Model testing accuracy: ", round(model.score(X_test, y_test), 5))
    print("Maximun Scaled accuracy: ", round(accuracy_score(y_test, y_test_pred), 5))
    print("Cross Validation Accuracy: ", round(scores.mean(), 5))
    print("--------------------------------------------------- \n")



In [None]:
#LEARNING MODELS

'''Logistic Regression'''
def logistic_reg(data):
    """Fit a default LogisticRegression, print its accuracies and plot its confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #training the model
    model_LR = LogisticRegression()
    model_LR = model_LR.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_LR.predict(X_test)


    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')

    #prints accuracies results and scores
    accuracy_results(model_LR, X_train, y_train, X_test, y_test, y_test_pred)

    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")


In [None]:
#Logistic Regression with the original data (all features)
logistic_reg(data)

In [None]:
#Logistic Regression with the feature selection data (data_f)
logistic_reg(data_f)

In [None]:
#Logistic Regression with grid search
def logistic_reg_grid_search(data):
    """Tune LogisticRegression with GridSearchCV, print the results and plot the confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #GRID SEARCH

    #Defining parameters
    #strength of the penalty (inverse regularisation strength)
    c_param = [100, 10, 1.0, 0.1, 0.01, 0.001]

    #the grid is a list of dicts so that only valid solver/penalty pairs are tried:
    #newton-cg and lbfgs support the l2 penalty only, liblinear supports both l1 and l2.
    #A single cross-product grid would fit invalid pairs, which fail and are scored as 0.
    param_GS = [
        dict(solver=['newton-cg', 'lbfgs'], penalty=['l2'], C=c_param),
        dict(solver=['liblinear'], penalty=['l1', 'l2'], C=c_param),
    ]

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    model_GS = GridSearchCV(estimator=LogisticRegression(), param_grid=param_GS, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

    #fitting the model
    model_GS = model_GS.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_GS.predict(X_test)

    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')

    #prints accuracies results and scores
    print("--------------------------------------------------- ")
    print("Best Accuracy: ", round(model_GS.best_score_, 5))
    print("Best hyperparameters: ", model_GS.best_params_)
    accuracy_results(model_GS, X_train, y_train, X_test, y_test, y_test_pred)

    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")


In [None]:
#Logistic regression with the original data (all features) and grid search
logistic_reg_grid_search(data)

In [None]:
#Logistic regression with the feature selection data (data_f) and grid search
logistic_reg_grid_search(data_f)

In [None]:
#Logistic Regression with random search
def logistic_reg_random_search(data):
    """Tune LogisticRegression with RandomizedSearchCV, print the results and plot the confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #RANDOM SEARCH

    #Defining parameters
    #strength of the penalty (inverse regularisation strength)
    c_param = [100, 10, 1.0, 0.1, 0.01, 0.001]

    #the distributions are a list of dicts so that only valid solver/penalty pairs are sampled:
    #newton-cg and lbfgs support the l2 penalty only, liblinear supports both l1 and l2.
    #Sampling from a single cross-product would waste draws on invalid pairs scored as 0.
    param_RS = [
        dict(solver=['newton-cg', 'lbfgs'], penalty=['l2'], C=c_param),
        dict(solver=['liblinear'], penalty=['l1', 'l2'], C=c_param),
    ]

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    #random_state fixes which candidates are sampled so the results are reproducible between runs
    model_RS = RandomizedSearchCV(LogisticRegression(), param_RS, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0, random_state=1)

    #fitting the model
    model_RS = model_RS.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_RS.predict(X_test)

    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')

    #prints accuracies results and scores
    print("--------------------------------------------------- ")
    print("Best Accuracy: ", round(model_RS.best_score_, 5))
    print("Best hyperparameters: ", model_RS.best_params_)
    accuracy_results(model_RS, X_train, y_train, X_test, y_test, y_test_pred)

    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")


In [None]:
#Logistic regression with the original data (all features) and random search
logistic_reg_random_search(data)

In [None]:
#Logistic regression with the feature selection data (data_f) and random search
logistic_reg_random_search(data_f)

In [None]:
'''KNeighbours'''
def KN_Neighbors(data):
    """Fit a default KNeighborsClassifier, print its accuracies and plot its confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #training the model
    model_KN = KNeighborsClassifier()
    model_KN = model_KN.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_KN.predict(X_test)


    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')


    #prints accuracies results and scores
    accuracy_results(model_KN, X_train, y_train, X_test, y_test, y_test_pred)


    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")


In [None]:
#KNeighbors with the original data (all features)
KN_Neighbors(data)

In [None]:
#KNeighbors with the feature selection data (data_f)
KN_Neighbors(data_f)

In [None]:
#KNeighbors with grid search
def KN_Neighbors_grid_search(data):
    """Tune KNeighborsClassifier with GridSearchCV, print the results and plot the confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #GRID SEARCH

    #Defining parameters
    n_neighbors = range(1, 31)
    #checks uniform or distance
    weights = ['uniform', 'distance']
    #metrics
    #'euclidean' must be spelled exactly: a misspelled name makes every fit using it fail,
    #and error_score=0 would silently record those candidates with a score of 0
    metric = ['euclidean', 'manhattan', 'minkowski']

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    param_GS = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
    model_GS = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_GS, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)

    #fitting the model
    model_GS = model_GS.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_GS.predict(X_test)

    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')

    #prints accuracies results and scores
    print("--------------------------------------------------- ")
    print("Best Accuracy: ", round(model_GS.best_score_, 5))
    print("Best hyperparameters: ", model_GS.best_params_)
    accuracy_results(model_GS, X_train, y_train, X_test, y_test, y_test_pred)

    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")


In [None]:
#KNeighbors with the original data (all features) and grid search
KN_Neighbors_grid_search(data)

In [None]:
#KNeighbors with the feature selection data (data_f) and grid search
KN_Neighbors_grid_search(data_f)

In [None]:
#KNeighbors with random search
def KN_Neighbors_random_search(data):
    """Tune KNeighborsClassifier with RandomizedSearchCV, print the results and plot the confusion matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Passed to ``train_test_set``; the last column is used as the label.
    """

    #gets X_train, X_test, y_train, y_test from data
    X_train, X_test, y_train, y_test = train_test_set(data)


    #RANDOM SEARCH

    #Defining parameters
    #odd neighbour counts only
    n_neighbors = range(1, 31, 2)
    #checks uniform or distance
    weights = ['uniform', 'distance']
    #metrics
    #'euclidean' must be spelled exactly: a misspelled name makes every fit using it fail,
    #and error_score=0 would silently record those candidates with a score of 0
    metric = ['euclidean', 'manhattan', 'minkowski']

    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    param_RS = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)
    #random_state fixes which candidates are sampled so the results are reproducible between runs
    model_RS = RandomizedSearchCV(KNeighborsClassifier(), param_RS, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0, random_state=1)

    #fitting the model
    model_RS = model_RS.fit(X_train, y_train)
    #only the test predictions are needed below (the unused train-set prediction was removed)
    y_test_pred = model_RS.predict(X_test)

    #Confusion Matrix normalized over all samples
    conf_matrix = confusion_matrix(y_test, y_test_pred, normalize='all')

    #prints accuracies results and scores
    print("--------------------------------------------------- ")
    print("Best Accuracy: ", round(model_RS.best_score_, 5))
    print("Best hyperparameters: ", model_RS.best_params_)
    accuracy_results(model_RS, X_train, y_train, X_test, y_test, y_test_pred)

    #heatmap plot of CONFUSION MATRIX
    #the axes are named ax so the notebook-level heatmap() function is not shadowed
    labels = ['NOT_pulsar_star', 'pulsar_star']
    ax = sns.heatmap(conf_matrix, annot=True)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set(ylabel="Real values", xlabel="Predicted values")



In [None]:
#KNeighbors with the original data (all features) and random search
KN_Neighbors_random_search(data)

In [None]:
#KNeighbors with the feature selection data (data_f) and random search
KN_Neighbors_random_search(data_f)