# RESEARCH WORK

In [None]:
!pip install -q kaggle

In [None]:
import os
os.chdir("/kaggle/input/predicting-pulsar-starintermediate")
!ls

# Data loading and viewing

In [None]:
!pip install --upgrade seaborn

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline

# Seed NumPy's global RNG before any stochastic step so a fresh "Run All" is reproducible.
np.random.seed(123)

train = pd.read_csv('pulsar_data_train.csv')
# Keep the preview as the cell's last expression; otherwise the notebook does not render it.
train.head(10)

In [None]:
train.tail(10)

We can see that multiple NaN values exist in 'Standard deviation of the DM-SNR curve' (missing values)

In [None]:
train.describe(include='all') 

In [None]:
train.dtypes

All continuous values

## NULL Detection

In [None]:
sns.heatmap(train.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Report the number of missing entries per column, counted directly from the null mask
# (no need to compare against True or to materialise a filtered frame just to take its length).
for feature in train.columns:
    print('Missing values in feature ' + str(feature) + ' : ' + str(train[feature].isnull().sum()))

3 columns have null values (Roughly 5% or 10%)

In [None]:
train.isnull().sum()

# Feature analysis

In [None]:
# Bar chart of the label counts, with each bar annotated by its share of all rows.
plt.figure(figsize=(10, 8))
total = float(len(train))
ax = sns.countplot(x='target_class', data=train)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Centre the label horizontally on the bar and place it slightly above the top edge.
    x_center = bar.get_x() + bar.get_width() / 2.
    ax.text(x_center, bar_height + 3, f'{bar_height / total:1.2f}', ha="center")

Class imbalance (10:1 almost)

In [None]:
# Count the rows of each class from a boolean mask on the label column.
n_detected = (train['target_class'] == 1).sum()
n_not_detected = (train['target_class'] == 0).sum()
print('No. of instances pulsar stars are detected in dataset is ' + str(n_detected))
print('No. of instances pulsar stars are not detected in dataset is ' + str(n_not_detected))

In [None]:
# Every column except the label, in alphabetical order.
continous_features = sorted(set(train.columns) - {'target_class'})
continous_features

### Histogram Plots

In [None]:
# One row per feature: histogram on the left, box plot on the right.
# squeeze=False keeps `axes` two-dimensional even if there is only a single feature.
fig, axes = plt.subplots(nrows=len(continous_features), ncols=2, figsize=(15, 40), squeeze=False)
for i, feature in enumerate(continous_features):
    # Drop missing values once and reuse them for both plots. No extra plt.figure() here:
    # it would only spawn an empty figure per feature, since both plots target the shared grid.
    values = train[feature].dropna()
    sns.histplot(x=values, ax=axes[i][0])
    sns.boxplot(x=values, ax=axes[i][1])

### Target class based distributions

In [None]:
# For every feature, overlay the histogram of class 0 (red) with that of class 1 (blue).
train_dummy = train.copy()
class_colors = {0: 'red', 1: 'blue'}
for feature in continous_features:
  fig, axs = plt.subplots(figsize=(22, 9))
  for class_value, class_color in class_colors.items():
    class_values = train_dummy[train_dummy['target_class'] == class_value][feature]
    sns.histplot(class_values.dropna(), color=class_color)

  plt.legend([0, 1], loc='upper right', prop={'size': 15})
  plt.show()

We see that most features come from independent distributions; however, they have significant overlap.

### Outlier detection

In [None]:
# One box plot per feature, split by target class, to make outliers visible.
for feature in continous_features:
  plt.figure(figsize=(10, 5))
  sns.boxplot(x='target_class', y=feature, data=train)
  plt.grid()
  plt.show()

## Correlation Heatmap

In [None]:
plt.figure(figsize = (10, 10))
corr_mat = train.corr()
sns.heatmap(corr_mat, xticklabels = corr_mat.columns, yticklabels = corr_mat.columns, annot=True)

# Missing Data Calculation

In [None]:
# Report the number of missing entries per column, counted directly from the null mask
# (no need to compare against True or to materialise a filtered frame just to take its length).
for feature in train.columns:
    print('Missing values in feature ' + str(feature) + ' : ' + str(train[feature].isnull().sum()))

In [None]:
train.isnull().sum()

In [None]:
train.isnull().sum()/len(train) * 100

In [None]:
!pip install scikit-learn

In [None]:
null_data = train[train.isnull().any(axis=1)]

In [None]:
null_data.head()

In [None]:
null_data.tail()

In [None]:
null_data.isnull().sum()/len(train) * 100

In [None]:
len(null_data)

Null in 2 columns

In [None]:
train_temp = train[train[' Excess kurtosis of the integrated profile'].isnull() & 
      train[' Standard deviation of the DM-SNR curve'].isnull()]
train_temp.head()

In [None]:
print(len(train_temp))

In [None]:
train_temp = train[train[' Excess kurtosis of the integrated profile'].isnull() & 
                   train[' Skewness of the DM-SNR curve'].isnull()]
train_temp.head()

In [None]:
print(len(train_temp))

In [None]:
train_temp = train[train[' Skewness of the DM-SNR curve'].isnull() & 
      train[' Standard deviation of the DM-SNR curve'].isnull()]
train_temp.head()

In [None]:
len(train_temp)

In [None]:
train_temp = train[train[' Excess kurtosis of the integrated profile'].isnull() & 
      train[' Standard deviation of the DM-SNR curve'].isnull() & 
      train[' Skewness of the DM-SNR curve'].isnull()]
train_temp.head()

In [None]:
len(train_temp)

Now we impute the missing values using an iterative imputer (using all other features to find the missing values)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge, LinearRegression, SGDRegressor, ARDRegression

In [None]:
null_data

So, we use bayesian ridge estimator to find the missing values from all other available attributes

In [None]:
# ExtraTreesRegressor,BayesianRidge, LinearRegression, SGDRegressor, ARDRegression
# Impute from the feature columns only: passing target_class to the imputer would let the label
# leak into the imputed feature values. random_state pins the posterior sampling so that
# re-running this cell reproduces the same imputed values.
feature_cols = [col for col in train.columns if col != 'target_class']
imputer = IterativeImputer(BayesianRidge(), sample_posterior=True, max_iter=100, verbose=1, random_state=123)
imputed_features = pd.DataFrame(imputer.fit_transform(train[feature_cols]),
                                columns=feature_cols, index=train.index)
# Re-attach the label (as float, matching what the imputer used to return) and restore the
# original column order, which later cells rely on.
# NOTE(review): assumes target_class itself has no missing values - confirm with the null counts.
impute_data = imputed_features.assign(target_class=train['target_class'].astype(float))[train.columns.tolist()]

In [None]:
train[[' Excess kurtosis of the integrated profile',
                        ' Standard deviation of the DM-SNR curve', 
                        ' Skewness of the DM-SNR curve']].describe(include='all') 

In [None]:
impute_data[[' Excess kurtosis of the integrated profile',
                        ' Standard deviation of the DM-SNR curve', 
                        ' Skewness of the DM-SNR curve']].describe(include='all') 

Comparing the mean and std of the train and imputed train datasets, we see they are almost the same. Moreover, our iterative imputer looks at all available features to predict the missing values

In [None]:
impute_data.iloc[list(null_data.index)]

In [None]:
impute_data.isnull().sum()

# Outlier Treatment

In [None]:
# IQR fences on the raw (pre-imputation) data: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# count as outliers. Comparisons against NaN are False, so missing entries are never counted.
Q1 = train.quantile(0.25)
Q3 = train.quantile(0.75)
IQR = Q3 - Q1
lower_range = Q1 - (1.5 * IQR)
upper_range = Q3 + (1.5 * IQR)
print('Number of Outliers (Percentage):')
# Divide by the row count so the output is the percentage announced above,
# consistent with the same computation on the imputed data.
((train < lower_range) | (train > upper_range)).sum() / len(train) * 100

In [None]:
# IQR fences on the imputed data; lower_range / upper_range are the bounds used for capping.
Q1, Q3 = impute_data.quantile(0.25), impute_data.quantile(0.75)
IQR = Q3 - Q1
lower_range, upper_range = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)
print('Number of Outliers (Percentage):')
below_fence = impute_data < lower_range
above_fence = impute_data > upper_range
(below_fence | above_fence).sum() / len(impute_data) * 100

Since there are not many outliers (less than 10 percent for most columns) we can either remove them or cap them.
However, removing them is not advised, so we will cap them using IQR

In [None]:
train_impute_out = impute_data.copy()

In [None]:
# Cap every column except the last one (the label) at its IQR fences.
for col in train_impute_out.columns[:-1]:
  train_impute_out[col] = train_impute_out[col].clip(lower=lower_range[col],
                                                     upper=upper_range[col])

In [None]:
train.describe(include='all')

In [None]:
# Column names in reverse order, used to order the horizontal box plots.
cols = list(train.columns)[::-1]
cols

In [None]:
plt.figure(figsize=(10,8))
plt.title('Base data')
train.boxplot(vert=0, column=cols)
plt.xlim(-200, 1300)

In [None]:
train_impute_out.describe(include='all')

In [None]:
plt.figure(figsize=(10,8))
plt.title('Final data')
train_impute_out.boxplot(vert=0, column=cols)
plt.xlim(-200, 1300)

# Feature selection

Since there are only 8 features, dimensionality reduction is not necessary.

In [None]:
plt.figure(figsize = (10, 10))
corr_mat = train_impute_out.corr()
sns.heatmap(corr_mat, xticklabels = corr_mat.columns, yticklabels = corr_mat.columns, annot=True)

In [None]:
train_final = train_impute_out.drop([' Excess kurtosis of the integrated profile', 
                                     ' Skewness of the DM-SNR curve', 
                                     ' Standard deviation of the DM-SNR curve'], axis=1)

In [None]:
train_final

In [None]:
plt.figure(figsize = (10, 10))
corr_mat = train_final.corr()
sns.heatmap(corr_mat, xticklabels = corr_mat.columns, yticklabels = corr_mat.columns, annot=True)

Splitting into train, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label from the feature matrix; drop() already returns a new frame.
y = train_final['target_class']
x = train_final.drop(columns=['target_class'])

In [None]:
x

In [None]:
y

In [None]:
# 60 / 20 / 20 split into train / validation / test.
# The classes are heavily imbalanced, so stratify both splits on the label to keep the
# pulsar share the same in every subset instead of leaving it to chance.
x_train, x_test1, y_train, y_test1 = train_test_split(x, y, test_size=0.40, random_state=42, stratify=y)
x_val, x_test, y_val, y_test = train_test_split(x_test1, y_test1, test_size=0.50, random_state=42, stratify=y_test1)

In [None]:
# Shapes of the feature splits first, then of the matching label splits.
for split in (x_train, x_val, x_test, y_train, y_val, y_test):
    print(split.shape)


In [None]:
x.describe(include='all')

In [None]:
x_train.describe(include='all')

In [None]:
x_val.describe(include='all')

In [None]:
x_test.describe(include='all')

In [None]:
y.describe(include='all')

In [None]:
y_train.describe(include='all')

In [None]:
y_val.describe(include='all')

In [None]:
y_test.describe(include='all')

The splits are similarly distributed. They can be used to train the model

# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import ExtraTreeClassifier, DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, IsolationForest, StackingClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

!pip install xgboost
from xgboost import XGBClassifier


### Logistic Regression

In [None]:
# Fit a default logistic regression and evaluate it on the validation split.
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred_logistic = model.predict(x_val)
# Compute the accuracy once and reuse it for the printout and the plot title.
logisticAccuracy = accuracy_score(y_val, y_pred_logistic)
print(logisticAccuracy)
cmat = confusion_matrix(y_val, y_pred_logistic)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='Blues')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(logisticAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_logistic))

### Ridge Classifier

In [None]:
# Fit a default ridge classifier and evaluate it on the validation split.
model = RidgeClassifier()
model.fit(x_train, y_train)
y_pred_R = model.predict(x_val)
RAccuracy = accuracy_score(y_val, y_pred_R)
cmat = confusion_matrix(y_val, y_pred_R)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='Greens')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(RAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_R))

### ExtraTree Classifier

In [None]:
# Fit a default single extra-tree classifier and evaluate it on the validation split.
model = ExtraTreeClassifier()
model.fit(x_train, y_train)
y_pred_ET = model.predict(x_val)
ETAccuracy = accuracy_score(y_val, y_pred_ET)
cmat = confusion_matrix(y_val, y_pred_ET)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='Reds')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(ETAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_ET))

### Decision Tree Classifier

In [None]:
# Fit a default decision tree and evaluate it on the validation split.
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred_DT = model.predict(x_val)
DTAccuracy = accuracy_score(y_val, y_pred_DT)
cmat = confusion_matrix(y_val, y_pred_DT)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='cubehelix')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(DTAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_DT))

### Random Forest Classifier

In [None]:
# Fit a default random forest and evaluate it on the validation split.
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred_RF = model.predict(x_val)
RFAccuracy = accuracy_score(y_val, y_pred_RF)
cmat = confusion_matrix(y_val, y_pred_RF)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='Spectral')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(RFAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_RF))

### MLP Classifier

In [None]:
# Fit a default multi-layer perceptron and evaluate it on the validation split.
model = MLPClassifier()
model.fit(x_train, y_train)
y_pred_mlp = model.predict(x_val)
MLPAccuracy = accuracy_score(y_val, y_pred_mlp)
cmat = confusion_matrix(y_val, y_pred_mlp)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='viridis')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(MLPAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_mlp))

### AdaBoost Classifier

In [None]:
# Fit a default AdaBoost classifier and evaluate it on the validation split.
model = AdaBoostClassifier()
model.fit(x_train, y_train)
y_pred_ada = model.predict(x_val)
ADAAccuracy = accuracy_score(y_val, y_pred_ada)
cmat = confusion_matrix(y_val, y_pred_ada)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='YlOrBr')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(ADAAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_ada))

### Bagging Classifier

In [None]:
# Fit a default bagging classifier and evaluate it on the validation split.
model = BaggingClassifier()
model.fit(x_train, y_train)
y_pred_B = model.predict(x_val)
BAccuracy = accuracy_score(y_val, y_pred_B)
cmat = confusion_matrix(y_val, y_pred_B)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='icefire')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(BAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_B))

### XGBoost Classifier

In [None]:
# Fit a default XGBoost classifier and evaluate it on the validation split.
model = XGBClassifier()
model.fit(x_train, y_train)
y_pred_XGB = model.predict(x_val)
XGBAccuracy = accuracy_score(y_val, y_pred_XGB)
cmat = confusion_matrix(y_val, y_pred_XGB)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='mako')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(XGBAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_XGB))

### Naive Bayes Classifier

In [None]:
# Fit a Gaussian naive Bayes classifier and evaluate it on the validation split.
model = GaussianNB()
model.fit(x_train, y_train)
y_pred_NB = model.predict(x_val)
NBAccuracy = accuracy_score(y_val, y_pred_NB)
cmat = confusion_matrix(y_val, y_pred_NB)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='crest')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(NBAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_NB))

### K Nearest Neighbour Classifier

In [None]:
# Fit a default k-nearest-neighbours classifier and evaluate it on the validation split.
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_pred_KNN = model.predict(x_val)
KNNAccuracy = accuracy_score(y_val, y_pred_KNN)
cmat = confusion_matrix(y_val, y_pred_KNN)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='magma')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(KNNAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_KNN))

### Stochastic Gradient Descent Classifier

In [None]:
# Fit a default SGD-trained linear classifier and evaluate it on the validation split.
model = SGDClassifier()
model.fit(x_train, y_train)
y_pred_SGD = model.predict(x_val)
SGDAccuracy = accuracy_score(y_val, y_pred_SGD)
cmat = confusion_matrix(y_val, y_pred_SGD)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='vlag')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(SGDAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_pred_SGD))

# Results

In [None]:
def combine():
    """Blend the validation predictions of all fitted models into one 0/1 label per sample.

    Reads the module-level ``y_pred_*`` prediction sequences and ``*Accuracy`` scores.
    Each model's prediction is weighted by that model's accuracy (the XGBoost
    prediction by 15 times its accuracy); the weighted sum is divided by 26 and the
    sample is labelled 1 when the result reaches 0.15, otherwise 0.

    Returns:
        list: one integer label (0 or 1) per entry of ``y_pred_R``.
    """
    # (weight, predictions) pairs, kept in the order in which the terms are added up.
    weighted_votes = (
        (logisticAccuracy, y_pred_logistic),
        (RAccuracy, y_pred_R),
        (ETAccuracy, y_pred_ET),
        (DTAccuracy, y_pred_DT),
        (RFAccuracy, y_pred_RF),
        (MLPAccuracy, y_pred_mlp),
        (ADAAccuracy, y_pred_ada),
        (BAccuracy, y_pred_B),
        (15 * XGBAccuracy, y_pred_XGB),
        (NBAccuracy, y_pred_NB),
        (KNNAccuracy, y_pred_KNN),
        (SGDAccuracy, y_pred_SGD),
    )
    y_final = []
    for idx in range(len(y_pred_R)):
        score = 0
        for weight, preds in weighted_votes:
            score = score + weight * preds[idx]
        score = score / 26
        y_final.append(1 if score >= 0.15 else 0)
    return y_final

In [None]:
# Evaluate the accuracy-weighted ensemble from combine() on the validation split.
y_final = combine()
FAccuracy = accuracy_score(y_val, y_final)
cmat = confusion_matrix(y_val, y_final)
# plt.figure (not plt.plot, which ignores figsize when given no data) applies the figure size.
plt.figure(figsize=(10, 10))
sns.heatmap(cmat, annot=True, fmt=".3f", linewidths=.5, linecolor='Black', square=True, cmap='Spectral')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy : {0}%'.format(FAccuracy*100)
plt.title(all_sample_title, size = 12)
plt.show()
print(classification_report(y_val, y_final))