Import the dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

Data collection and data processing

In [4]:
dataset = pd.read_csv("./data/sonar_data.csv", header=None)
# Generate column names
num_features = dataset.shape[1] - 1
column_names = [f'A{i+1}' for i in range(num_features)] + ['target']

# Set column names
dataset.columns = column_names
print(dataset.shape)
dataset.head()


(208, 61)


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A52,A53,A54,A55,A56,A57,A58,A59,A60,target
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [5]:
dataset.describe()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A51,A52,A53,A54,A55,A56,A57,A58,A59,A60
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [6]:
dataset['target'].value_counts()

target
M    111
R     97
Name: count, dtype: int64

More the data more accurate the model will be. Here, the data is not imbalanced because value counts of M and R is not that much different

In [7]:
# Mean value of the features for mines and rock
dataset.groupby('target').mean()

Unnamed: 0_level_0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A51,A52,A53,A54,A55,A56,A57,A58,A59,A60
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M,0.034989,0.045544,0.05072,0.064768,0.086715,0.111864,0.128359,0.149832,0.213492,0.251022,...,0.019352,0.016014,0.011643,0.012185,0.009923,0.008914,0.007825,0.00906,0.008695,0.00693
R,0.022498,0.030303,0.035951,0.041447,0.062028,0.096224,0.11418,0.117596,0.137392,0.159325,...,0.012311,0.010453,0.00964,0.009518,0.008567,0.00743,0.007814,0.006677,0.007078,0.006024


In [8]:
# Separating data and labels
X = dataset.drop(columns='target', axis=1)
Y = dataset['target']

In [9]:
# Separate into train and test set
# Stratify = ensures that the proportion of each class in the target
# variable Y is preserved in both the training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.1, stratify=Y, random_state=2
)
print(Y_train.value_counts())
print(Y_test.value_counts())

target
M    100
R     87
Name: count, dtype: int64
target
M    11
R    10
Name: count, dtype: int64


Model training using Logistic Regression as this is binary class data

In [10]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [11]:
Y_pred = model.predict(X_test)
accuracy_score(Y_test,Y_pred)

0.9047619047619048

In [28]:
y_pred_proba = model.predict_proba(X_test)[::,1] 
y_pred_proba.shape
#y_pred_proba[0,0:]

(21,)

Using the model for single instance as a predictive system

In [13]:
input_data = (
    0.0270,
    0.0092,
    0.0145,
    0.0278,
    0.0412,
    0.0757,
    0.1026,
    0.1138,
    0.0794,
    0.1520,
    0.1675,
    0.1370,
    0.1361,
    0.1345,
    0.2144,
    0.5354,
    0.6830,
    0.5600,
    0.3093,
    0.3226,
    0.4430,
    0.5573,
    0.5782,
    0.6173,
    0.8132,
    0.9819,
    0.9823,
    0.9166,
    0.7423,
    0.7736,
    0.8473,
    0.7352,
    0.6671,
    0.6083,
    0.6239,
    0.5972,
    0.5715,
    0.5242,
    0.2924,
    0.1536,
    0.2003,
    0.2031,
    0.2207,
    0.1778,
    0.1353,
    0.1373,
    0.0749,
    0.0472,
    0.0325,
    0.0179,
    0.0045,
    0.0084,
    0.0010,
    0.0018,
    0.0068,
    0.0039,
    0.0120,
    0.0132,
    0.0070,
    0.0088,
)
# change the data to a numpy array
input_data_np = np.asarray(input_data)
input_data_reshaped = input_data_np.reshape(1, -1)

prediction = model.predict(input_data_reshaped)

if prediction == ["R"]:
    print("Rock")
else:
    print("Mine")

Rock


Explain the result using eli5 [explaing like i am 5]

In [14]:
import eli5
# from eli5.sklearn import explain_weights

# Explain the weights of the logistic regression model
#explanation = explain_weights(model, feature_names=column_names[:-1])
# eli5.show_weights(explanation)

eli5.show_weights(model)

ImportError: cannot import name 'if_delegate_has_method' from 'sklearn.utils.metaestimators' (/home/shuvradeb/Machine Learning/ml-projects/.venv/lib/python3.8/site-packages/sklearn/utils/metaestimators.py)