In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
seed = 7  # random seed passed as random_state to the CV splitters and train/test split below

In [None]:
# Read the sonar CSV directly from GitHub. header=None tells pandas the file
# has no header row, so the columns get integer labels.
path = 'https://raw.githubusercontent.com/rohailkhan/data/main/sonar.csv'
sonar = pd.read_csv(filepath_or_buffer=path, header=None)
sonar.head()

## EDA

In [None]:
# Horizontal bar chart of how often each distinct value occurs in column 60.
class_counts = sonar[60].value_counts()
class_counts.plot.barh()
plt.show()

In [None]:
# Lower-triangle correlation heatmap.
# Only numeric columns are correlated: calling DataFrame.corr() on a frame that
# still contains a non-numeric column raises on pandas >= 2.0 (the label in
# column 60 is presumably a string — confirm against the data).
# The matrix is computed once and reused for both the mask and the plot.
feature_corr = sonar.select_dtypes(include='number').corr()

plt.figure(dpi=120, figsize=(5, 4))
mask = np.triu(feature_corr)  # non-zero entries (upper triangle + diagonal) are hidden
sns.heatmap(feature_corr, mask=mask, fmt=".2f", annot=True, lw=1, cmap='plasma')
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Correlation matrix drawn as an image with a fixed [-1, 1] colour scale.
# Restrict to numeric columns first: DataFrame.corr() raises on pandas >= 2.0
# when a non-numeric column is present (column 60 is presumably the string
# class label — confirm against the data).
numeric_corr = sonar.select_dtypes(include='number').corr()

fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(numeric_corr, vmin=-1, vmax=1, interpolation='none')
fig.colorbar(cax)
fig.set_size_inches(10, 10)
pyplot.show()

In [None]:
# One density curve per column, each in its own panel of an 8x8 grid.
# Tick labels are shrunk to fontsize=1 so they do not crowd the small panels.
density_options = dict(
    subplots=True,
    layout=(8, 8),
    sharex=False,
    legend=False,
    fontsize=1,
    figsize=(12, 12),
)
sonar.plot(kind='density', **density_options)
pyplot.show()

## **Start Machine Learning**

## Split Target and Features

In [None]:
# Features: the first 60 columns as a float array.
feature_columns = sonar.iloc[:, :60]
X = feature_columns.values.astype('float')

# Target: the last column, left in its original dtype.
target_column = sonar.iloc[:, -1]
Y = target_column.values

In [None]:
# One-hot encoding of the target is not needed because we are not using a deep learning model

# **My Plan** 
### Step-1 Evaluate 6 ML algorithms with k-fold cross-validation and choose the best one

### Step-2 Further tune the ML models selected in Step-1
Note: 
1- In the case of SVM, we use standardization and tune the hyperparameters using grid search with k-fold cross-validation.

2- The penalty parameter **C** and the **kernel type** (sigmoid, poly, linear, rbf) need to be tuned in the grid search.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
from sklearn.preprocessing import StandardScaler

# Step-1 Evaluate 6 ML Algorithms 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
# Step-1 candidates: each entry is a (name, pipeline) pair in which a
# StandardScaler feeds one classifier, so scaling is re-fit inside every CV fold.
pipelines = []


ScaledLR = ('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())]))
ScaledLDA = ('ScaledLDA', Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())]))
ScaledKNN = ('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())]))
# The decision tree breaks ties randomly; seed it so its CV scores are
# reproducible like the seeded fold split.
ScaledCART = ('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier(random_state=seed))]))
ScaledNB = ('ScaledNB', Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())]))
ScaledSVC = ('ScaledSVC', Pipeline([('Scaler', StandardScaler()), ('SVC', SVC())]))

# Tuple of all candidates, in the order they are evaluated and plotted.
full_ml_list = ScaledLR, ScaledLDA, ScaledKNN, ScaledCART, ScaledNB, ScaledSVC



In [None]:
# Sanity check: print the length of every entry (each is built as a 2-tuple).
for entry in full_ml_list:
    print(len(entry))

In [None]:
# Rebuild the list from the tuple instead of appending to it, so re-running this
# cell cannot leave duplicate entries in `pipelines` (duplicates would be
# evaluated twice and would no longer match the six tick labels plotted later).
pipelines = list(full_ml_list)

In [None]:
len(pipelines)  # number of (name, pipeline) entries queued for evaluation

In [None]:
from sklearn.model_selection import StratifiedKFold

# 10 shuffled, class-stratified folds; the fixed seed makes the split repeatable.
kfold = StratifiedKFold(
    n_splits=10,
    random_state=seed,
    shuffle=True,
)

In [None]:
# Score every Step-1 pipeline with 10-fold stratified CV on accuracy and print
# "name: mean (std)". The splitter is seeded, so building it once up front
# yields the same folds for every model as rebuilding it per iteration.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results, names = [], []
for name, model in pipelines:
    cv_results = cross_val_score(model, X, Y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
names  # model names, in the same order as the score arrays in `results`

In [None]:
import seaborn as sns

### Plotting Model performance

In [None]:
# Layout experiment: a figure with two empty side-by-side panels, a title on
# the left one and a styled x-axis label on the right one.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 5))
left_panel, right_panel = axes

left_panel.set_title('first chart')
right_panel.set_xlabel('y label size 15', size=15, color='r')
plt.show()

In [None]:
# Layout experiment: same two-panel figure as above, now with the CV scores drawn as box plots.
fig, axes = plt.subplots( 1,2, figsize=(8,5)) # create the figure and a 1x2 array of axes
# No ax= is passed, so seaborn draws on pyplot's current axes.
# NOTE(review): presumably that is the last panel created (axes[1]); pass ax= explicitly if a specific panel is intended.
sns.boxplot(data=results)  # one box per entry in `results`
axes[0].set_title('first chart')
axes[1].set_xlabel('y label size 15',color='r',size=15)
plt.show()

In [None]:
# Box plot of the per-fold accuracies of every Step-1 model.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=results, ax=ax)  # draw on this figure's axes explicitly
ax.set_ylabel('Accuracy', color='r', size=12)
ax.set_ylim(ymin=0.5)
# Tick positions follow the number of evaluated models instead of a hard-coded
# 0..5, so the labels still line up if the model list changes.
plt.xticks(list(range(len(names))), names, color='r', size=11)
plt.show()

### Result of Step-1 
#### KNN and SVC algorithms show the highest accuracy

# **Step-2   Fine Tuning the best ML models (KNN & SVM)**

## **KNN Model tuning**
It has one parameter that needs to be tuned, i.e. the number of neighbors. This will be tuned using a combination of grid search and k-fold cross-validation.

In [None]:
from sklearn.model_selection import cross_val_score , KFold ,GridSearchCV

In [None]:
# Tune the number of neighbours for KNN with a grid search over 10-fold CV.
#
# The scaler is a pipeline step rather than being fitted on all of X up front:
# fitting it on the full data before cross-validation lets the validation rows
# of each fold influence the scaling of its training rows, which biases the
# scores. Inside the pipeline it is re-fit on the training part of every fold,
# matching how the Step-1 pipelines were evaluated.
k_neighbours = [1, 3, 5, 7, 9, 11, 13, 14, 15]

# Grid-search keys address a pipeline step's parameter as "<step>__<parameter>",
# so best_params_ is reported under the key 'KNN__n_neighbors'.
param_dictionary = dict(KNN__n_neighbors=k_neighbours)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)  # K fold cross validation object

knn_pipeline = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
grid = GridSearchCV(estimator=knn_pipeline, param_grid=param_dictionary, cv=kfold)

grid_search_result_knn = grid.fit(X, Y)

In [None]:
# Report the best mean CV score and the parameter setting that produced it.
knn_best_score = grid_search_result_knn.best_score_
knn_best_params = grid_search_result_knn.best_params_
print("Best: %f using %s" % (knn_best_score, knn_best_params))

In [None]:
# Only the best score and parameters above are needed; the results of the other
# candidates can also be displayed. List the fields available in cv_results_.

grid_search_result_knn.cv_results_.keys()

In [None]:
# Print one line per candidate: rank, mean test score, (std) and its parameters.
knn_cv_table = grid_search_result_knn.cv_results_
ranks = knn_cv_table['rank_test_score']
means = knn_cv_table['mean_test_score']
stds = knn_cv_table['std_test_score']
params = knn_cv_table['params']
for rank, mean, stdev, param in zip(ranks, means, stds, params):
    print("#%d %f (%f) with: %r" % (rank, mean, stdev, param))

In [None]:
grid_search_result_knn.cv_results_['rank_test_score']  # rank of each candidate by mean test score

## **SVM Model tuning**

Parameters of SVM are C and kernel. 

In [None]:
# Tune C and the kernel of the SVM with a grid search over 10-fold CV.
#
# The scaler is a pipeline step rather than being fitted on all of X up front:
# fitting it on the full data before cross-validation lets the validation rows
# of each fold influence the scaling of its training rows, which biases the
# scores. Inside the pipeline it is re-fit on the training part of every fold.
c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
kernel_values = ['linear', 'poly', 'rbf', 'sigmoid']

# Grid-search keys address a pipeline step's parameter as "<step>__<parameter>",
# so best_params_ is reported under the keys 'SVC__C' and 'SVC__kernel'.
param_dictionary = dict(SVC__C=c_values, SVC__kernel=kernel_values)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)  # K fold cross validation object

svm_pipeline = Pipeline([('Scaler', StandardScaler()), ('SVC', SVC())])
grid = GridSearchCV(estimator=svm_pipeline, param_grid=param_dictionary, cv=kfold)

grid_search_result_SVM = grid.fit(X, Y)

In [None]:
# Report the best mean CV score and the C/kernel combination that produced it.
svm_best_score = grid_search_result_SVM.best_score_
svm_best_params = grid_search_result_SVM.best_params_
print("Highest accuracy : %f using the %s" % (svm_best_score, svm_best_params))

## **Winner is KNN due to higher accuracy of 0.870476 using {'n_neighbors': 1}**

# Final Step: Using ensembles
## Note: There is no need to use standard scaling with ensembles

In [None]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Ensemble candidates as (name, estimator) pairs: two boosting and two bagging
# methods. All four are randomised, so each gets the shared seed; without it
# their CV scores change on every run even though the fold split is seeded.
ensembles = [
    # Boosting methods
    ('AB', AdaBoostClassifier(random_state=seed)),
    ('GBM', GradientBoostingClassifier(random_state=seed)),
    # Bagging methods
    ('RF', RandomForestClassifier(random_state=seed)),
    ('ET', ExtraTreesClassifier(random_state=seed)),
]

In [None]:
# Score every ensemble with 10-fold CV on accuracy and print "name: mean (std)".
# The splitter is seeded, so building it once up front yields the same folds
# for every model as rebuilding it per iteration.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results, names = [], []
for name, model in ensembles:
    cv_results = cross_val_score(model, X, Y, scoring='accuracy', cv=kfold)
    names.append(name)
    results.append(cv_results)
    PRINT = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(PRINT)

In [None]:
# Box plot of the per-fold accuracies of every ensemble model.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=results, ax=ax)  # draw on this figure's axes explicitly
ax.set_ylabel('Accuracy', color='r', size=12)
ax.set_ylim(ymin=0.6)
# Tick positions follow the number of evaluated models instead of a hard-coded
# 0..3, so the labels still line up if the ensemble list changes.
plt.xticks(list(range(len(names))), names, color='r', size=15)
plt.show()

# **Final Selected Model is GBM**
## Now that the final model has been selected, we split the data into train and test sets to make predictions

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
seed = 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=0.30,
)

In [None]:
# Prepare the model: learn the scaling from the training split only and apply
# it to that same split.
scaler = StandardScaler()
rescaledX = scaler.fit_transform(X_train)

# SVC with C=1.5; no kernel is given, so the default (RBF) is used.
model = SVC(C=1.5)
model.fit(rescaledX, Y_train)

In [None]:
# Evaluate on the test set: scale it with the scaler fitted on the training
# split, predict, then print the confusion matrix followed by the
# per-class classification report.
rescaledValidationX = scaler.transform(X_test)
predictions = model.predict(rescaledValidationX)

test_confusion = confusion_matrix(Y_test, predictions)
test_report = classification_report(Y_test, predictions)
for summary in (test_confusion, test_report):
    print(summary)

In [None]:
print(accuracy_score(Y_test, predictions))  # accuracy of the predictions on the held-out test split

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. ConfusionMatrixDisplay
# has existed alongside it since 0.22 and draws the same figure, so this cell
# works on both old and new versions.
from sklearn.metrics import ConfusionMatrixDisplay

# The matrix rows/columns follow model.classes_, so the tick labels are derived
# from that order instead of being hard-coded in an order that may not match.
# NOTE(review): assumes the class labels are 'R' (rock) and 'M' (mine) — confirm
# against the data; unknown labels are shown unchanged.
label_names = {'R': 'Rocks', 'M': 'Mines'}
test_predictions = model.predict(rescaledValidationX)
test_cm = confusion_matrix(Y_test, test_predictions, labels=model.classes_)

disp = ConfusionMatrixDisplay(
    confusion_matrix=test_cm,
    display_labels=[label_names.get(cls, cls) for cls in model.classes_],
)
disp.plot(cmap=plt.cm.Oranges)
disp.ax_.set_title('Confusion matrix')

# These counts come from the held-out test split, not the training data.
print('Test results: confusion matrix')
print(disp.confusion_matrix)

# **90% accuracy achieved by the RBF-kernel SVM model**