# Introduction
#### Dataset created using satellite images of the ocean to identify oil spills.
https://machinelearningmastery.com/imbalanced-classification-model-to-detect-oil-spills/

In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
# Load the oil-spill data. header=None tells pandas not to treat the first
# row as column names, so the columns are labelled with integers instead.
df = pd.read_csv(
    "/kaggle/input/oil-spill/oil-spill.csv",
    header=None,
)

In [None]:
# Preview the first few rows to sanity-check the load.
df.head()

# View Class Imbalance

In [None]:
# Share of rows per class: count each distinct value of the label column
# (position 49) and divide by the total number of rows.
label_counts = df.iloc[:, 49].value_counts()
label_counts / df.shape[0]

#### This dataset is a candidate for imbalanced classification techniques since more than 80% of the samples belong to the majority class.

In [None]:
# How often each distinct value occurs in the first column.
first_column = df.iloc[:, 0]
first_column.value_counts()

In [None]:
# Total number of missing cells: per-column counts first, then their sum.
missing_per_column = df.isna().sum()
missing_per_column.sum()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set with ONE stratified, seeded split.
# The classes are imbalanced, so stratifying on the label column (49) keeps
# the class ratio the same in both parts, and the fixed seed makes the split
# (and every score computed from it) repeatable between runs.
train, val = train_test_split(df, stratify=df[49], random_state=1)

# Features / target used by the cross-validation experiments.
X, y = train.drop(49, axis=1), train[49]

# Derive the X_*/y_* names from the same split instead of splitting the
# full frame a second time: a second independent split would put rows that
# were used for cross-validation into the "test" part.
X_train, y_train = train.drop(49, axis=1), train[49]
X_test, y_test = val.drop(49, axis=1), val[49]

# Dummy Classifier
#### Geometric mean or G-mean combines Sensitivity and Specificity 
#### Since the uniform strategy generates predictions uniformly at random, we expect a G-mean close to 0.5 

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier

# Chance-level reference: a classifier that predicts labels uniformly at
# random, scored with G-mean under repeated stratified 10-fold CV.
# The classifier is seeded like the CV splitter; without a seed the printed
# baseline would change on every run.
model = DummyClassifier(strategy='uniform', random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)
scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)

# Average over all folds and repeats.
print(scores.mean())

# Model Comparison Baseline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

In [None]:
# Baseline: score each untuned classifier on the raw features with G-mean
# under repeated stratified 10-fold cross-validation (3 repeats).
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

metric = make_scorer(geometric_mean_score)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

names, mean_scores = [], []
for name, model in models:
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names += [name]
    mean_scores += [scores.mean().round(3)]

# One row per model, one column for this experiment.
df1 = pd.DataFrame({'Baseline': mean_scores}, index=names)
df1

# Model Comparison with StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Same three classifiers as the baseline, but each one is preceded by a
# StandardScaler inside a pipeline, so the whole pipeline is what gets
# cross-validated.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

scaler = StandardScaler()
metric = make_scorer(geometric_mean_score)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

names, mean_scores = [], []
for name, model in models:
    pipe = Pipeline(steps=[('scaler', scaler), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names += [name]
    mean_scores += [scores.mean().round(3)]

# Append this experiment as a new column next to the baseline results.
df2 = pd.DataFrame({'StandardScaler': mean_scores}, index=names)
df3 = df1.join(df2)
df3

# Model Comparison with StandardScaler and RandomOverSampler

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# Scale the features, randomly oversample, then fit each classifier.
# Scores are G-mean under repeated stratified 10-fold cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

scaler = StandardScaler()
# Seed the sampler like the CV splitter: an unseeded RandomOverSampler draws
# different duplicates on every run, so the reported scores would not be
# reproducible.
ros = RandomOverSampler(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    # Pipeline here is the imblearn one (imported above), which accepts a
    # sampler as an intermediate step.
    pipe = Pipeline([('scaler', scaler), ('ros', ros), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Append this experiment as a new column of the running results table.
df4 = pd.DataFrame({'StandScaler+RandomOverSampler': mean_scores}, index=names)
df5 = df3.join(df4)
df5

# Balanced Model Comparison
#### A new set of classifiers which take class_weight as a parameter

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Cost-sensitive variants: every classifier uses class_weight='balanced'
# instead of resampling the data. Scores are G-mean under repeated
# stratified 10-fold cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear', class_weight='balanced')),
    # Seed the tree: an unseeded DecisionTreeClassifier can grow a different
    # tree on each run, which makes the CART score non-reproducible.
    ('CART', DecisionTreeClassifier(class_weight='balanced', random_state=1)),
    ('SVM', SVC(gamma='scale', class_weight='balanced')),
]

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Outer join: CART and SVM are new rows that earlier experiments lack.
df6 = pd.DataFrame({'Balanced': mean_scores}, index=names)
df7 = df5.join(df6, how='outer')
df7

# Balanced Model Comparison with StandardScaler

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Cost-sensitive classifiers (class_weight='balanced') preceded by a
# StandardScaler. Scores are G-mean under repeated stratified 10-fold
# cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear', class_weight='balanced')),
    # Seed the tree: an unseeded DecisionTreeClassifier can grow a different
    # tree on each run, which makes the CART score non-reproducible.
    ('CART', DecisionTreeClassifier(class_weight='balanced', random_state=1)),
    ('SVM', SVC(gamma='scale', class_weight='balanced')),
]

scaler = StandardScaler()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    pipe = Pipeline([('scaler', scaler), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Outer join keeps the rows of every model seen so far.
df8 = pd.DataFrame({'Balanced+StandardScaler': mean_scores}, index=names)
df9 = df7.join(df8, how='outer')
df9

# Model Comparison with StandardScaler, RandomOverSampler, and PowerTransformer

In [None]:
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

# Scale, power-transform, randomly oversample, then fit each classifier.
# Scores are G-mean under repeated stratified 10-fold cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

scaler = StandardScaler()
# Seed the sampler like the CV splitter: an unseeded RandomOverSampler draws
# different duplicates on every run, so the reported scores would not be
# reproducible.
ros = RandomOverSampler(random_state=1)
pt = PowerTransformer()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    pipe = Pipeline([('scaler', scaler), ('pt', pt), ('ros', ros), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Append this experiment as a new column of the running results table.
df10 = pd.DataFrame(
    {'StandScaler+RandomOverSampler+PowerTransformer': mean_scores},
    index=names,
)
df11 = df9.join(df10)
df11


# Balanced Model Comparison with StandardScaler and PowerTransformer

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Cost-sensitive classifiers (class_weight='balanced') preceded by a
# StandardScaler and a PowerTransformer. Scores are G-mean under repeated
# stratified 10-fold cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear', class_weight='balanced')),
    # Seed the tree: an unseeded DecisionTreeClassifier can grow a different
    # tree on each run, which makes the CART score non-reproducible.
    ('CART', DecisionTreeClassifier(class_weight='balanced', random_state=1)),
    ('SVM', SVC(gamma='scale', class_weight='balanced')),
]

scaler = StandardScaler()
pt = PowerTransformer()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    pipe = Pipeline([('scaler', scaler), ('pt', pt), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Outer join keeps the rows of every model seen so far.
df12 = pd.DataFrame(
    {'Balanced+StandardScaler+PowerTransformer': mean_scores},
    index=names,
)
df13 = df11.join(df12, how='outer')
df13

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Score every feature against the target with the ANOVA F-test (k='all'
# keeps all of them) and draw the scores as a horizontal bar chart,
# sorted in ascending order.
test = SelectKBest(score_func=f_classif, k='all')
fit = test.fit(X, y)
score_table = pd.DataFrame(fit.scores_, index=X.columns)
score_table.sort_values(by=0).plot(kind='barh', figsize=(12, 12))

# Model Comparison Baseline with Feature Selection

In [None]:
# Baseline classifiers again, this time fed only the 10 features that
# SelectKBest ranks highest by ANOVA F-score. Selection is a pipeline step,
# so the whole pipeline is what gets cross-validated.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

skb = SelectKBest(score_func=f_classif, k=10)
metric = make_scorer(geometric_mean_score)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

names, mean_scores = [], []
for name, model in models:
    pipe = Pipeline(steps=[('skb', skb), ('model', model)])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names += [name]
    mean_scores += [scores.mean().round(3)]

# Outer join keeps the rows of every model seen so far.
df14 = pd.DataFrame({'Baseline+SKB': mean_scores}, index=names)
df15 = df13.join(df14, how='outer')
df15

# Model Comparison with StandardScaler, RandomOverSampler, PowerTransformer, and SKB

In [None]:
# Full preprocessing chain: scale, power-transform, randomly oversample,
# keep the 6 best features by ANOVA F-score, then fit each classifier.
# Scores are G-mean under repeated stratified 10-fold cross-validation.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
]

scaler = StandardScaler()
# Seed the sampler like the CV splitter: an unseeded RandomOverSampler draws
# different duplicates on every run, so the reported scores would not be
# reproducible.
ros = RandomOverSampler(random_state=1)
pt = PowerTransformer()
skb = SelectKBest(score_func=f_classif, k=6)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
metric = make_scorer(geometric_mean_score)

names = []
mean_scores = []
for name, model in models:
    pipe = Pipeline([
        ('scaler', scaler),
        ('pt', pt),
        ('ros', ros),
        ('skb', skb),
        ('model', model),
    ])
    scores = cross_val_score(pipe, X, y, scoring=metric, cv=cv, n_jobs=-1)
    names.append(name)
    mean_scores.append(scores.mean().round(3))

# Outer join keeps the rows of every model seen so far.
df16 = pd.DataFrame(
    {'StandScaler+RandomOverSampler+PowerTransformer+SKB': mean_scores},
    index=names,
)
df17 = df15.join(df16, how='outer')
df17