# Oversampling 

In [78]:
# https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/#:~:text=Random%20oversampling%20involves%20randomly%20selecting,them%20from%20the%20training%20dataset.
# example of evaluating a decision tree with random oversampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split

# create dataset
X, y = make_classification(n_samples=500, weights=[0.99], flip_y=0, random_state=42)

In [81]:
from sklearn.linear_model import LogisticRegression
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=1)

In [82]:
print(Counter(y_train))
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
X_over, y_over = oversample.fit_resample(x_train, y_train)
# summarize class distribution
print(Counter(y_over))
logisticRegr = LogisticRegression()

x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=1)
logisticRegr.fit(x_train, y_train)
score = logisticRegr.score(x_test, y_test)
print(score)

Counter({0: 346, 1: 4})
Counter({0: 346, 1: 346})
0.9866666666666667


# Stratified sample

In [83]:
# data: https://github.com/marquisvictor/Creating-a-Bias-Free-Testset/blob/master/housing.csv

import pandas as pd
import io
import requests

url="https://raw.githubusercontent.com/marquisvictor/Creating-a-Bias-Free-Testset/master/housing.csv"
result=requests.get(url).content
housing_df=pd.read_csv(io.StringIO(result.decode('utf-8')))

In [84]:
housing_df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [85]:
housing_df.fillna(0, inplace=True)

In [86]:
housing_df['ocean_proximity'].value_counts().sort_index()

<1H OCEAN     9136
INLAND        6551
ISLAND           5
NEAR BAY      2290
NEAR OCEAN    2658
Name: ocean_proximity, dtype: int64

In [87]:
#transform to numerical values 
values = {'<1H OCEAN':0,'INLAND':1, 'ISLAND':2,'NEAR BAY':3,'NEAR OCEAN':4}

In [88]:
housing_df['ocean_proximity'] = housing_df['ocean_proximity'].apply(lambda x: values[x])

In [89]:
#show proportion for the original dataset
(housing_df.groupby('ocean_proximity').size()/housing_df['ocean_proximity'].count())*100

ocean_proximity
0    44.263566
1    31.739341
2     0.024225
3    11.094961
4    12.877907
dtype: float64

In [90]:
#CREATE STRATIFIED SAMPLE
import numpy as np
from sklearn.model_selection import train_test_split

housing_df['ocean_proximity'] = housing_df['ocean_proximity'].astype('category')

X = housing_df.drop(['ocean_proximity'], axis=1)
y = housing_df['ocean_proximity']
categories = housing_df.select_dtypes(include=['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20, stratify=categories)

In [91]:
#CHECK THE PROPORTION 
result = pd.DataFrame(y_train)
(result.groupby('ocean_proximity').size()/result['ocean_proximity'].count())*100

ocean_proximity
0    44.264777
1    31.740552
2     0.024225
3    11.094961
4    12.875484
dtype: float64

# Train/Test/Validation sets

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

# Cross Validation


In [93]:
#https://www.ritchieng.com/machine-learning-cross-validation/
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


#read data and load X(predictors) and y(target variable)
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Execute KNN without cross validation 
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

0.9736842105263158

## Model Selection using Cross Validation

In [94]:
# 10-fold cross-validation 
knn = KNeighborsClassifier(n_neighbors=20)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

[1.         0.93333333 1.         1.         1.         0.93333333
 0.93333333 1.         1.         1.        ]
0.9800000000000001


In [95]:
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=10000)
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

0.9733333333333334
