# Resampling
* Under Sampling - Removing Samples randomly from the Majority Class
* Over Sampling - Adding Samples randomly to the Minority Class 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Social_Network_Ads.csv")
df.shape

(400, 5)

In [3]:
df.head(2)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0


In [None]:
sns.countplot(x = 'Purchased', data = df);

In [None]:
df['Purchased'].value_counts()

In [None]:
class_count_0, class_count_1 = df['Purchased'].value_counts()
print(class_count_0)
print(class_count_1)

In [None]:
# Separate the Classes
class_0 = df[df['Purchased'] == 0]
class_1 = df[df['Purchased'] == 1]

In [None]:
class_0.shape

In [None]:
class_1.shape

# Random Under Sampling
* Try using the same with Imblearn package as well

In [None]:
under = class_0.sample(class_count_1)

In [None]:
under.shape

In [None]:
finaldf = pd.concat([under, class_1])
finaldf.shape

In [None]:
finaldf.columns

In [None]:
sns.countplot(x = 'Purchased', data = finaldf);

# Random Over Sampling

In [None]:
over = class_1.sample(class_count_0, replace=True)
finaldf = pd.concat([over, class_0])

In [None]:
sns.countplot(x = 'Purchased', data = finaldf);

In [None]:
finaldf['Purchased'].value_counts()

# SMOTE - Synthetic Minority Over-sampling Technique

1. Choose 1 datapoint among the minority class(currently 143 minority classes - 1)
1. Find its nearest neighbour(KNN = 5)
1. Create a Synthetic datapoint between the minority class and the nearest neighbour
1. Repeats the process until the class is balanced

In [None]:
df.head()

In [4]:
x = df.drop(columns = "Purchased")
y = df.iloc[:, -1]

In [5]:
x.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,Male,19,19000
1,15810944,Male,35,20000
2,15668575,Female,26,43000
3,15603246,Female,27,57000
4,15804002,Male,19,76000


In [6]:
from sklearn.preprocessing import LabelEncoder
x['Gender'] = LabelEncoder().fit_transform(x['Gender'])

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# Used for secondary data structures - Counter - Counts the frequency of values
from collections import Counter

In [None]:
# import library
from imblearn.over_sampling import SMOTE

smote = SMOTE()

# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(x, y)

print('Original dataset shape', Counter(y))
print('Resample dataset shape', Counter(y_smote))

In [None]:
x_smote['Class'] = y_smote

In [None]:
x_smote.head()

In [None]:
x_smote.shape

In [None]:
sns.countplot(x = 'Class', data = x_smote);

# Weights
* use utils package for assigning weights

In [7]:
# load library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# we can add class_weight='balanced' to add panalize mistake
rfc_model = RandomForestClassifier(class_weight='balanced')

rfc_model.fit(x, y)

rfc_predict = rfc_model.predict(x)# check performance
print('ROCAUC score:',roc_auc_score(y, rfc_predict))
print('Accuracy score:',accuracy_score(y, rfc_predict))
print('F1 score:',f1_score(y, rfc_predict))

ROCAUC score: 1.0
Accuracy score: 1.0
F1 score: 1.0
