In [1]:
pip install scikit-learn imbalanced-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
from imblearn.over_sampling import ADASYN


In [3]:
# Read the cleaned dataset from its CSV file into a DataFrame
csv_path = '/content/df_cleaned_final.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,age,job,marital,education,housing,loan,contact,month,day_of_week,duration,campaign,previous,poutcome,y,PC1,PC2
0,56,housemaid,married,basic.4y,no,no,telephone,may,mon,261,1,0,nonexistent,0,-1.35089,0.681003
1,57,services,married,high.school,no,no,telephone,may,mon,149,1,0,nonexistent,0,-1.35089,0.681003
2,37,services,married,high.school,yes,no,telephone,may,mon,226,1,0,nonexistent,0,-1.35089,0.681003
3,40,admin.,married,basic.6y,no,no,telephone,may,mon,151,1,0,nonexistent,0,-1.35089,0.681003
4,56,services,married,high.school,no,yes,telephone,may,mon,307,1,0,nonexistent,0,-1.35089,0.681003


In [5]:
# Print each column's dtype and non-null count to spot missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37089 entries, 0 to 37088
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          37089 non-null  int64  
 1   job          37089 non-null  object 
 2   marital      37089 non-null  object 
 3   education    37089 non-null  object 
 4   housing      37089 non-null  object 
 5   loan         37089 non-null  object 
 6   contact      37089 non-null  object 
 7   month        37089 non-null  object 
 8   day_of_week  37089 non-null  object 
 9   duration     37089 non-null  int64  
 10  campaign     37089 non-null  int64  
 11  previous     37089 non-null  int64  
 12  poutcome     37089 non-null  object 
 13  y            37089 non-null  int64  
 14  PC1          37089 non-null  float64
 15  PC2          37089 non-null  float64
dtypes: float64(2), int64(5), object(9)
memory usage: 4.5+ MB


In [6]:
# Report, for every column, its dtype and how many distinct values it holds
separator_width = 20

print("Number of unique values for each variable")
print("=" * separator_width)

for feature in data.columns:
    column = data[feature]
    print(f"{feature} ({column.dtypes}): {column.nunique()}")
    print("-" * separator_width)

Number of unique values for each variable
age (int64): 78
--------------------
job (object): 11
--------------------
marital (object): 3
--------------------
education (object): 7
--------------------
housing (object): 2
--------------------
loan (object): 2
--------------------
contact (object): 2
--------------------
month (object): 10
--------------------
day_of_week (object): 5
--------------------
duration (int64): 753
--------------------
campaign (int64): 7
--------------------
previous (int64): 8
--------------------
poutcome (object): 3
--------------------
y (int64): 2
--------------------
PC1 (float64): 373
--------------------
PC2 (float64): 373
--------------------


In [7]:
# Split the frame into the target column ('poutcome') and everything else
labels = data['poutcome']
features = data.drop(columns='poutcome')

In [8]:
# Column groups used to build the model inputs, plus the target series.
# Categorical columns are one-hot encoded later; numeric columns are used as-is.
categorical_features = [
    'job',
    'marital',
    'education',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
]
numeric_features = ['age', 'y']
# NOTE(review): duration, campaign, previous, PC1 and PC2 are also numeric
# (see data.info()) but are not included here; confirm this is intentional.
labels = data['poutcome']  # target column


In [9]:
# Perform one-hot encoding for categorical features
from sklearn.preprocessing import OneHotEncoder

# The encoder's default output is a SciPy sparse matrix. Densify it explicitly
# with .toarray() instead of passing `sparse=False`: that keyword was renamed
# to `sparse_output` in newer scikit-learn releases and the old spelling is
# rejected there, while this form works on both old and new versions.
encoder = OneHotEncoder()
encoded_matrix = encoder.fit_transform(data[categorical_features]).toarray()

# Reuse the source frame's index so a later column-wise concat with columns of
# `data` lines up row by row even if `data` does not have a default RangeIndex.
encoded_categorical_features = pd.DataFrame(encoded_matrix, index=data.index)



In [10]:
# The encoded frame has integer column labels; turn each one into its string form
encoded_categorical_features.columns = encoded_categorical_features.columns.map(str)


In [11]:
# Place the one-hot columns and the selected numeric columns side by side
numeric_block = data[numeric_features]
features = pd.concat(
    [encoded_categorical_features, numeric_block],
    axis='columns',
)


In [12]:
# Make every column label of the combined frame a string
features.columns = features.columns.map(str)


In [13]:
# Create an instance of ADASYN.
# ADASYN generates synthetic samples with a random component, so a fixed seed
# is passed to make the oversampled data reproducible from one run to the next.
adasyn = ADASYN(random_state=42)

In [14]:
# Oversample with ADASYN; fit_resample returns the resampled (features, labels) pair
resampled = adasyn.fit_resample(features, labels)
oversampled_features, oversampled_labels = resampled


In [15]:
# Show the per-class counts of the target before and after oversampling
distribution_reports = (
    ("Class distribution before ADASYN:", labels),
    ("\nClass distribution after ADASYN:", oversampled_labels),
)
for heading, target in distribution_reports:
    print(heading)
    print(target.value_counts())

Class distribution before ADASYN:
nonexistent    31803
failure         4008
success         1278
Name: poutcome, dtype: int64

Class distribution after ADASYN:
nonexistent    31803
success        31545
failure        31390
Name: poutcome, dtype: int64
