In [20]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
import numpy as np

# Generating a dummy imbalanced dataset
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)

# Splitting dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Applying SMOTE to balance the dataset
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

# Checking the class distribution after resampling
print(f"Original dataset shape {np.bincount(y_train)}")
print(f"Resampled dataset shape {np.bincount(y_res)}")


Original dataset shape [ 74 676]
Resampled dataset shape [676 676]


In [2]:
import sys
import pandas as pd
# sys.path is a list of absolute path strings
sys.path.append('C:\Projects\Private\PropStar')
from normalization.normalize import Normalize


In [3]:
data = pd.read_csv('datasets/accelerometer_gyro_mobile_phone_dataset.csv')
data = data[~data['timestamp'].str.contains("6/25/2022")]

In [4]:
data

Unnamed: 0,accX,accY,accZ,gyroX,gyroY,gyroZ,timestamp,Activity
0,-0.496517,3.785628,8.954828,-0.142849,-0.126159,-0.022539,34:22.9,1
1,-0.462388,3.869603,9.281898,0.084349,0.096695,0.092130,34:23.0,1
2,-0.296084,3.820505,8.930728,0.061763,0.051543,0.071287,34:23.1,1
3,-0.469723,3.890110,8.744067,0.007641,0.028679,0.109433,34:23.2,1
4,-0.472418,4.109105,8.941207,-0.123640,0.099057,0.051943,34:23.3,1
...,...,...,...,...,...,...,...,...
31986,-0.488734,1.610800,10.610386,0.079187,-0.174218,-0.050365,03:15.1,1
31987,-0.049397,2.769092,7.008276,-0.083853,0.007656,-0.045658,03:15.2,1
31988,0.291294,3.002007,6.732400,0.005984,-0.058994,-0.087044,03:15.3,1
31989,0.256267,4.069138,8.687933,0.061487,-0.016278,-0.088728,03:15.4,1


In [5]:
data.groupby('Activity')['Activity'].count()

Activity
0      570
1    31420
Name: Activity, dtype: int64

In [6]:
# data['Activity'] = data['Activity'].map({0:1, 1:0})

In [7]:
data[['accX', 'accY', 'accZ']].drop_duplicates().count()
data[['gyroX', 'gyroY', 'gyroZ']].drop_duplicates().count()

gyroX    31669
gyroY    31669
gyroZ    31669
dtype: int64

In [8]:
data['timestamp'] = '2024-01-01 00:' + data['timestamp'].str.replace(',', '.')

# Convert to datetime
data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')

In [9]:
data

Unnamed: 0,accX,accY,accZ,gyroX,gyroY,gyroZ,timestamp,Activity
0,-0.496517,3.785628,8.954828,-0.142849,-0.126159,-0.022539,2024-01-01 00:34:22.900,1
1,-0.462388,3.869603,9.281898,0.084349,0.096695,0.092130,2024-01-01 00:34:23.000,1
2,-0.296084,3.820505,8.930728,0.061763,0.051543,0.071287,2024-01-01 00:34:23.100,1
3,-0.469723,3.890110,8.744067,0.007641,0.028679,0.109433,2024-01-01 00:34:23.200,1
4,-0.472418,4.109105,8.941207,-0.123640,0.099057,0.051943,2024-01-01 00:34:23.300,1
...,...,...,...,...,...,...,...,...
31986,-0.488734,1.610800,10.610386,0.079187,-0.174218,-0.050365,2024-01-01 00:03:15.100,1
31987,-0.049397,2.769092,7.008276,-0.083853,0.007656,-0.045658,2024-01-01 00:03:15.200,1
31988,0.291294,3.002007,6.732400,0.005984,-0.058994,-0.087044,2024-01-01 00:03:15.300,1
31989,0.256267,4.069138,8.687933,0.061487,-0.016278,-0.088728,2024-01-01 00:03:15.400,1


In [10]:
X = data.drop('Activity', axis=1).drop('timestamp', axis=1)
y = data['Activity']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [22]:
# sm = SMOTE(random_state=42)
sm = SMOTEENN(smote=SMOTE(random_state=42),
                     enn=EditedNearestNeighbours(sampling_strategy='majority'))
X_res, y_res = sm.fit_resample(X_train, y_train)

In [23]:
# Checking the class distribution after resampling
print(f"Original dataset shape {np.bincount(y_train)}")
print(f"Resampled dataset shape {np.bincount(y_res)}")

Original dataset shape [ 74 676]
Resampled dataset shape [676 676]


In [24]:
X_res

array([[ 1.18374959,  1.04654391,  0.98678465, ..., -0.12583345,
         1.3844996 , -1.93669569],
       [-0.20298619, -0.58504162,  0.39955507, ...,  0.34749467,
         0.16888375, -2.42843063],
       [ 1.68329024,  1.39140999,  1.52528774, ...,  0.267679  ,
        -0.81818396,  0.08272461],
       ...,
       [ 0.27330794,  0.58063391, -0.24143381, ..., -0.84439832,
         0.53742459, -2.2560769 ],
       [-0.89831997,  0.68756799,  0.43798092, ...,  1.57974469,
         1.19103353, -0.40891704],
       [-2.24234157,  0.74331177,  0.77549641, ..., -1.40690827,
         0.39199296, -1.20733211]])

In [25]:
# sm = SMOTE(random_state=42)
sm = SMOTETomek(random_state=42, tomek=TomekLinks(sampling_strategy='majority'))
X_res2, y_res2 = sm.fit_resample(X_train, y_train)
# Checking the class distribution after resampling
print(f"Original dataset shape {np.bincount(y_train)}")
print(f"Resampled dataset shape {np.bincount(y_res2)}")

Original dataset shape [ 74 676]
Resampled dataset shape [676 676]
