In [None]:
## Classification algorithms in various situations
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Generate a synthetic two-class dataset with class weights 0.9 / 0.1.
# flip_y=0 turns off random label flipping, random_state pins the draw.
from sklearn.datasets import make_classification
X_imb, y_imb = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_classes=2,
    weights=[0.9, 0.1],
    flip_y=0,
    random_state=42,
)

# Put the features into a DataFrame and attach the labels as a 'target' column.
import pandas as pd
df_imb = pd.DataFrame(
    X_imb,
    columns=[f'feature_{i}' for i in range(10)],
).assign(target=y_imb)

# Peek at the first rows, then at the relative frequency of each class.
df_imb.head()
df_imb['target'].value_counts(normalize=True)
# Baseline experiment: fit KNN on the imbalanced data without any resampling.
# stratify=y_imb keeps the class proportions identical in the train and test
# splits; without it the share of minority samples in the 20% hold-out set
# depends on the luck of the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_imb, y_imb, test_size=0.2, random_state=42, stratify=y_imb
)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# NOTE: the data was generated with class weights 0.9 / 0.1, so a model that
# always predicts the majority class would already score roughly 0.9 here.
# Read this accuracy with that baseline in mind.
print("Accuracy:", accuracy)
print('the shape X_imb',X_imb.shape)
y_imb.shape
# lets make data balanced using oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
# Resample ONLY the training split. X_test is a subset of X_imb, so fitting the
# oversampler on X_imb / y_imb would place the test rows inside the training
# data and KNN would be evaluated on points it has already memorised
# (data leakage -> inflated accuracy).
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
X_resampled.shape, y_resampled.shape

# lets train the KNN model with k=5
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_resampled, y_resampled)
# The test split is left untouched so it still reflects the original class mix.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# assignment
# note down your observations
# NOTE(review): observations 1-3 were recorded when the oversampler was fit on
# the full dataset, i.e. with the test rows also present in the training data,
# which inflates the "after" accuracy. Re-run this cell and confirm or revise
# them against the leak-free result.
# 1. The accuracy of the model before oversampling was lower compared to after oversampling.
# 2. Oversampling helped in balancing the dataset, which in turn improved the model's performance.
# 3. The KNN model is sensitive to the distribution of the training data, and balancing the classes can lead to better generalization.

# lets create multi class imbalance data
# (make_classification is imported earlier in this notebook)
X_multi, y_multi = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=3,
    weights=[0.7, 0.2, 0.1],
    flip_y=0,
    random_state=42,
)
# Wrap the features in a DataFrame and attach the labels as a 'target' column.
df_multi = pd.DataFrame(
    X_multi,
    columns=[f'feature_{i}' for i in range(10)],
).assign(target=y_multi)
df_multi.head()
# split the data
# stratify=y_multi keeps the three class proportions the same in the train and
# test splits, so the rarest class is represented in the hold-out set as it is
# in the full data rather than by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
X_test[44].reshape(1,-1)
# lets test for single value
# predict() is given a 2-D input, so the single test row is reshaped from
# (n_features,) to (1, n_features) before being passed in.
single_value = X_test[44].reshape(1, -1)
print("Single value input:", single_value)
single_pred = model.predict(single_value)
print("Single value prediction:", single_pred)
# time based splitting
# lets create time series data
import numpy as np
# Seeded generator: without a fixed seed the features, the labels and therefore
# the accuracy reported further down change on every run. 42 matches the
# random_state used by the other experiments in this notebook.
rng = np.random.default_rng(42)
date_range = pd.date_range(start='1/1/2020', periods=100, freq='D')
# 100 daily rows, 5 uniform random features in [0, 1) and a random 0/1 label.
X_time = rng.random((100, 5))
y_time = rng.integers(0, 2, size=100)  # upper bound is exclusive -> values in {0, 1}
df_time = pd.DataFrame(X_time, columns=[f'feature_{i}' for i in range(5)])
df_time['target'] = y_time
df_time['date'] = date_range
df_time.head()
# Chronological hold-out: rows dated before the cut-off train the model,
# rows dated on or after the cut-off are used for evaluation.
df_time = df_time.sort_values(by='date')
split_date = '2020-03-15'
train_data = df_time.loc[df_time['date'] < split_date]
test_data = df_time.loc[df_time['date'] >= split_date]

# Separate the label from the features; 'date' is only the split key and is
# dropped so it is not fed to the classifier.
X_train, y_train = train_data.drop(columns=['target', 'date']), train_data['target']
X_test, y_test = test_data.drop(columns=['target', 'date']), test_data['target']

# Fit KNN (k=5) on the earlier period and score it on the later one.
model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
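# assignment
# note down your observations
# 1. The accuracy of the model before oversampling was lower than the accuracy after oversampling.
# 2. Oversampling balanced the dataset, which in turn improved the model's performance.
# 3. The KNN model is sensitive to the class distribution of the training data, so balancing the classes can lead to better generalization.
# Caveat: this comparison is only meaningful when the oversampler is fit on the training split alone;
# if the test rows are part of the resampled training data, the "after" accuracy is inflated by leakage.
# On a 90/10 dataset, accuracy should also be read alongside per-class recall.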
