In [None]:
import pandas as pd
import numpy as np

# https://imbalanced-learn.org/stable/

In [None]:
pip install imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Using cached imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.14.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification

# Synthetic 2-feature, 3-class problem with a strongly skewed class mix
# (requested weights: 1% / 5% / 94%). Seeded so the data is repeatable.
X, y = make_classification(
    n_samples=5000,
    n_classes=3,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    class_sep=0.8,
    random_state=0,
)

# Randomly over-sample (seeded); presumably the minority classes are raised to
# the majority-class count. The result is stored under new names so the
# original X and y stay available.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)


In [5]:
# Shape of the over-sampled feature matrix, as (n_rows, n_features).
X_resampled.shape

(14022, 2)

In [2]:
import pandas as pd
from sklearn.datasets import load_iris

# Load the iris data and expose it as a feature frame (X) and a label series (y).
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Features and label side by side (label column named 'target') for inspection.
labelled = y.rename('target')
df = pd.concat([X, labelled], axis=1)
df.head(2)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded, so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the four pieces, in the order they were returned.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [9]:
# Label encoding: 0 = setosa, 1 = versicolor, 2 = virginica.
# Relative class frequencies, first for the training labels and then for the
# test labels.
tuple(labels.value_counts(normalize=True) for labels in (y_train, y_test))

(0    0.333333
 2    0.333333
 1    0.333333
 Name: proportion, dtype: float64,
 0    0.333333
 2    0.333333
 1    0.333333
 Name: proportion, dtype: float64)

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbours trained on the raw (unstandardized) features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

1.0

In [7]:
from sklearn.preprocessing import StandardScaler

# Same 5-NN model, this time on standardized features. The scaler is fitted on
# the training rows only and then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

# Test accuracy with standardization, for comparison with the unscaled run.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333

In [16]:
import numpy as np
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset into a feature frame and target series.
# NOTE: per the scikit-learn dataset description the target is a continuous
# measure of disease progression one year after baseline -- it does NOT say
# whether a person has diabetes.
data = load_diabetes()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
df = pd.concat([X, y.rename('target')], axis=1)

# Turn the continuous target into a binary label for classification:
#   1 = progression above the median, 0 = progression at or below the median.
df['target'] = np.where(df['target'] > df['target'].median(), 1, 0)

# Re-derive X and y from the frame so y now holds the binary label.
X = df.drop('target', axis=1)
y = df['target']

# Class balance of the new binary label (last expression, so it is displayed).
y.value_counts()

In [13]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,1
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,0


In [17]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split, stratified on y and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shapes of the four returned pieces, in order.
tuple(piece.shape for piece in (X_train, X_test, y_train, y_test))

((353, 10), (89, 10), (353,), (89,))

In [33]:
# Hand-picked subset of feature columns to train on.
col_to_select = [
    'age',
    'sex',
    'bmi',
    'bp',
]

In [34]:
# List the column labels of the training feature frame.
X_train.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [35]:
# KNN on a hand-picked feature subset (col_to_select).
# RFE is imported but deliberately not applied: it ranks features through the
# estimator's coef_ / feature_importances_, which KNeighborsClassifier does not
# expose, so the subset is chosen manually instead.
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train[col_to_select], y_train)

# Predict once and keep the result (a second, discarded predict call was removed).
y_pred = knn.predict(X_test[col_to_select])

# Test accuracy using only the selected columns.
accuracy_score(y_test, y_pred)


0.6629213483146067

In [19]:
# Categorical features demo: build a 5000-row synthetic frame with two
# categorical columns, two integer columns and a random binary target.
import pandas as pd
import numpy as np

# Seeded generator so the frame (and every score computed from it) is the same
# on each Restart & Run All.
rng = np.random.default_rng(42)

df = pd.DataFrame({
    'color': rng.choice(['red', 'blue', 'green'], size=5000),
    'size': rng.choice(['S', 'M', 'L'], size=5000),
    # integers() excludes the upper bound, matching np.random.randint.
    'price': rng.integers(10, 100, size=5000),
    'weight': rng.integers(1, 20, size=5000),
    'target': rng.choice([0, 1], size=5000)
})

df.head()


Unnamed: 0,color,size,price,weight,target
0,green,M,16,19,0
1,blue,S,44,15,0
2,green,S,16,17,0
3,red,M,29,3,1
4,red,S,28,16,1


In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split the raw frame (categorical columns still hold strings) 80/20,
# stratified on the target and seeded.
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Demonstration: KNN cannot be trained on un-encoded string columns.
# The expected ValueError is caught and printed so the notebook still runs
# end to end instead of stopping at this cell.
knn = KNeighborsClassifier(n_neighbors=5)
try:
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print(accuracy_score(y_test, y_pred))
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: could not convert string to float: 'red'

In [21]:
# One-Hot Encoding: encode the categorical columns, rebuild X, re-split, retrain.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder

cat_cols = ['color', 'size']

# drop='first' removes one dummy per feature; sparse_output=False returns a dense array.
# NOTE(review): the encoder is fitted on all rows before the split. That is
# harmless for this fixed vocabulary, but on real data fit it on the training
# rows only.
ohe = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = ohe.fit_transform(df[cat_cols])

# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a fresh
# RangeIndex would silently produce NaN rows if df's index were not 0..n-1.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=ohe.get_feature_names_out(cat_cols),
    index=df.index,
)

# Encoded dummies first, then the numeric columns; later cells rely on this column order.
X = pd.concat([encoded_df, df[['price', 'weight']]], axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train 5-NN on the encoded features and report test accuracy.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.51

In [22]:
from sklearn.metrics import accuracy_score

# Accuracy on the training rows themselves, for comparison with the test score.
# Note that y_pred now holds the training-set predictions.
y_pred = knn.predict(X_train)
accuracy_score(y_true=y_train, y_pred=y_pred)

0.68125

In [10]:
# Score one new item with the already-fitted encoder (ohe) and model (knn).
new_data = pd.DataFrame({
    'color': ['red'],
    'size': ['M'],
    'price': [50],
    'weight': [10]
})

# Encode the categorical columns with the fitted encoder (transform only, no
# refit) so the dummy columns match the ones the model was trained on.
cat_cols = ['color', 'size']
encoded_new_data = ohe.transform(new_data[cat_cols])

# Keep new_data's index so the index-aligned concat below cannot misalign rows.
encoded_new_data_df = pd.DataFrame(
    encoded_new_data,
    columns=ohe.get_feature_names_out(cat_cols),
    index=new_data.index,
)

# Same column order as the training frame: encoded dummies, then price and weight.
final_new_data = pd.concat([encoded_new_data_df, new_data[['price', 'weight']]], axis=1)

# Predicted class for the new item.
knn.predict(final_new_data)

array([0])

### Missing Value Imputation

In [11]:
# Toy dataset for the imputation demo: two numeric features that each contain
# two NaN gaps, plus an alternating 0/1 target.
data = dict(
    feature1=[1, 2, np.nan, 4, 5, np.nan, 7, 8, 9, 10],
    feature2=[np.nan, 1.5, 2.5, 3.5, np.nan, 5.5, 6.5, 7.5, 8.5, 9.5],
    target=[0, 1] * 5,
)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,feature1,feature2,target
0,1.0,,0
1,2.0,1.5,1
2,,2.5,0
3,4.0,3.5,1
4,5.0,,0


In [12]:
# Demonstration: KNN cannot be fitted on features that contain NaN.
# The expected ValueError is caught and printed so the notebook still runs
# end to end instead of stopping at this cell.
knn = KNeighborsClassifier(n_neighbors=3)
try:
    knn.fit(df[['feature1', 'feature2']], df['target'])
except ValueError as err:
    print(f"ValueError: {err}")


ValueError: Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Mean imputation: replace each missing feature value with the mean of the
# observed values in the same column.
# Only the feature columns are imputed, so a missing label could never be
# filled with a non-class value such as 0.5.
feature_cols = ['feature1', 'feature2']
df_imputed = df.copy()
df_imputed[feature_cols] = df[feature_cols].fillna(df[feature_cols].mean())

# Show the imputed frame (this is what produces the table below the cell).
df_imputed.head()



Unnamed: 0,feature1,feature2,target
0,1.0,5.625,0
1,2.0,1.5,1
2,5.75,2.5,0
3,4.0,3.5,1
4,5.0,5.625,0


In [14]:
# Fit 3-NN on the imputed features and classify one new point.
feature_cols = ['feature1', 'feature2']
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(df_imputed[feature_cols], df_imputed['target'])

# Pass the query as a DataFrame with the training column names. A bare list
# has no feature names and makes scikit-learn warn that X does not have valid
# feature names.
knn.predict(pd.DataFrame([[6, 6]], columns=feature_cols))



array([0])