In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
!gdown 1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/multiclass.csv
  0% 0.00/14.6k [00:00<?, ?B/s]100% 14.6k/14.6k [00:00<00:00, 37.8MB/s]


In [4]:
df = pd.read_csv('/content/multiclass.csv')
display(df.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3,12669,9656,7561,214,2674,1338,2
1,3,7057,9810,9568,1762,3293,1776,2
2,3,6353,8808,7684,2405,3516,7844,2
3,3,13265,1196,4221,6404,507,1788,1
4,3,22615,5410,7198,3915,1777,5185,1


In [5]:
print("Number of null values per column:")
print(df.isnull().sum())

Number of null values per column:
Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Region            440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-null    int64
 7   class             440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


# Task
Standardize the numerical columns in the dataframe.

## Identify columns for standardization

### Subtask:
Determine which columns are numerical and should be standardized.


**Reasoning**:
Inspect the DataFrame to identify numerical columns and create a list of their names, excluding the 'class' and 'Region' columns as they are likely categorical or target variables based on the column names and previous steps.



In [7]:
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
cols_to_standardize = [col for col in numerical_cols if col not in ['class', 'Region']]
print("Numerical columns to standardize:", cols_to_standardize)

Numerical columns to standardize: ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen']


## Import standardscaler

### Subtask:
Import the `StandardScaler` from scikit-learn.


**Reasoning**:
Import the StandardScaler class from the sklearn.preprocessing module.



In [8]:
from sklearn.preprocessing import StandardScaler

## Initialize standardscaler

### Subtask:
Create an instance of the `StandardScaler`.


**Reasoning**:
Instantiate the StandardScaler object as requested.



In [9]:
scaler = StandardScaler()

## Fit and transform the data

### Subtask:
Apply the `StandardScaler` to the selected columns of the DataFrame.


**Reasoning**:
Apply the StandardScaler to the selected columns and store the result in a new variable.



In [10]:
df_scaled = scaler.fit_transform(df[cols_to_standardize])

## Replace original columns

### Subtask:
Replace the original columns with the standardized data.


**Reasoning**:
Create a new DataFrame from the scaled data and replace the original columns in the original DataFrame with the scaled columns.



In [11]:
df_scaled_df = pd.DataFrame(df_scaled, columns=cols_to_standardize)
df[cols_to_standardize] = df_scaled_df
display(df.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3,0.052933,0.523568,-0.041115,-0.589367,-0.043569,-0.066339,2
1,3,-0.391302,0.544458,0.170318,-0.270136,0.086407,0.089151,2
2,3,-0.447029,0.408538,-0.028157,-0.137536,0.133232,2.243293,2
3,3,0.100111,-0.62402,-0.392977,0.687144,-0.498588,0.093411,1
4,3,0.840239,-0.052396,-0.079356,0.173859,-0.231918,1.299347,1


## Verify standardization

### Subtask:
Display the descriptive statistics of the standardized columns to confirm the mean is close to 0 and the standard deviation is close to 1.


**Reasoning**:
Display the descriptive statistics of the standardized columns to confirm the mean is close to 0 and the standard deviation is close to 1.



In [None]:
display(df[cols_to_standardize].describe())

## Summary:

### Data Analysis Key Findings

*   The numerical columns identified for standardization were 'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents\_Paper', and 'Delicassen'. The columns 'class' and 'Region' were excluded.
*   The `StandardScaler` from `sklearn.preprocessing` was successfully imported and initialized.
*   The `fit_transform` method of the `StandardScaler` was applied to the selected numerical columns, resulting in a NumPy array of standardized values.
*   The original numerical columns in the DataFrame `df` were successfully replaced with the standardized values.
*   Descriptive statistics of the standardized columns show that the mean of each standardized column is very close to 0 and the standard deviation is very close to 1, confirming successful standardization.

### Insights or Next Steps

*   The standardized numerical features are now ready for use in machine learning models or other analyses that benefit from scaled data.
*   Consider exploring the distribution of the standardized features to identify any outliers or skewness that might require further preprocessing.


In [12]:
class_counts = df['class'].value_counts()
print("Number of data points in each class:")
print(class_counts)

Number of data points in each class:
class
2    180
3    173
1     87
Name: count, dtype: int64


# Task
Balance the dataset using SMOTE.

## Check for class imbalance

### Subtask:
Verify if the dataset has imbalanced classes by counting the number of data points in each class.


## Install imbalanced-learn

### Subtask:
Install the `imbalanced-learn` library which contains the SMOTE implementation.


**Reasoning**:
Install the `imbalanced-learn` library using pip.



In [13]:
!pip install imbalanced-learn



## Import smote

### Subtask:
Import the `SMOTE` class from `imblearn.over_sampling`.


**Reasoning**:
Import the SMOTE class from the imblearn.over_sampling module.



In [14]:
from imblearn.over_sampling import SMOTE

## Separate features and target

### Subtask:
Separate the features (X) and the target variable (y) from the DataFrame.


**Reasoning**:
Separate the features (X) and the target variable (y) from the DataFrame and display the head of X and y.



In [15]:
X = df.drop('class', axis=1)
y = df['class']
display(X.head())
display(y.head())

Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,3,0.052933,0.523568,-0.041115,-0.589367,-0.043569,-0.066339
1,3,-0.391302,0.544458,0.170318,-0.270136,0.086407,0.089151
2,3,-0.447029,0.408538,-0.028157,-0.137536,0.133232,2.243293
3,3,0.100111,-0.62402,-0.392977,0.687144,-0.498588,0.093411
4,3,0.840239,-0.052396,-0.079356,0.173859,-0.231918,1.299347


Unnamed: 0,class
0,2
1,2
2,2
3,1
4,1


## Apply smote

### Subtask:
Apply the SMOTE technique to generate synthetic samples for the minority class(es).


**Reasoning**:
Initialize SMOTE and apply it to the features and target variable to generate synthetic samples.



In [16]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Shape of original features:", X.shape)
print("Shape of resampled features:", X_resampled.shape)
print("Number of data points in each class after SMOTE:")
print(y_resampled.value_counts())

Shape of original features: (440, 7)
Shape of resampled features: (540, 7)
Number of data points in each class after SMOTE:
class
2    180
1    180
3    180
Name: count, dtype: int64


## Verify balancing

### Subtask:
Check the class distribution after applying SMOTE to ensure the dataset is balanced.


**Reasoning**:
Check the class distribution of the resampled target variable to ensure the dataset is balanced after applying SMOTE.



In [17]:
print("Number of data points in each class after SMOTE:")
print(y_resampled.value_counts())

Number of data points in each class after SMOTE:
class
2    180
1    180
3    180
Name: count, dtype: int64


## Summary:

### Data Analysis Key Findings

* The original dataset had a class imbalance, with varying numbers of data points in each class.
* The `imbalanced-learn` library, containing the SMOTE implementation, was confirmed to be installed.
* The features (X) and target variable (y) were successfully separated from the original DataFrame.
* After applying SMOTE, the number of data points in each class became equal, with 180 data points for each of the three classes (2, 1, and 3).
* The shape of the features changed from (440, 7) to (540, 7) after SMOTE, indicating the addition of synthetic samples.

### Insights or Next Steps

* The dataset is now balanced, which is suitable for training machine learning models that are sensitive to class imbalance.
* The balanced dataset can now be used for model training and evaluation.


In [19]:
print("Shape of original features:", X.shape)
print("Shape of resampled features:", X_resampled.shape)

Shape of original features: (440, 7)
Shape of resampled features: (540, 7)


Univariate Analysis

In [20]:
print("Variance of each feature:")
print(X_resampled.var())

Variance of each feature:
Region              0.597279
Fresh               0.872072
Milk                0.857451
Grocery             0.861009
Frozen              0.904352
Detergents_Paper    0.852811
Delicassen          0.833363
dtype: float64


Multivariate Analysis

In [21]:
correlation_matrix = X_resampled.corr()
print("Correlation matrix:")
display(correlation_matrix)

Correlation matrix:


Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
Region,1.0,0.040087,0.043011,0.014481,0.016354,0.002605,0.040464
Fresh,0.040087,1.0,0.069023,-0.04021,0.336842,-0.127652,0.227224
Milk,0.043011,0.069023,1.0,0.738351,0.114011,0.671762,0.410616
Grocery,0.014481,-0.04021,0.738351,1.0,-0.047913,0.925822,0.217305
Frozen,0.016354,0.336842,0.114011,-0.047913,1.0,-0.139935,0.3701
Detergents_Paper,0.002605,-0.127652,0.671762,0.925822,-0.139935,1.0,0.082985
Delicassen,0.040464,0.227224,0.410616,0.217305,0.3701,0.082985,1.0


In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (432, 7)
Shape of X_test: (108, 7)
Shape of y_train: (432,)
Shape of y_test: (108,)


In [74]:
print("Values of X_test.iloc[50]:")
print(X_test.iloc[50])

Values of X_test.iloc[50]:
Region              1.000000
Fresh               0.557097
Milk               -0.598149
Grocery            -0.602627
Frozen             -0.007514
Detergents_Paper   -0.533277
Delicassen          0.175714
Name: 459, dtype: float64


In [85]:
def knn(X,Y,queryPoint,k):
    """Predict the class label for the query point"""
    # Euclidean Distance
    dist = np.sqrt(np.sum((queryPoint-X)**2,axis=1) )

    # Storing distance and Class labels together
    distances = [(dist[i],Y[i]) for i in range(len(dist)) ]
    # sort the distances
    distances = sorted(distances)
    # Nearest/First K points
    distances = distances[:k]

    distances = np.array(distances)

    classes_counts = np.unique(distances[:,1],return_counts=True)

    index = classes_counts[1].argmax()
    pred = classes_counts[0][index]

    return int(pred),distances

In [86]:
pred,neighbors = knn(X_train, y_train, X_test.iloc[50],5)

print(f'k nearest neighbors with the distance and class label:{neighbors}')

print(f'Predicted class label:{pred}')

KeyError: 1

In [84]:
pred, neighbors = knn(X_train, y_train, X_train.iloc[50].to_numpy(), k=5)
display(pred)
display(neighbors)

TypeError: 'KNeighborsClassifier' object is not callable

In [80]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

In [81]:
knn = KNeighborsClassifier(n_neighbors=5,metric='euclidean')
knn.fit(X_train, y_train)

In [82]:
knn.score(X_test,y_test)

0.9166666666666666

In [83]:
from sklearn.metrics import confusion_matrix
y_pred = knn.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
display(cm)

array([[134,   2,   8],
       [ 14, 127,   3],
       [  8,   5, 131]])