In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from pathlib import Path
# Directory that holds the competition CSV files.
file_path = Path("/kaggle/input/mobile-price-classification")
# Read both CSVs from that directory into DataFrames.
train_df, test_df = (
    pd.read_csv(file_path / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Problem statement
This is a mobile-price classification problem; the dataset is provided on Kaggle.  
Using the 20 features in the dataset, each record has to be classified into one of 4 categories (0–3).  

The target variable takes the values 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost).

# EDA

## Dataset
The train dataset has 2000 records with 20 features.  
There are no null values in any column.  

- Binary features: blue, dual_sim, four_g, three_g, touch_screen, wifi
- Continuous features: all remaining features.

The test dataset has 1000 records with the same features.  


In [None]:
# Summary statistics for the numeric columns of the training set.
train_df.describe()

In [None]:
# Column dtypes and non-null counts, used to check for missing values.
train_df.info()

In [None]:
# (rows, columns) of the training set.
train_df.shape

In [None]:
# Number of training records in each price_range class.
train_df['price_range'].value_counts()

In [None]:
# (rows, columns) of the test set.
test_df.shape

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Pairwise plots of the non-binary columns, coloured by price class.
# price_range stays in the frame because it is needed as the hue variable.
continuous_df = train_df[[c for c in train_df.columns if c not in ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']]]
# The pandas option `mode.use_inf_as_na` is deprecated, so infinities are
# converted to NaN explicitly instead of toggling the option around the call.
sns.pairplot(
    continuous_df.replace([np.inf, -np.inf], np.nan),
    hue='price_range',
    diag_kind='kde',
    markers=["o", "s", "D", "^"],
);

## Correlation matrix
The feature most strongly correlated with price range is ram.  
The other features show very low correlation with price range.  
Therefore, the continuous variables might not be helpful for classifying price range as they are  
(a transformation such as log or exp might be needed).  

In [None]:
# Pairwise correlation between every pair of columns (pandas default method).
corr = train_df.corr()
# Render the matrix as a heat map on a diverging colour scale.
corr.style.background_gradient(cmap='coolwarm')

## Transformation on continuous features
I applied two transformations to the continuous features: log and exp.  
But there seems to be no remarkable improvement.  


### Log

In [None]:
# Reload the raw CSVs so the frames contain only the original columns.
train_df = pd.read_csv(file_path / "train.csv")
test_df = pd.read_csv(file_path / "test.csv")

binary_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
# Everything that is neither a binary flag nor the target counts as continuous.
continuous_features = [
    c for c in train_df.columns if c not in (*binary_features, 'price_range')
]
log_features = [c + '_log' for c in continuous_features]
# log1p(x) = log(1 + x), so zero entries map to 0 rather than -inf.
train_df[log_features] = np.log1p(train_df[continuous_features])

In [None]:
# First rows of the untransformed continuous columns, for comparison.
train_df[continuous_features].head()

In [None]:
# First rows of the log1p-transformed columns.
train_df[log_features].head()

In [None]:
# Correlation of the log-transformed columns with each other and with the target.
log_corr = train_df[log_features+['price_range']].corr()
# Render the matrix as a heat map on a diverging colour scale.
log_corr.style.background_gradient(cmap='coolwarm')

### exp

In [None]:
# Reload the raw CSVs so the frames contain only the original columns.
train_df = pd.read_csv(file_path/"train.csv")
test_df = pd.read_csv(file_path/"test.csv")
binary_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
continuous_features = [c for c in train_df.columns if c not in binary_features + ['price_range']]
exp_features = [f'{c}_exp' for c in continuous_features]
# np.exp overflows float64 to inf once its argument exceeds roughly 709, and
# inf entries turn the downstream correlations into NaN. Columns measured in
# the hundreds or thousands would hit this (presumably ram, battery_power and
# the pixel sizes — confirm with describe()). Rescale every column to [0, 1]
# first so the exponential stays finite.
cont = train_df[continuous_features]
cont_min = cont.min()
cont_range = (cont.max() - cont_min).replace(0, 1)  # avoid 0/0 for constant columns
train_df[exp_features] = np.exp((cont - cont_min) / cont_range)

In [None]:
# First rows of the exp-transformed columns.
train_df[exp_features].head()

In [None]:
# Correlation of the exp-transformed columns with each other and with the target.
exp_corr = train_df[exp_features+['price_range']].corr()
# Render the matrix as a heat map on a diverging colour scale.
exp_corr.style.background_gradient(cmap='coolwarm')

## Feature creation
Some features, such as pixel height, pixel width, screen height, and screen width, can be multiplied to create new features.  
The new features, pixel dimension and screen dimension, show a slightly higher correlation than each of the original features.  



In [None]:
# Reload the raw CSVs so the frames contain only the original columns.
train_df = pd.read_csv(file_path / "train.csv")
test_df = pd.read_csv(file_path / "test.csv")

binary_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
# Computed before the engineered columns are added, so it lists raw columns only.
continuous_features = [
    c for c in train_df.columns if c not in (*binary_features, 'price_range')
]

# Area-like features: pixel height x width and screen height x width.
train_df = train_df.assign(
    px_dimension=lambda d: d['px_height'] * d['px_width'],
    sc_dimension=lambda d: d['sc_h'] * d['sc_w'],
)

# Correlation matrix including the two new columns, shown as a heat map.
corr = train_df.corr()
corr.style.background_gradient(cmap='coolwarm')

## Bar plot
Drawing a bar plot for each binary feature shows how the records are distributed across the feature's values.  
- **Bluetooth**: class 3 mobiles are more likely to have the Bluetooth option than the other classes.
- **Dual sim**: class 3 mobiles are more likely to have the dual sim option than the other classes.
- **Four G**: class 0, 1 and 3 mobiles are more likely to have the 4G option than class 2.
- **Three G**: shows no difference in proportion between the classes.
- **Touch screen**: class 0 and 1 mobiles are more likely to have the touch screen option than classes 2 and 3.
- **Wifi**: class 3 mobiles are more likely to have the wifi option than the other classes.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One bar chart per binary feature: for each price class, the share of records
# with the feature off (0) and on (1).
binary_features = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
fig, axes = plt.subplots(2, 3, figsize=(10, 5))
for bf, ax in zip(binary_features, axes.ravel()):
    counts = train_df.groupby(['price_range', bf]).size().unstack(fill_value=0)
    # Normalise by the actual size of each class instead of a hard-coded 500,
    # so the shares stay correct if the class counts ever differ.
    counts = counts.div(counts.sum(axis=1), axis=0)
    counts.plot(kind='bar', ax=ax)
    ax.set_title(bf)
    ax.set_ylabel('share of class')
# Keep titles and axis labels of neighbouring subplots from overlapping.
fig.tight_layout()

# Data split
Splitting the data is crucial for training a model while avoiding overfitting.  
The purpose of a deep learning model is to generalize patterns.  
The loss on the validation dataset shows how well the model generalizes.  

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# List the columns currently present in train_df.
train_df.columns

# PCA
PCA reduces the dimension of a dataset by extracting its principal components.  
Since most of the continuous features are uncorrelated with the target, it might be worth trying dimension reduction. I prepared a PCA dataset as another option.  

PCA should include only continuous numerical features, so the binary features are excluded from PCA and added unchanged to the final dataset.  

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# List the columns currently present in train_df before running PCA.
train_df.columns

In [None]:
# Number of continuous columns, i.e. the maximum number of principal components.
len(continuous_features)

In [None]:
# Standardise the continuous columns, then fit a PCA that keeps every
# component so the explained-variance curve can be inspected.
x = train_df[continuous_features]

scaler = StandardScaler()
scaled_data = scaler.fit_transform(x)

pca = PCA()
printcipalComponents = pca.fit_transform(scaled_data)

The plot below shows how much variance is explained by the principal components.  
I'll choose the point where the slope starts to flatten, since a flat slope means that adding more components is less efficient from that point on.  
In this case, 11 components seem to be appropriate.  

In [None]:
# Cumulative share of variance captured by the first k principal components.
explained_variance = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance.cumsum()

fig, ax = plt.subplots(figsize=(8, 6))
component_counts = range(1, len(explained_variance) + 1)
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
ax.set_title('Explained Variance by Principal Components')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance')
# Horizontal reference line at 90% cumulative variance, with its label.
ax.axhline(y=0.9, color='r', linestyle='-')
ax.text(0.5, 0.85, '90% variance threshold', color='red', fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# Refit PCA with the 11 components chosen from the explained-variance curve.
scaler = StandardScaler()
pca = PCA(n_components=11)
# Restrict PCA to the continuous columns. The curve used to pick 11 components
# was computed on exactly these columns, and the binary flags are meant to be
# appended unchanged afterwards. Selecting every non-target column would also
# pull in the binary flags and the engineered px_dimension / sc_dimension.
x = train_df[continuous_features]
scaled_data = scaler.fit_transform(x)
printcipalComponents = pca.fit_transform(scaled_data)

In [None]:
# Wrap the component scores in a DataFrame with columns PCA1..PCAk. The number
# of names is taken from the array itself, so it always matches the fitted PCA.
pca_df = pd.DataFrame(
    printcipalComponents,
    columns=[f'PCA{i+1}' for i in range(printcipalComponents.shape[1])],
)

In [None]:
# First rows of the principal-component scores.
pca_df.head()

I trained models while switching between the two datasets: PCA and original. 

In [None]:
# Feature matrix: the binary flags plus six hand-picked numeric columns.
X = train_df[binary_features + ['ram', 'px_width','px_height','sc_h', 'sc_w', 'battery_power']]
# Alternative feature matrix: PCA components plus the binary flags.
# X = pd.concat([pca_df, train_df[binary_features]], axis=1)
y = train_df['price_range']
# First split: hold out a test set (train_test_split's default test fraction).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Second split: carve a validation set out of the remaining training data.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42)

In [None]:
# Shapes of the training and validation features and labels.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Re-attach each label column to its feature frame, giving one DataFrame per
# split with features and target side by side.
train, val, test = (
    pd.concat([features, labels], axis=1)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Build Pytorch model
I will build a deep learning model using PyTorch Tabular.
PyTorch Tabular makes the model-building procedure simpler.  

In [None]:
# Install pytorch_tabular quietly (-q).
# NOTE(review): the version is not pinned, so a later run may pick up a different release — consider pinning.
!pip install pytorch_tabular -q

The pytorch tabular package works with scikit-learn version 1.2.2.

In [None]:
# Pin scikit-learn to 1.2.2 quietly (-q).
# NOTE(review): sklearn has already been imported by earlier cells (train_test_split, PCA),
# so the pinned version presumably only takes effect after a kernel restart — confirm.
!pip install scikit-learn==1.2.2 -q

In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
)

In [None]:
# First rows of the combined training frame (features plus price_range).
train.head()

In [None]:
import warnings

# Install a global "ignore" filter with no category or message restriction,
# which silences warnings for the cells that follow.
warnings.filterwarnings("ignore")

# Hyper parameter tuning
I tested combinations of layers, dropout, and optimizers.  
The best combination reaches an accuracy of **94%**.  

In [None]:
# Column roles: one target, six numeric inputs, and the binary flags passed
# as categorical columns.
target = ['price_range']
continuous_cols = ['ram', 'px_width', 'px_height', 'sc_h', 'sc_w', 'battery_power']
# continuous_cols = [f'PCA{i+1}' for i in range(11)]  # alternative input: PCA components
data_config = DataConfig(
    target=target,
    continuous_cols=continuous_cols,
    categorical_cols=binary_features,
)

# Default optimizer settings; the optimizer itself is part of the search space.
optimizer_config = OptimizerConfig()

trainer_config = TrainerConfig(
    max_epochs=200,
    batch_size=64,
    auto_lr_find=True,
    # Early stopping: watch valid_loss (lower is better) with a patience of 5 epochs.
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    # Checkpoint on valid_loss and reload the best checkpoint after training.
    checkpoints="valid_loss",
    load_best=True,
)

# Base model settings; `layers` is one of the values varied by the search space.
model_config = CategoryEmbeddingModelConfig(
    task='classification',
    layers='1024-1024-512-256-128',
    activation='LeakyReLU',
    learning_rate=1e-6,
)

# Candidate values per setting: 3 layer layouts x 3 dropout rates x 2 optimizers.
search_space = {
    "model_config__layers": ["1024-512-512", "1024-512-256", "1024-512-128"],
    "model_config.head_config__dropout": [0.1, 0.2, 0.3],
    "optimizer_config__optimizer": ["RAdam", "AdamW"],
}

In [None]:
from pytorch_tabular.tabular_model_tuner import TabularModelTuner

In [None]:
# Grid-search the hyperparameter combinations defined in `search_space`.
tuner = TabularModelTuner(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    result = tuner.tune(
        train=train,
        # Select hyperparameters on the validation split. The test split is
        # used later for evaluate/predict, so tuning on it would leak test
        # information into model selection and inflate the reported score.
        validation=val,
        search_space=search_space,
        strategy="grid_search",
        # cv=5, # Uncomment this to do a 5 fold cross validation
        metric="accuracy",
        mode="max",
        progress_bar=True,
        verbose=False # Make True if you want to log metrics and params each iteration
    )

In [None]:
# All trials sorted by accuracy in ascending order, so the best trial is the last row.
result.trials_df.sort_values(by='accuracy')

In [None]:
# Report the best score found by the tuner and the parameters that produced it.
print("Best Score: ", result.best_score)
print(result.best_params)

In [None]:
# Keep a handle on the best model returned by the tuner; the evaluation and
# prediction cells below use it.
tabuler_model_best = result.best_model

In [None]:
# Display the tuned model. The bare name `tabuler_model` is never assigned in
# this notebook and raised NameError on a fresh run; the tuner's model is
# stored as `tabuler_model_best`.
tabuler_model_best

In [None]:
# Train a single model with the base (untuned) model_config, validating on
# the validation split. The evaluation and prediction cells below use
# `tabuler_model_best`, not this model.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)
tabular_model.fit(train=train, validation=val)

In [None]:
# Score the tuned model on the held-out test split. The metrics get their own
# name so the tuner output stored in `result` is not overwritten; re-running
# the earlier cells that read `result.trials_df` or `result.best_model` keeps working.
test_metrics = tabuler_model_best.evaluate(test)

In [None]:
# Predictions of the tuned model for the test split.
pred_df = tabuler_model_best.predict(test)

In [None]:
# Display the prediction frame.
pred_df

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 of the predictions against the true test labels.
report = classification_report(y_test, pred_df['prediction'])

Classification report of the best model.  
The F1 score for each class is more than 0.90,  
so the overall F1 score becomes 0.95.  
Recall is the more important metric in most cases, so it is notable that the recall for each class is also more than 0.90.  

In [None]:
# The report is a preformatted string, so print() keeps its column alignment.
print(report)

# History summary and conclusion

|Features                                                                      |Layer            |Accuracy |F1 score |
|------------------------------------------------------------------------------|-----------------|--------:|--------:|
|binary columns + ram, battery_power, px_dimension                             |1024-512         |0.92     |0.92     |
|binary columns + ram, battery_power, px_dimension                             |512-256-128-64   |0.92     |0.92     |
|binary columns + ram, px_width, px_height, sc_h, sc_w, battery_power|512-256-128-64|0.91|0.91|
|PCA4                                                                |512-256       |0.92|0.92|
|binary columns + ram, px_width, px_height, sc_h, sc_w, battery_power|1024-512-128|0.95|0.95     |

With the PyTorch Tabular package, I could test several architectures and compare the models.  
Since my dataset has only 2000 records and the classes are balanced, it is hard to drop any records.  
Additionally, I couldn't find outliers in the correlation matrix or the pairwise plot.  
Most features are uncorrelated with each other, so it was not easy to find patterns in the graphs.  

According to the classification report, if I could find or create a feature that helps to identify the pattern of class 2,  
the model could improve further.  
