In [None]:
# Aided by the article at https://www.datatechnotes.com/2020/07/classification-example-with-linearsvm-in-python.html

# TODO:
# Import required libraries/packages
# Get the training and testing data 
#   Get Dataset paths
#   Read data into a dataframe
#   Find target
#   Remove rows with missing targets
#   Separate predictors from target
# Preprocess the data
#   Check for categorical data -> drop or OneHotEncode/Label them
#   Check for missing numerical values -> drop or Impute them
# Set apart validation set from training data
# Inspect data
# Train model
# Test model/make predictions

# Compare with other models

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Locations of the train/test CSVs inside the Kaggle input directory.
train_file_path, test_file_path = (
    f'../input/mobile-price-classification/{split_name}.csv'
    for split_name in ('train', 'test')
)

In [None]:
# Read the training CSV first, then the test CSV, each into its own DataFrame.
X_full, X_test_full = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

In [None]:
# List the column labels of the training frame to identify the label column.
print(X_full.columns)

# price_range is the target

In [None]:
# Discard rows whose target (price_range) is missing; X_full is modified in place.
X_full.dropna(axis=0, subset=['price_range'], inplace=True)

In [None]:
# Keep the label column as y, then remove it from X_full in place so that only
# predictors remain in the frame.
target_column = 'price_range'
y = X_full[target_column]
X_full.drop(columns=[target_column], inplace=True)

In [None]:
# Shorter aliases for the predictor frames (same objects, no copy is made).
X, X_test = X_full, X_test_full

In [None]:
# Hold out 20% of the rows for validation. stratify=y keeps the class
# proportions of y in both parts, which matters when classes are imbalanced.
split_options = dict(train_size=0.8, test_size=0.2, random_state=0, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
# Print summary statistics, the first rows and the shape of the predictors,
# each followed by a divider line, then show the class counts of the target.
divider = '-----------------------------------------------------------------'
for overview in (X.describe(), X.head(), X.shape):
    print(overview)
    print(divider)
y.value_counts()

In [None]:
# All parameters are default except dual was set to False to avoid a converging error.
# A possible alternative could be to scale or normalize the training data.
lsvc = LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                 intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                 multi_class='ovr', random_state=None, tol=0.0001,
                 verbose=0)

# Train the model and see its score.
# NOTE: this first score is the mean accuracy on the rows the model was fitted
# on, so it is an optimistic number.
lsvc.fit(X_train, y_train)
score = lsvc.score(X_train, y_train)
print(f'Score: {score}')

# Mean accuracy on the held-out validation rows, which the model has not seen.
valid_score = lsvc.score(X_valid, y_valid)
print(f'Validation Score: {valid_score}')

In [None]:
# 10-fold cross validation of the LinearSVC on the training split; report the
# mean of the per-fold scores.
cv_scores = cross_val_score(estimator=lsvc, X=X_train, y=y_train, cv=10)
average_cv_score = cv_scores.mean()
print(f'Average Cross Validation Score: {average_cv_score}')

In [None]:
# Meaning of the predicted price_range values:
# 0 is low cost
# 1 is medium cost
# 2 is high cost
# 3 is very high cost

# Categorize an average spec smart phone cost:
# battery (mAh) = 1239
# has bluetooth or not (1 or 0) = 0
# microprocessor clock speed (GHz) = 1.5
# has dual sim support or not (1 or 0) = 1
# front camera Mpx = 4
# has 4G or not (1 or 0) = 1
# internal memory (Gbytes) = 32
# mobile depth in cm = 0.5
# weight of mobile phone (grams) = 140
# number of cores = 5
# primary camera Mpx = 10
# Pixel Resolution Height = 645
# Pixel Resolution Width  = 1251
# RAM (Mbytes) = 2124
# Screen Height of mobile in cm = 12.3
# Screen Width of mobile in cm = 5.8
# longest time that a single battery charge will last when you are talking (imputed) = 11
# Has 3G or not (1 or 0) = 1
# Has touch screen or not (1 or 0) = 1
# Has wifi or no (1 or 0) = 1


# The values are listed in the same order as the spec list above, which must
# also be the column order of the training data.
average_sample = np.array([1239, 0, 1.5, 1, 4, 1, 32, 0.5, 140, 5, 10, 645, 1251, 2124, 12.3, 5.8, 11, 1, 1, 1])

# predict() needs a 2-D input, so the single sample becomes one row. The row is
# labelled with the training columns because the model was fitted on a
# DataFrame; passing a bare array makes scikit-learn warn about missing
# feature names.
average_sample_df = pd.DataFrame(average_sample.reshape(1, -1), columns=X_train.columns)

cost_category = lsvc.predict(average_sample_df)
print(f'Cost Category of a smart phone with average specs: {cost_category}')

# Conclusion
# I think this dataset is simply outdated. Even if you were to use sample specs in the
# prediction from 2017 (the year of the last dataset update), the model will still
# be a little inaccurate because the dataset is not representative. For instance,
# the max battery capacity in the data was 1999 (mAh), which is well below the battery
# capacity of very costly phones at that time. But then again, nowhere does it explain when
# the data was recorded

In [None]:
# Aided by the article at https://medium.com/analytics-vidhya/evaluating-a-random-forest-model-9d165595ad56
# Second model: a RandomForestClassifier fitted on the training split and
# scored by accuracy on the validation split.
rfc = RandomForestClassifier(random_state=0).fit(X_train, y_train)
y_pred = rfc.predict(X_valid)
score = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f'Score: {score}')


In [None]:
# Accuracy alone is not a great measure of classifier performance when the
# classes are imbalanced, so also view the confusion matrix for the validation
# labels and the predictions.
# The diagonal holds the number of correctly predicted instances of each class
# (col 1 is class 0, col 2 -> class 1, etc).
confusion_matrix(y_true=y_valid, y_pred=y_pred)

In [None]:
# Together with the confusion matrix, the classification report shows how well
# each class is predicted and which classes the model struggles to identify.
rfc_report = classification_report(y_true=y_valid, y_pred=y_pred)
print(rfc_report)

In [None]:
# Lets compare the above models with Extreme Gradient Boosting
from xgboost import XGBClassifier

# early_stopping_rounds is set on the estimator rather than passed to fit():
# the fit() keyword was deprecated and is rejected by XGBoost 2.x.
# Boosting stops once the metric on eval_set has not improved for 5 rounds.
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=4, random_state=0,
                    early_stopping_rounds=5)
xgb.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False)


In [None]:
# Accuracy of the XGBoost model on the validation split.
y_preds = xgb.predict(X_valid)
score = accuracy_score(y_true=y_valid, y_pred=y_preds)
print(f'Model Accuracy Score: {score}')

In [None]:
# Confusion matrix for the XGBoost predictions (y_preds). y_pred holds the
# RandomForest predictions and would just repeat the earlier matrix.
confusion_matrix(y_valid, y_preds)

In [None]:
# Per-class report for the XGBoost predictions (y_preds), not the RandomForest
# predictions stored in y_pred.
print(classification_report(y_valid, y_preds))

In [None]:
# Now lets build a vanilla Neural Network in PyTorch
# Aided by the guides:
#   https://curiousily.com/posts/build-your-first-neural-network-with-pytorch/
#   https://www.youtube.com/watch?v=Jy4wM2X21u0&t=185s

# Objectives:
#   Preprocess CSV files and convert the data to Tensors
#   Build a Neural Network model with PyTorch
#   Use a loss function and an optimizer to train the model
#   Evaluate the model

import torch
import torch.nn as nn
import torch.optim as optim                 # optimization algorithms like gradient descent
import torch.nn.functional as F             # for activation funtions
import torchvision.transforms as transforms # transformations to perform on dataset

from sklearn.preprocessing import StandardScaler

# Seed the NumPy and PyTorch global random number generators with one shared
# value; the same value is later passed to train_test_split as random_state.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Create fully connected Neural Network
class NN(nn.Module):
    """Fully connected classifier with two hidden ReLU layers (25 and 30 units).

    Args:
        n_features: Number of input features per sample.
        num_classes: Number of output classes (size of the last layer).
    """

    def __init__(self, n_features, num_classes):
        super().__init__()
        # input -> 25 -> 30 -> num_classes
        self.fc1 = nn.Linear(n_features, 25)
        self.fc2 = nn.Linear(25, 30)
        self.fc3 = nn.Linear(30, num_classes)

    def forward(self, x):
        """Return raw class scores (logits) with num_classes values per sample.

        No activation is applied to the last layer: nn.CrossEntropyLoss applies
        log-softmax itself, so squashing the scores first (e.g. with a sigmoid)
        distorts the loss and slows learning. The predicted class is the argmax
        of the returned scores.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
    
# test it out

# model = NN(20, 4)
# x = torch.randn(100, 20)
# print(model(x).shape)

In [None]:
# Load the data
train_file_path = '../input/mobile-price-classification/train.csv'
test_file_path = '../input/mobile-price-classification/test.csv'

# Re-read the training CSV, drop rows without a target, and split the target
# column off from the predictors (X_full is modified in place).
X_full = pd.read_csv(train_file_path)
X_full.dropna(axis=0, subset=['price_range'], inplace=True)
y = X_full['price_range']
X_full.drop(columns=['price_range'], inplace=True)
X = X_full

# 80/20 split; stratify=y keeps the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=seed, stratify=y
)

# Standardize after the split: the scaler is fitted on the training part only
# and then applied unchanged to the held-out part.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))

# Convert to tensors: features as float32, labels as int64 class indices.
X_train, X_test = (
    torch.from_numpy(features.to_numpy()).float() for features in (X_train, X_test)
)
y_train, y_test = (
    torch.from_numpy(labels.to_numpy()).long().squeeze() for labels in (y_train, y_test)
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Hyper-parameters
n_features = X_train.shape[1]  # size of the second dimension of X_train (one entry per feature)
num_classes = 4                # the target takes four values, 0 to 3
learning_rate = 0.009          # passed to the Adam optimizer as lr
num_epochs = 1200              # number of passes of the training loop

In [None]:
# Build the network from the hyper-parameters chosen above.
model = NN(n_features, num_classes)

In [None]:
# Cross-entropy loss for the multi-class target, optimized with Adam over all
# of the model's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Use the GPU when one is available, otherwise the CPU, and move the data
# tensors, the model and the loss onto that device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X_train, y_train, X_test, y_test = (
    tensor.to(device) for tensor in (X_train, y_train, X_test, y_test)
)

model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Train the model: one full-batch update per epoch.
for epoch in range(num_epochs):
    # clear the gradients left over from the previous step
    optimizer.zero_grad()

    # forward pass and loss on the whole training set
    y_pred = model(X_train)
    train_loss = criterion(y_pred, y_train)

    # backward pass, then the Adam parameter update
    train_loss.backward()
    optimizer.step()

In [None]:
# Check accuracy on training and test
def check_accuracy(X_train, y_train, model):
    """Print how many samples ``model`` classifies correctly and the accuracy in percent.

    Despite the parameter names, any feature/label pair can be passed; the
    notebook calls this for both the training and the held-out split.

    Args:
        X_train: Feature tensor passed to ``model``.
        y_train: Tensor of integer class labels, one per row of ``X_train``.
        model: A ``torch.nn.Module`` returning one score per class for each row.

    Returns:
        None. The result is printed.

    Raises:
        ZeroDivisionError: If ``X_train`` contains no samples.
    """
    # Remember the caller's mode so evaluation does not silently switch a model
    # that was in eval mode back to training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            scores = model(X_train)
            # the predicted class is the index of the highest score in each row
            _, predictions = scores.max(1)
            num_correct = (predictions == y_train).sum().item()
            num_samples = predictions.size(0)
        print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')
    finally:
        # restore the original mode even if the forward pass raised
        model.train(was_training)


# Report accuracy on the training tensors first, then on the held-out tensors.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    check_accuracy(features, labels, model)