### Variable Information

    pc	          Primary Camera mega pixels
    fc	          Front Camera mega pixels
    sc_h	        Screen Height of mobile in cm
    sc_w	        Screen Width of mobile in cm
    m_dep	       Mobile Depth in cm
    px_width	    Pixel Resolution Width
    px_height	   Pixel Resolution Height
    ram	         Random Access Memory in Mega Bytes
    int_memory	  Internal Memory in Giga Bytes
    four_g	      Has 4G or not
    three_g	     Has 3G or not
    dual_sim	    Has dual sim support or not
    battery_power   Total energy a battery can store in one time measured in mAh
    touch_screen	Has touch screen or not
    clock_speed	 Speed at which microprocessor executes instructions
    n_cores	     Number of cores of processor
    wifi	        Has wifi or not
    blue	        Has bluetooth or not
    mobile_wt	   Weight of mobile phone
    talk_time	   Longest time that a single battery charge will last when you are on a call
    price_range   This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).

## Contents of the notebook
        1. Importing libraries
        2. Data Exploration and simple visualisations
        3. Missing value/ data collection error check
        4. Variable skewness check and treatment if required
        5. Multicollinearity check
        6. Preparing list of models to train
        7. Create pipelines for data preprocessing
        8. Compare results of various classification algorithms
        9. Creating a submission file for test data
        10. Interpretation of model using SHAP

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, PolynomialFeatures
from category_encoders import WOEEncoder, BinaryEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
# Read the train and test files from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/mobile-price-classification/train.csv")
test = pd.read_csv("/kaggle/input/mobile-price-classification/test.csv")

# Submission skeleton: keep the test ids and seed price_range with a
# placeholder (2) that is overwritten once a model has produced predictions.
sub = test[["id"]].assign(price_range=2)

# Remove the id column from the test frame before it is passed to any model.
test = test.drop(columns="id")

print(f"train data :{train.shape} test data :{test.shape}")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data (the id column was dropped when the data was loaded).
test.head()

### Creating categorical and continuous variable list

In [None]:
# Columns treated as categorical (one-hot encoded later in the ColumnTransformer).
cat_var = [
    "blue",
    "dual_sim",
    "four_g",
    "three_g",
    "touch_screen",
    "wifi",
]

# Columns treated as continuous (standard-scaled later). The order matters:
# it determines the column order of the transformed feature matrix.
con_var = [
    'px_height', 'sc_h', 'sc_w', 'clock_speed',
    'battery_power', 'int_memory', 'talk_time', 'pc',
    'n_cores', 'px_width', 'fc', 'mobile_wt',
    'm_dep', 'ram',
]

In [None]:
def con_plot(var):
    """Plot the distribution of each listed column of ``train`` on a 3-wide grid.

    Every panel is drawn with ``sns.distplot`` plus a fitted normal curve
    (``fit=stats.norm``) and is titled with the column's skewness.

    Parameters
    ----------
    var : list of str
        Names of the ``train`` columns to plot.
    """
    n_cols = 3
    # Size the grid from the argument itself, not from the global con_var,
    # so the function works for any list of columns.
    n_rows = max(1, int(np.ceil(len(var) / n_cols)))
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 16))
    ax = ax.flatten()
    for axis, col in zip(ax, var):
        skew = train[col].skew()
        sns.distplot(train[col], fit=stats.norm, ax=axis)
        axis.set_title("Variable %s skew : %.4f" % (col, skew))
    # Hide the panels left over when len(var) is not a multiple of n_cols.
    for axis in ax[len(var):]:
        axis.set_visible(False)
    plt.tight_layout()
    plt.show()

con_plot(con_var)

In [None]:
def cat_plot(var):
    """Draw a count plot for each listed column of ``train`` on a 3-wide grid.

    Parameters
    ----------
    var : list of str
        Names of the ``train`` columns to plot.
    """
    n_cols = 3
    n_rows = max(1, int(np.ceil(len(var) / n_cols)))
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(16, 8))
    ax = ax.flatten()
    for axis, col in zip(ax, var):
        # Pass the column as x= explicitly: newer seaborn releases treat the
        # first positional argument as `data`, not as the x variable.
        sns.countplot(x=train[col], ax=axis)
        axis.set_title("devices in each category for %s" % (col))
    # Hide the panels left over when len(var) is not a multiple of n_cols.
    for axis in ax[len(var):]:
        axis.set_visible(False)
    plt.tight_layout()
    plt.show()

cat_plot(cat_var)

### Missing value check

In [None]:
# Number of missing values per column in the training data.
train.isnull().sum()

In [None]:
# Number of missing values per column in the test data.
test.isnull().sum()

### Device Count in each class

In [None]:
# Bar chart of how many devices fall into each price_range class.
class_counts = train["price_range"].value_counts()
class_counts.plot.bar()
plt.show()

In [None]:
# Skewness of every column of the training frame (the price_range target included).
train.skew()

In [None]:
# Pairwise scatter plots of all columns, coloured by price class, with
# histograms on the diagonal.
sns.pairplot(train, hue='price_range', diag_kind='hist')
plt.show()

    We can see from the pairplot that RAM and battery power can help in classification.

### Separating features and target

In [None]:
# Target: the price class. Features: every other column of train.
Y = train["price_range"]
X = train.drop(columns=["price_range"])

### Checking for multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# variance_inflation_factor regresses each column on the others exactly as
# given, without adding an intercept. Append a constant column so the
# auxiliary regressions include one; otherwise the VIFs of features with a
# non-zero mean are inflated.
X_vif = X.assign(const=1.0)

# The constant is the last column, so indices 0..X.shape[1]-1 still address
# the original features and the constant itself is not reported.
vif_val = pd.DataFrame({
    "Col": X.columns,
    "VIF": [vif(X_vif.values, i) for i in range(X.shape[1])],
})
vif_val

### Building list of models to be trained

In [None]:
# One estimator per algorithm, each seeded with random_state=1.
model_logr = LogisticRegression(multi_class='multinomial', n_jobs=-1, random_state=1)
model_rf = RandomForestClassifier(n_jobs=-1, random_state=1)
model_gbr = GradientBoostingClassifier(random_state=1)
model_xgb = XGBClassifier(n_jobs=-1, random_state=1)
model_lgbm = LGBMClassifier(n_jobs=-1, random_state=1)
model_cat = CatBoostClassifier(verbose=0, random_state=1)

# (short name, estimator) pairs, in the order they are evaluated and plotted.
models = [
    ('LR', model_logr),
    ('RF', model_rf),
    ('GBR', model_gbr),
    ('XGB', model_xgb),
    ('LGB', model_lgbm),
    ('CAT', model_cat),
]

### Preparing Pipeline Steps

In [None]:
# Stratified 5-fold splitter shared by every cross-validation run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Per-column-group preprocessing: one-hot for the categorical flags,
# standard scaling for the continuous variables.
onehot = OneHotEncoder(sparse=False, handle_unknown='ignore')
scaler = StandardScaler()

# Optional random-forest based feature selector.
feature = SelectFromModel(model_rf, threshold=0.001)

# Output column order is: one-hot columns first, then the scaled columns,
# then any remaining column passed through untouched.
ct = ColumnTransformer(
    transformers=[
        ('onehot', onehot, cat_var),
        ('scaler', scaler, con_var),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# Cross-validated accuracy of every candidate model; `results` and `names`
# feed the comparison boxplot.
results = []
names = []
for name, model in models:
    # Preprocessing is part of the pipeline, so it is re-fitted on the
    # training portion of every fold. To try RF-based feature selection,
    # insert ('fselect', feature) between 'ct' and the model step.
    pipe = Pipeline([('ct', ct), (name, model)])
    scores = cross_val_score(pipe, X, Y, scoring='accuracy', cv=cv, n_jobs=-1, verbose=0)
    names.append(name)
    results.append(scores)
    # np.std is a standard deviation, so label it as such (not "variance").
    print("model %s accuracy: %.4f std: %.4f"%(name, np.mean(scores), np.std(scores)))

In [None]:
# One box per model, showing the spread of its per-fold accuracies.
fig, ax = plt.subplots(figsize=(12, 5))
ax.boxplot(results)
ax.set_xticks(np.arange(1, len(names) + 1))
ax.set_xticklabels(names)
ax.set_title("Accuracy for different machine learning algorithms")
ax.set_xlabel("Model Name")
ax.set_ylabel("Cross val Accuracies")
plt.show()

### Training Logistic Regression to check performance and create a submission file

In [None]:
# Fit preprocessing + logistic regression on the full training set.
logr_pipe = Pipeline(steps=[
    ('ct', ct),
    ('LR', model_logr),
])
logr_pipe.fit(X, Y)

# Predictions on the same rows the pipeline was fitted on.
trainpred = logr_pipe.predict(X)

In [None]:
# Per-class precision/recall/F1 on the training data. trainpred was produced
# on the rows used for fitting, so this is not a held-out estimate.
train_report = classification_report(Y, trainpred)
print(train_report)

In [None]:
# Predict the price class for every row of the test set with the fitted pipeline.
prediction = logr_pipe.predict(test)

In [None]:
def submission(prediction, model):
    """Store ``prediction`` in the submission frame and write it to CSV.

    Updates the ``price_range`` column of the module-level ``sub`` frame and
    saves it as ``model_<model>_mobile_price.csv`` without the index.

    Parameters
    ----------
    prediction : array-like
        Predicted price class, one value per row of ``sub``.
    model : str
        Short model tag embedded in the output file name.

    Returns
    -------
    pandas.Series
        Number of predictions per price class.
    """
    sub["price_range"] = prediction
    sub.to_csv("model_"+model+"_mobile_price.csv", index=False)
    # Return the class distribution so the calling cell displays it as a
    # quick sanity check instead of computing and discarding it.
    return sub["price_range"].value_counts()

In [None]:
# Write the logistic-regression predictions to model_logr_mobile_price.csv.
submission(prediction, 'logr')

### Getting list of new features after transformation

In [None]:
# Categories learned by the fitted one-hot encoder, one array per cat_var column.
fitted_onehot = logr_pipe.named_steps['ct'].named_transformers_['onehot']
onehot_categories = fitted_onehot.categories_

# Name each one-hot column "<column>__<category>", in encoder output order.
onehot_features = []
for col, vals in zip(cat_var, onehot_categories):
    for val in vals:
        onehot_features.append(f"{col}__{val}")

# The ColumnTransformer emits the one-hot columns first, then the scaled ones.
all_features = onehot_features + con_var
print(all_features)

### creating a dataframe for the coefficients

In [None]:
# One row of coefficients per class, one column per transformed feature;
# transposed for display so that features run down the rows.
logr_fitted = logr_pipe.named_steps['LR']
coeff = pd.DataFrame(logr_fitted.coef_, columns=all_features)
coeff.transpose()

### Model interpretation using Shap

In [None]:
# shap provides the explainer and the plots used in the following cells.
import shap
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns",None)
# Initialise shap's JavaScript support for the notebook (used by its interactive plots).
shap.initjs()
# NOTE(review): xgboost and eli5 are not referenced in the visible cells — confirm they are still needed.
import xgboost
import eli5

### Linear Explainer for Logistic Regression

In [None]:
# Reuse the ColumnTransformer that was fitted inside logr_pipe rather than
# fitting `ct` again (the original fitted it twice in a row). This guarantees
# the explainer sees exactly the preprocessing the model was trained with.
fitted_ct = logr_pipe.named_steps['ct']
X_shap = fitted_ct.transform(X)          # background data for the explainer
test_shap = fitted_ct.transform(test)    # rows to be explained

explainer = shap.LinearExplainer(logr_pipe.named_steps['LR'], X_shap, feature_perturbation="interventional")
shap_values = explainer.shap_values(test_shap)

In [None]:
# Summary plot of the SHAP values over the transformed test set, labelled with
# the post-transformation feature names.
shap.summary_plot(shap_values, test_shap, feature_names=all_features)

### We can clearly see that only four variables strongly influence the class prediction, while the rest of the variables have almost no importance:
    - ram
    - battery power
    - px width
    - px height

In [None]:
# prediction class 2, shap values for class 2
row_idx, class_idx = 2, 2
shap.force_plot(
    explainer.expected_value[class_idx],
    shap_values[class_idx][row_idx],
    test_shap[row_idx],
    feature_names=all_features,
)

In [None]:
# prediction class 2, shap values for class 3
row_idx, class_idx = 2, 3
shap.force_plot(
    explainer.expected_value[class_idx],
    shap_values[class_idx][row_idx],
    test_shap[row_idx],
    feature_names=all_features,
)

In [None]:
# prediction class 0, shap values for class 0
row_idx, class_idx = 997, 0
shap.force_plot(
    explainer.expected_value[class_idx],
    shap_values[class_idx][row_idx],
    test_shap[row_idx],
    feature_names=all_features,
)

In [None]:
# prediction class 0, shap values for class 3
# test_shap holds the transformed matrix (one-hot columns first, then the
# scaled columns), so it must be labelled with all_features; the raw
# X.columns has a different length and order and would mislabel the plot.
shap.force_plot(explainer.expected_value[3], shap_values[3][997], test_shap[997], feature_names=all_features)

### We can see straight away that SHAP values are very effective for model interpretability. They explain each variable's contribution in an additive sense, which is easier to grasp, and also show which variables are influencing the decision.

### Thanks for viewing my work. If you like it, don't forget to upvote it.