In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import plotly.express as px
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# Read the train and test CSV files of the mobile-price-classification dataset.
df_train = pd.read_csv("../input/mobile-price-classification/train.csv")
df_test = pd.read_csv("../input/mobile-price-classification/test.csv")

# Exploratory Data Analysis

In [None]:
# A notebook cell only renders its LAST expression, so the two shapes are
# returned together as one tuple; on separate lines the train shape would be
# computed and silently discarded.
# Author's notes: train has 2000 records with 20 features;
# test has 1000 records, 20 features with ID.
# (This really reminds me of the markstrat game in business school)
df_train.shape, df_test.shape

In [None]:
# Histogram of battery power, with one colour per price range.
import plotly.express as px

fig = px.histogram(
    data_frame=df_train["battery_power"],
    color=df_train["price_range"],
    width=600,
    height=400,
)
# Pin the y-axis to the 0-80 range.
fig.update_layout(yaxis_range=[0, 80])
fig.show()
# Observation: hard to tell much, but class 3 does have larger battery power.

In [None]:
# Number of records for each (price range, bluetooth flag) combination.
df_blue = (
    df_train
    .groupby(["price_range", "blue"])["blue"]
    .count()
)
# Render the counts as a one-column table.
df_blue.to_frame()
# Observation: no obvious relationship between bluetooth and price range.

In [None]:
# Same idea for every feature at once: the mean of each column per price range.
df_cs = (
    df_train
    .groupby(by=["price_range"], as_index=False)
    .mean()
)
# Observations:
#   - Most values are really close; px_height and px_width are important features.
#   - Normalization is definitely necessary for related models.
df_cs

In [None]:
# Correlation plot: lower-triangle heatmap of the pairwise correlations
# between all columns of df_train (features and the price_range target).
from string import ascii_letters

sns.set(style="white")

# Seeded random state; not used by any code visible in this section of the notebook.
rs = np.random.RandomState(33)

# Compute the correlation matrix
corr = df_train.corr()

# Generate a mask for the upper triangle.
# The builtin `bool` replaces `np.bool`, which was deprecated in NumPy 1.20
# and removed in NumPy 1.24 (where it raises AttributeError).
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(12, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio.
# NOTE(review): vmax=.3 saturates the colour scale at 0.3, so every
# correlation above that value is drawn in the same colour -- check `corr`
# numerically before concluding anything about correlation strength.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

### No strong correlation was found.
### Weak correlations found between target and the following vars: battery power, px height and width, ram.


# Machine Learning Modeling

In [None]:
# Columns treated as numeric: every column except the target that takes
# at least three distinct values (binary flags are left out).
num_columns = [
    col
    for col in df_train.columns
    if col != "price_range" and df_train[col].nunique() >= 3
]

In [None]:
import warnings
warnings.filterwarnings("ignore")

from scipy import stats
from sklearn.model_selection import train_test_split

# 20 features with 2000 records: a multi-class classification problem with low collinearity.
# The suitable models that came to mind are RF, Logistic Regression, XGBoost, SVM and GBM.

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import plot_tree, plot_importance

#CrossValidation and Metrics
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Standardise the numeric columns in place; later cells read the scaled df_train.
# NOTE(review): the z-scores are computed on the full dataset, i.e. before the
# train/test split below, so the held-out rows influence the scaling. Kept as
# is because downstream cells depend on this mutation of df_train.
df_train[num_columns] = stats.zscore(df_train[num_columns])
X = df_train.drop("price_range", axis=1)
y = df_train["price_range"]
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=42)

# Per-model scores, appended in the iteration order of `models`.
cro_val_acc, train_scores, test_scores = [], [], []

# Candidate models. Stochastic estimators get a fixed seed so that a
# Restart & Run All reproduces the same scores.
models = {
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(),
    "XGBoost": XGBClassifier(verbosity=0, random_state=42),
    "SVM": SVC(),
    "GradientBoostingClassifer": GradientBoostingClassifier(random_state=42),
}
# Labels used by the plotting/reporting cells; derived from the dict so the
# labels can never drift out of sync with the order of the score lists.
model_names = list(models)

# One seeded splitter shared by all models: every model is evaluated on the
# same folds, and the folds are reproducible across runs.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(name)
    print("\n")

    # 5-fold cross-validated accuracy.
    # NOTE(review): this runs on the full X/y, which includes the rows of
    # X_test, so the CV score is not independent of the test score.
    cv_scores = cross_val_score(model, X, y, scoring="accuracy", cv=cv)
    cv_mean_score = np.mean(cv_scores)
    print(name, "  ", "cross validation accuracy", cv_mean_score)
    cro_val_acc.append(cv_mean_score)

    # Fit on the training split and record train/test accuracy.
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    train_scores.append(train_score)
    print(f"Train Score:{train_score*100}")
    print("\n")

    test_score = model.score(X_test, y_test)
    print(f"Test Score:{test_score*100}")
    print("\n")
    test_scores.append(test_score)

    # confusion_matrix expects (y_true, y_pred): rows are the true classes and
    # columns the predicted ones. The arguments were previously swapped, which
    # printed the transposed matrix.
    y_pred = model.predict(X_test)
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(conf_matrix)
    print("\n")

    cla_report = classification_report(y_test, y_pred, output_dict=True)
    print(pd.DataFrame(cla_report).transpose())
    print("\n")

# Model Performance Analysis

#### The tree-based models all show overfitting problems

In [None]:
import seaborn as sns

# Grouped bar chart: train, test and cross-validation accuracy for each model.
plt.figure(figsize=(16,8))
sns.set_style('darkgrid')
plt.title('Model Performance', fontweight='bold', size=20)

bar_width = 0.20

# (scores, bar colour, legend label) for each of the three bar groups.
bar_groups = [
    (train_scores, 'blue', 'train'),
    (test_scores, 'red', 'test'),
    (cro_val_acc, 'grey', 'cv_accuracy'),
]

# Start at 0, 1, 2, ... and shift right by one bar width after each group.
positions = np.arange(len(train_scores))
for scores, colour, label in bar_groups:
    plt.bar(positions, scores, color=colour, width=bar_width, edgecolor='white', label=label, capsize=18)
    positions = positions + bar_width

# Zoom in on the 0.8-1.0 range of the score axis.
plt.ylim([0.8,1])

plt.xlabel('Models', fontweight='bold', size = 20)
plt.ylabel('Scores', fontweight='bold', size = 20)
# Put each model name under the middle bar of its group.
plt.xticks(np.arange(len(train_scores)) + bar_width, model_names)

plt.legend()
plt.show()

In [None]:
# Print each model's test accuracy, rounded to three decimals.
for idx, name in enumerate(model_names):
    header = f'Accuracy {name}'
    print(header.ljust(80, ' '))
    print(round(test_scores[idx],3))

Logistic Regression performs the best.
The rest of the models show overfitting problems.

# Hyperparameter Tuning for XGBoost

In [None]:
# Give XGBoost another try, this time with hyperparameter tuning.
from sklearn.model_selection import GridSearchCV

# Search space: every combination of these values is tried.
params = dict(
    learning_rate=[0.02, 0.1],
    max_depth=[5, 8, 11],
    colsample_bytree=[0.7],
    n_estimators=[500, 800, 1000],
    objective=['multi:softmax'],
)

xgb_model = XGBClassifier(verbosity=0)

# 5-fold grid search on the training split, scored by accuracy.
clf = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    scoring="accuracy",
)
clf.fit(X_train, y_train)
clf.best_params_

In [None]:
# Refit XGBoost with a fixed set of hyperparameters and score it on the test split.
tuned_params = {
    "colsample_bytree": 0.7,
    "learning_rate": 0.1,
    "max_depth": 5,
    "n_estimators": 1000,
    "verbosity": 0,
}
xgb_model = XGBClassifier(**tuned_params)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)
# Observation: not much improvement.

In [None]:
from xgboost import plot_importance

# Plot the importance of the top 10 features of the refitted XGBoost model.
plot_importance(booster=xgb_model, max_num_features=10)

# AutoML comparison

In [None]:
# Shut down any H2O cluster left over from a previous run of this notebook.
# The import lives here because this cell runs before the cell that imports
# h2o; without it a fresh kernel (Restart & Run All) fails with a NameError.
import h2o

# h2o.cluster() yields None when no connection has been opened in this
# session, so only call shutdown() when there is a cluster to shut down.
_stale_cluster = h2o.cluster()
if _stale_cluster is not None:
    _stale_cluster.shutdown()

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Start (or connect to) a local H2O cluster and upload the training data.
# Note: df_train was z-scored in place by the modeling cell, so H2O receives
# the scaled values.
h2o.init()
h2o_df = h2o.H2OFrame(df_train)
h2o_df["price_range"] = h2o_df["price_range"].asfactor()

# 70/30 split. The seed makes the split reproducible across runs (it was
# previously unseeded, so the evaluation rows changed on every run).
train, test = h2o_df.split_frame(ratios=[.7], seed=1)

# Identify predictors and response.
# NOTE: `y` now holds the response column NAME and shadows the pandas Series
# `y` created in the modeling cell; re-run that cell before reusing it there.
y = "price_range"
x = [col for col in train.columns if col != y]

In [None]:
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

aml = H2OAutoML(max_runtime_secs=600,
                exclude_algos=['DeepLearning'],
                seed=1,
                stopping_metric='mean_per_class_error',
                sort_metric='mean_per_class_error',
                project_name='Price_Range_Prediction'
)

%time aml.train(x=x, y=y, training_frame=train)
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(5) 


In case an H2O bug comes up, I copied the results from a previous run here.

StackedEnsemble_BestOfFamily_AutoML_20210506_034057	0.0476932
GLM_1_AutoML_20210506_034057	0.0484978	0.185706	0.220634	0.0486792	nan	nan


StackedEnsemble_AllModels	0.0906175	0.383616	0.340883	0.116201	nan	nan

GBM_grid__1_1	0.0954304	0.242366	0.267777	0.0717045	nan	nan

XGBoost_grid__1_0.0962082	0.238954	0.267441	0.0715246	nan	nan

	0	1	2	3	Error	Rate
    
0  	147.0	3.0	0.0	0.0	0.020000	3 / 150

1  	7.0	148.0	1.0	0.0	0.051282	8 / 156

2  	0.0	4.0	133.0	6.0	0.069930	10 / 143

3  	0.0	0.0	0.0	148.0	0.000000	0 / 148

4  	154.0	155.0	134.0	154.0	0.035176	21 / 597

In [None]:
# Fetch the best AutoML model by id and evaluate it on the held-out H2O frame.
model_id = aml.leader.model_id
model = h2o.get_model(model_id=model_id)
perf = model.model_performance(test_data=test)
# Confusion matrix of the leader on the held-out frame.
perf.confusion_matrix()