In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.naive_bayes import GaussianNB

# Default (width, height) applied to every figure drawn in this notebook.
rcParams['figure.figsize'] = (18, 8)

Read the .csv files and take a look at the data. The target column is 'price_range'; let's plot its distribution.

In [None]:
# Load the train and test CSVs of the mobile-price-classification dataset.
train = pd.read_csv('../input/mobile-price-classification/train.csv')
test = pd.read_csv('../input/mobile-price-classification/test.csv')
print(train.head())

# 'price_range' is the classification target (assumed to hold a handful of
# integer class labels), so draw exactly one bar per label with discrete=True
# instead of letting histplot pick arbitrary continuous bins.
ax = sns.histplot(data=train, x="price_range", discrete=True)
# The trailing semicolon keeps the Text repr out of the cell output.
ax.set_title("Distribution of the target column 'price_range'");

As we can see, the classes are well balanced. Next we will check the number of NaN values and the column types in the dataset.

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the summary.
train.info()

No NaN values were found. Next we will plot the distributions of some features, for example the 6 features with the most unique values.

In [None]:
# Number of distinct values per column, in the frame's column order.
d = {name: train[name].nunique() for name in train.columns}
print(d)

# The six columns with the most distinct values; the sort is stable, so columns
# with equal counts keep their original relative order.
numerous_columns = sorted(d, key=d.get, reverse=True)[:6]
print(numerous_columns)

# One box plot per selected column on a 2x3 grid of axes.
f, axes = plt.subplots(2, 3)
for ax, column in zip(axes.flatten(), numerous_columns):
    sns.boxplot(x=train[column], ax=ax)



    


Now it's time to build the correlation matrix and look at the dependencies between features.

In [None]:
# Pairwise correlation between all columns of the training frame.
corr = train.corr()
# Pin the colour scale to the full correlation range [-1, 1] and apply it to the
# whole table (axis=None). With the defaults every column is scaled on its own,
# so the same colour would stand for different correlation values in different
# columns and zero would not sit at the midpoint of the diverging 'coolwarm' map.
corr.style.background_gradient(cmap='coolwarm', axis=None, vmin=-1, vmax=1)

The correlation between features is mostly low; a strong correlation (0.8 < corr < 1) exists only between the target column and 'ram'. Our dataset is pretty good, but I will try to make it a little better by removing outliers (hopefully not worse :)). For this purpose we will use z-score normalization.

In [None]:
# A row is treated as an outlier when any of its features lies at least this
# many standard deviations away from that feature's mean.
Z_THRESHOLD = 2.5

# Score the features only: the target is what we predict, so it should not take
# part in deciding which rows are outliers.
features = train.drop(columns=['price_range'])

# Run scipy on the raw array and re-attach the labels ourselves. scipy only
# documents an array-like return value for zscore, so whether DataFrame input
# keeps its column labels is not something to rely on, and the code below needs
# those labels.
z_scores = pd.DataFrame(
    stats.zscore(features.to_numpy(dtype=float), axis=0),
    index=features.index,
    columns=features.columns,
)
abs_z_scores = z_scores.abs()

# Per-feature summary of the absolute z-scores, to sanity-check the threshold.
for column in abs_z_scores.columns:
    column_scores = abs_z_scores[column]
    print(f"{column}  min:{column_scores.min()} median:{column_scores.median()} max:{column_scores.max()}")

# Keep only the rows in which every feature is within the threshold.
filtered_entries = (abs_z_scores < Z_THRESHOLD).all(axis=1)
new_train = train[filtered_entries]
y = new_train['price_range']
X = new_train.drop(['price_range'], axis=1)



Now we start training our ML models. We will use the ROC AUC score and the classification report as our main metrics for assessing model quality.

In [None]:
# Hold out 33% of the filtered data for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# (name, fitted model) pairs collected for the voting ensemble built later.
estimator = []

# Exhaustive search over tree depth and split/leaf sizes, using GridSearchCV's
# default cross-validation and scoring.
tree_grid = {
    'max_depth': range(1, 11),
    'min_samples_split': range(2, 11),
    'min_samples_leaf': range(1, 11),
}
tree_search = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_grid, n_jobs=-1)
tree_search.fit(X_train, y_train)
best_tree = tree_search.best_estimator_
estimator.append(('dtc', best_tree))

# Hard labels feed the classification report, class probabilities feed ROC AUC.
tree_predict = best_tree.predict(X_test)
tree_predict_roc = best_tree.predict_proba(X_test)

print(best_tree, end='\n\n')
print('roc_auc_score:', roc_auc_score(y_test, tree_predict_roc, multi_class="ovr", average="weighted"), end='\n\n')
print(classification_report(y_test, tree_predict))


In [None]:
# probability=True makes predict_proba available, which the ROC AUC computation
# below and the soft-voting ensemble both need.
svc = SVC(random_state=42, probability=True)
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf', 'poly', 'sigmoid']}
gcv = GridSearchCV(svc, param_grid, n_jobs=-1)
gcv.fit(X_train, y_train)
best_gcv = gcv.best_estimator_

# Replace any earlier 'svc' entry rather than blindly appending: re-running this
# cell would otherwise register the model twice, and VotingClassifier refuses
# estimator lists that contain duplicate names.
estimator = [entry for entry in estimator if entry[0] != 'svc']
estimator.append(('svc', best_gcv))

print(best_gcv)
print()
svm_predict_roc = best_gcv.predict_proba(X_test)
svm_predict = best_gcv.predict(X_test)
print('roc_auc_score:', roc_auc_score(y_test, svm_predict_roc, average="weighted", multi_class="ovr"))
print()
print(classification_report(y_test, svm_predict))


We could also try XGBClassifier, but this algorithm takes too long to run for multiclass classification.
[Read more!](https://towardsdatascience.com/xgboost-for-multi-class-classification-799d96bcd368)

In [None]:
# XGBoost is switched off by default because its grid search takes too long on
# this multiclass problem. The code used to be disabled by wrapping it in a
# triple-quoted string, which Jupyter echoes as the cell's output and which is
# never syntax-checked; an explicit flag keeps the code valid and the cell
# silent. Set RUN_XGB = True to include XGBoost.
RUN_XGB = False

if RUN_XGB:
    xgb = XGBClassifier(objective='multi:softprob', nthread=4, eval_metric='mlogloss', random_state=42)
    parameters = {'max_depth': range(2, 10, 1), 'n_estimators': range(100, 220, 40), 'learning_rate': [0.1, 0.01, 0.05]}
    gcv = GridSearchCV(xgb, parameters, n_jobs=-1)
    gcv.fit(X_train, y_train)
    best_gcv = gcv.best_estimator_

    # Replace any earlier 'xgb' entry so that re-running the cell cannot
    # register the model twice under the same name.
    estimator = [entry for entry in estimator if entry[0] != 'xgb']
    estimator.append(('xgb', best_gcv))

    print(best_gcv)
    print()
    xgb_predict_roc = best_gcv.predict_proba(X_test)
    xgb_predict = best_gcv.predict(X_test)
    print('roc_auc_score:', roc_auc_score(y_test, xgb_predict_roc, average="weighted", multi_class="ovr"))
    print()
    print(classification_report(y_test, xgb_predict))

In [None]:
# Tune GaussianNB's only hyperparameter over 100 log-spaced values from 1 down to 1e-9.
param_grid_nb = {'var_smoothing': np.logspace(0, -9, num=100)}
gcv = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, n_jobs=-1)
gcv.fit(X_train, y_train)
best_gcv = gcv.best_estimator_

# Replace any earlier 'gauss' entry rather than blindly appending: re-running
# this cell would otherwise register the model twice, and VotingClassifier
# refuses estimator lists that contain duplicate names.
estimator = [entry for entry in estimator if entry[0] != 'gauss']
estimator.append(('gauss', best_gcv))

print(best_gcv)
print()
gauss_predict_roc = best_gcv.predict_proba(X_test)
gauss_predict = best_gcv.predict(X_test)
print('roc_auc_score:', roc_auc_score(y_test, gauss_predict_roc, average="weighted", multi_class="ovr"))
print()
print(classification_report(y_test, gauss_predict))

We have trained three different ML models. To improve our results further, let's combine the models into an ensemble and get new predictions by voting.

In [None]:
# Two ensembles over the same base models: the hard-voting one supplies the
# class labels for the classification report, the soft-voting one supplies the
# class probabilities for ROC AUC.
vot_hard = VotingClassifier(estimators=estimator, voting='hard').fit(X_train, y_train)
vot_soft = VotingClassifier(estimators=estimator, voting='soft').fit(X_train, y_train)

y_pred = vot_hard.predict(X_test)
y_pred_roc = vot_soft.predict_proba(X_test)

print('roc_auc_score:', roc_auc_score(y_test, y_pred_roc, multi_class="ovr", average="weighted"), end='\n\n')
print(classification_report(y_test, y_pred))

Conclusion: the SVC model gives the best accuracy and the best ROC AUC score. When we tried to build an ensemble, our metrics became lower because of the weaker performance of the other algorithms.