In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show which files are available in the dataset's input directory.
print(os.listdir("../input/mobile-price-classification"))

In [None]:
# Read the labelled training table and print its column dtypes / non-null counts.
TRAIN_CSV = "../input/mobile-price-classification/train.csv"
data = pd.read_csv(TRAIN_CSV)
data.info()

In [None]:
# Number of columns in the training table (features plus the label).
len(data.columns)

In [None]:
# Largest per-column count of missing values; 0 means no column has any nulls.
data.isna().sum().max()

In [None]:
# Preview the first rows of the training table.
data.head()

In [None]:
# Summary statistics and the distinct values of the target column.
price_range = data['price_range']
price_range.describe(), price_range.unique()

## EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between every column, including the price_range label.
corr = data.corr()

# Render the correlation matrix as a heatmap on a 15x12-inch figure.
fig, heatmap_ax = plt.subplots(figsize=(15, 12))
r = sns.heatmap(corr, ax=heatmap_ax)

The correlation matrix is almost diagonal, meaning there is very low inter-feature correlation.

In [None]:
# Correlation of every column with the price_range label, strongest first.
# The correlation matrix is symmetric, so the price_range column holds the same
# values as the price_range row.
corr["price_range"].sort_values(ascending=False)

There is a high correlation between `ram` and `price_range`, which suggests that RAM strongly affects the price.

### Plotting columns vs label for top 2

In [None]:
def plot_dist_(col1, col2):
    """Scatter-plot two columns of the global ``data`` table on a new figure.

    Parameters
    ----------
    col1 : str
        Column drawn on the x-axis; its values also drive the point colours.
    col2 : str
        Column drawn on the y-axis.

    Both axes are padded by 0.5 beyond the observed min/max, labelled with the
    title-cased column names, and drawn without tick marks.
    """
    xs = data[col1]
    ys = data[col2]
    pad = 0.5

    plt.figure()

    # One point per row, coloured by the x-axis column.
    plt.scatter(xs, ys, c=xs, cmap=plt.cm.Set1, edgecolor='k')
    plt.xlabel(col1.title())
    plt.ylabel(col2.title())

    # Leave a small margin around the data and hide the tick marks.
    plt.xlim(xs.min() - pad, xs.max() + pad)
    plt.ylim(ys.min() - pad, ys.max() + pad)
    plt.xticks(())
    plt.yticks(())

In [None]:
# price_range on the x-axis (also the colour), ram on the y-axis.
plot_dist_("price_range", "ram")

In [None]:
# price_range on the x-axis (also the colour), battery_power on the y-axis.
plot_dist_("price_range", "battery_power")

## Preprocessing

In [None]:
# Feature table: every column except the label.
x_data = data.drop(columns=["price_range"])
# Label vector as a NumPy array.
y = data["price_range"].to_numpy()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column with a min-max scaler.
# The scaler is fit here on every row of x_data, i.e. on the whole training CSV.
scaler = MinMaxScaler()
x_t = scaler.fit_transform(x_data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the raw (unscaled) features first. Fitting the scaler on the full table
# before splitting would leak the test rows' min/max into the training features.
# The same random_state and row count give the same row assignment as splitting
# the pre-scaled array.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x_data, y, test_size=0.2, random_state=1
)

# Fit the scaler on the training rows only, then apply that same mapping to both
# parts. Test values may therefore fall slightly outside the training range.
scaler = MinMaxScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

Split the data into train and test in 4:1 ratio

In [None]:
# Distinct labels in the training split and how many rows carry each one.
np.unique(y_train, return_counts=True)

In [None]:
# Distinct labels in the test split and how many rows carry each one.
np.unique(y_test, return_counts=True)

The class distribution is balanced in both the train and test sets.

## SVM

In [None]:
from sklearn.svm import SVC

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Provide a drop-in replacement built on ConfusionMatrixDisplay so the rest
    # of the notebook keeps working on newer versions.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of ``estimator`` evaluated on ``(X, y_true)``.

        Thin wrapper around ``ConfusionMatrixDisplay.from_estimator``; extra
        keyword arguments are forwarded unchanged. Returns the display object.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Exhaustive sweep over SVM kernel, gamma and C.
#
# For each kernel (and, for non-linear kernels, each gamma) C is swept over
# 2**1 .. 2**15. Results are stored in `scores`:
#   scores['linear']      -> list of test scores, one per C tried
#   scores[other_kernel]  -> dict {gamma: list of test scores, one per C tried}
# The best estimator seen so far is kept in `model`, together with
# best_score / best_C / best_K / best_G (best_G is None for the linear kernel).
#
# NOTE(review): candidates are ranked by accuracy on (x_test, y_test), so the
# winning configuration is selected on the test split and best_score is an
# optimistic estimate. Consider selecting on a validation split or with
# cross-validation on the training data instead.
possible_K = ['linear', 'poly', 'rbf', 'sigmoid']
possible_G = [.01,.05,.1,.25,.50,0.75,1]
model = None
best_score = 0
best_C = 0
best_K = None
scores = {}
best_G = None

for K in possible_K:
    is_linear = K == 'linear'
    K_scores = {}
    # The linear kernel has no gamma, so it runs a single sweep with G=None.
    for G in ([None] if is_linear else possible_G):
        svc_kwargs = {} if G is None else {'gamma': G}
        G_scores = []
        counter = 0
        for pc in range(1, 16):  # C = 2, 4, ..., 2**15
            C = 2**pc
            # probability=True is required for predict_proba on the fitted estimator.
            clf = SVC(kernel=K, C=C, probability=True, **svc_kwargs).fit(x_train, y_train)
            scoretest = clf.score(x_test, y_test)
            G_scores.append(scoretest)
            if scoretest > best_score:
                best_score = scoretest
                best_C = C
                model = clf
                best_K = K
                best_G = G
            # Early stopper: once three scores exist, every step that fails to
            # beat the previous score counts as a stall; the fourth stall ends
            # this sweep. The stall counter is never reset by an improvement.
            if len(G_scores) > 2 and scoretest <= G_scores[-2]:
                if counter > 2:
                    break
                counter += 1
        K_scores[G] = G_scores
    # Keep the original result shapes: a flat list for linear, a dict otherwise.
    scores[K] = K_scores[None] if is_linear else K_scores

if best_K=='linear':
    print("Best SVM at C:{} with K:{} -> Test Score: {:2f} \n".format(best_C,best_K,best_score))
else:
    print("Best SVM at C:{} with K:{} and G:{} -> Test Score: {:2f} \n".format(best_C,best_K,best_G,best_score))

In [None]:
# Test score of the linear kernel at each step of the C sweep.
linear_scores = scores['linear']
plt.plot(range(len(linear_scores)), linear_scores)
plt.title('Linear')
plt.show()

In [None]:
# One figure per (non-linear kernel, gamma) pair: test score at each step of the C sweep.
for K in possible_K:
    if K != 'linear':
        for G, gamma_scores in scores[K].items():
            plt.plot(range(len(gamma_scores)), gamma_scores)
            plt.title(f"{K.title()}, Gamma :- {G}")
            plt.show()

In [None]:
# Confusion matrix of the selected best estimator on the test split.
plot_confusion_matrix(model, x_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score,StratifiedKFold

# 5-fold cross-validation of the selected estimator, using the training split only.
# Note: this rebinds `scores`, replacing the dict of sweep scores built earlier.
scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=5)
scores

High accuracy across all classes

In [None]:
# Average accuracy over the cross-validation folds.
np.mean(scores)

In [None]:
# Predicted labels of the selected best estimator for the test split.
y_pred=model.predict(x_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
# Display the true labels of the test split for comparison with y_pred.
y_test

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# One-vs-rest ROC AUC of the selected best estimator (`model`).
# `clf` must not be used here: it is rebound on every iteration of the sweep and
# therefore ends up holding the last estimator fitted, not the best one.
roc_auc_score(y_test, model.predict_proba(x_test), multi_class='ovr')

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score reported separately for each class (average=None disables averaging).
f1_score(y_true=y_test, y_pred=y_pred, average=None)

## Testing output

In [None]:
# Read the unlabelled test table and print its column dtypes / non-null counts.
TEST_CSV = "../input/mobile-price-classification/test.csv"
test_data = pd.read_csv(TEST_CSV)
test_data.info()

In [None]:
# Keep the row identifiers aside and drop them from the feature table.
ids = test_data["id"].values
x_test_data = test_data.drop(["id"], axis=1)
# Apply the already-fitted scaler without refitting it. Calling fit_transform
# here would recompute min/max from the test file, so the features would no
# longer be on the scale the model was trained on.
x_test_t = scaler.transform(x_test_data)

In [None]:
# Predicted price_range class for every row of the test file.
y_pred_t=model.predict(x_test_t)

In [None]:
# Distinct predicted labels for the test file and how often each was predicted.
np.unique(y_pred_t, return_counts=True)

In [None]:
# Write one row per test id with its predicted class.
# The columns are named after this dataset's own fields (`id`, `price_range`);
# the previous 'Id' / 'SalePrice' headers mislabelled class predictions as sale prices.
my_submission = pd.DataFrame({'id': ids, 'price_range': y_pred_t})
my_submission.to_csv('submission.csv', index=False)