In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import plotly.express as px
%matplotlib inline
import seaborn as sns
import time
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training data and preview its first rows.
TRAIN_CSV = "../input/mobile-price-classification/train.csv"
data_train = pd.read_csv(TRAIN_CSV)
data_train.head()

##   **This line of code lets us inspect our data: the distribution of each variable and whether the dataset has any missing values.**

In [None]:
# Build the pandas-profiling report for the training data; as the last
# expression of the cell it is rendered inline.
ProfileReport(data_train)

## **Descriptive statistics** 

In [None]:
# Summary statistics per column (one row per variable), drawn as an annotated heatmap.
stat_rows = ['battery_power', 'blue', 'clock_speed', 'dual_sim',
             'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt',
             'n_cores', 'pc', 'px_height', 'px_width', 'ram',
             'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen',
             'wifi', 'price_range']
stat_cols = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]

desc = data_train.describe().T
# Align the summary table to an explicit row/column order.
df1 = desc.reindex(index=stat_rows, columns=stat_cols)

f, ax = plt.subplots(figsize=(10, 10))

heatmap_style = dict(
    annot=True,
    cmap="rocket_r",
    fmt='.0f',
    linewidths=3,
    linecolor='cornflowerblue',
    cbar=False,
    annot_kws={"size": 16},
)
sns.heatmap(df1, ax=ax, **heatmap_style)
plt.xticks(size=18)
plt.yticks(size=12, rotation=0)
ax.set_ylabel("Variables", fontsize=20)
ax.set_title("Descriptive Statistics", fontsize=20)
plt.show()

## **Visualization of binary features**

In [None]:
# One bar chart per binary feature, showing how many phones have value 0 vs 1.
# Each entry is (column, panel title, bar colour); None keeps matplotlib's
# default bar colour.
binary_panels = [
    ('blue', 'Bluetooth', None),
    ('dual_sim', 'dual_sim', 'black'),
    ('four_g', '4G', 'red'),
    ('three_g', '3G', 'green'),
    ('touch_screen', 'touch_screen', 'purple'),
    ('wifi', 'WiFi', None),
]

index = [0, 1]
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(10, 10))

for panel_ax, (column, panel_title, bar_color) in zip(axes.flat, binary_panels):
    # Vectorised count of each level (instead of a Python-level sum over the Series).
    values = [int((data_train[column] == level).sum()) for level in index]
    panel_ax.bar(index, values, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=20)
    # Only integer ticks make sense for a 0/1 feature.
    panel_ax.xaxis.set_major_locator(MaxNLocator(integer=True))

fig.tight_layout()
plt.show()

## **Visualization of other variables**

In [None]:
# Histogram of the `battery_power` column of the training data.
hist_layout = dict(
    xaxis_title_text='battery_power',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='battery_power',
    title='Battery_power',
    color_discrete_sequence=['royalblue'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `int_memory` column of the training data.
hist_layout = dict(
    xaxis_title_text='Internal Memory in Gigabytes',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='int_memory',
    title='Internal Memory in Gigabytes',
    color_discrete_sequence=['limegreen'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `ram` column of the training data.
hist_layout = dict(
    xaxis_title_text='RAM',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='ram',
    title='RAM',
    color_discrete_sequence=['goldenrod'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `m_dep` column of the training data.
hist_layout = dict(
    xaxis_title_text='Mobile Depth in cm',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='m_dep',
    title='Mobile Depth in cm ',
    color_discrete_sequence=['salmon'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `mobile_wt` column of the training data.
hist_layout = dict(
    xaxis_title_text='Weight of mobile phone',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='mobile_wt',
    title='Weight of mobile phone ',
    color_discrete_sequence=['gold'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `clock_speed` column of the training data.
hist_layout = dict(
    xaxis_title_text='clock_speed',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='clock_speed',
    title='Speed at which microprocessor executes instructions ',
    color_discrete_sequence=['deeppink'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `fc` column of the training data.
hist_layout = dict(
    xaxis_title_text='Front Camera mega pixels',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='fc',
    title='Front Camera mega pixels',
    color_discrete_sequence=['mediumpurple'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `pc` column of the training data.
hist_layout = dict(
    xaxis_title_text='Primary Camera mega pixels',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='pc',
    title='Primary Camera mega pixels ',
    color_discrete_sequence=['darkcyan'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `n_cores` column of the training data.
hist_layout = dict(
    xaxis_title_text='Number of cores of processor ',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='n_cores',
    title='Number of cores of processor ',
    color_discrete_sequence=['darkkhaki'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `px_height` column of the training data.
hist_layout = dict(
    xaxis_title_text='px_height',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='px_height',
    title='Pixel Resolution Height ',
    color_discrete_sequence=['cyan'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `px_width` column of the training data.
hist_layout = dict(
    xaxis_title_text='px_width',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='px_width',
    title='Pixel Resolution Width',
    color_discrete_sequence=['crimson'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `sc_h` column of the training data.
hist_layout = dict(
    xaxis_title_text='sc_h',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='sc_h',
    title='Screen Height of mobile in cm',
    color_discrete_sequence=['orangered'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `sc_w` column of the training data.
hist_layout = dict(
    xaxis_title_text='sc_w',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='sc_w',
    title='Screen Width of mobile in cm',
    color_discrete_sequence=['slateblue'],
)
fig.update_layout(**hist_layout)

In [None]:
# Histogram of the `talk_time` column of the training data.
# The figure title is a complete phrase rather than a sentence cut off mid-way.
# NOTE(review): the wording is derived from the column name `talk_time`;
# confirm against the dataset's column description.
fig = px.histogram(
    data_train,
    x='talk_time',
    title='Talk time: longest time that a single battery charge will last',
    color_discrete_sequence=['navy'],
)
fig.update_layout(
    xaxis_title_text='talk_time',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)

In [None]:
# Histogram of the target column `price_range` in the training data.
hist_layout = dict(
    xaxis_title_text='price_range',
    yaxis_title_text='Frequency',
    bargap=0.25,
    showlegend=False,
    autosize=False,
    width=750,
    height=450,
)
fig = px.histogram(
    data_train,
    x='price_range',
    title='Price_range',
    color_discrete_sequence=['green'],
)
fig.update_layout(**hist_layout)

## **Correlation map**

In [None]:
# Lower-triangle heatmap of the pairwise correlations between all columns.
corr = data_train.corr()  # computed once and reused for both mask and plot

# Boolean mask hiding the upper triangle (including the diagonal). Building it
# from an all-True array, rather than from the correlation values themselves,
# keeps a cell masked even when its correlation is exactly zero.
matrix = np.triu(np.ones_like(corr, dtype=bool))

sns.set_style("white")
f, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    ax=ax,
    vmin=-1,
    vmax=1,
    mask=matrix,
    cmap="coolwarm",
    linewidths=0.2,
    linecolor="white",
)
plt.xticks(rotation=70)
plt.yticks(rotation=0)
plt.title('Correlation Map', size=14)
plt.show()

## **Correlation between RAM and price range**

In [None]:
# Box plot of `ram` for each `price_range` class, one coloured box per class.
box_options = dict(
    x="price_range",
    y="ram",
    color="price_range",
    width=750,
    height=390,
)
fig = px.box(data_train, **box_options)
fig.show()

## **Correlation between battery power and price range**

In [None]:
# Box plot of `battery_power` for each `price_range` class, one coloured box per class.
box_options = dict(
    x="price_range",
    y="battery_power",
    color="price_range",
    width=750,
    height=450,
)
fig = px.box(data_train, **box_options)
fig.show()

In [None]:
# Separate the target column from the feature columns.
target_column = 'price_range'
Y_train = data_train[target_column]
X_train = data_train.drop(columns=[target_column])

## **Splitting the train data into a train set and a validation set**

In [None]:
# Hold out 25% of the labelled data for validation, with a fixed seed so the
# split is reproducible.
# The inputs are taken straight from data_train instead of the X_train/Y_train
# names this cell overwrites; re-running the cell therefore always yields the
# same split rather than splitting an already reduced training set again.
X_train, X_val, Y_train, Y_val = train_test_split(
    data_train.drop(['price_range'], axis=1),
    data_train['price_range'],
    test_size=0.25,
    random_state=52,
)
X_train.shape

## **Data normalization**

In [None]:
# Scale the validation features with a MinMaxScaler fitted on the TRAINING
# split only. Fitting on X_val would give validation its own min/max per
# column, so the same raw value would map to different scaled values in
# train and validation, and validation statistics would leak into preprocessing.
norm = preprocessing.MinMaxScaler()
norm.fit(X_train)
X_val_norm = pd.DataFrame(
    norm.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)
X_val_norm

In [None]:
# Min-max scale the training features, keeping the original row index and
# column labels on the resulting DataFrame.
norm = preprocessing.MinMaxScaler()
X_train_norm = pd.DataFrame(
    norm.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_train_norm

## **SVM classifier with grid search**

### ***Linear function***

In [None]:
# Grid search over the regularisation strength C for a linear-kernel SVC,
# then report accuracy on the training and validation splits.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
parameters = {'kernel': ['linear'], 'C': [1, 5, 10, 15, 20, 50, 100, 500, 1000]}
model = svm.SVC()
# A single search object, using GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train_norm, Y_train)
print("Parametres of the best model:")
print(grid.best_params_)
# The two prediction arrays are reused by the confusion-matrix cells below.
print("Train accuracy:")
grid_predictions_train = grid.predict(X_train_norm)
print((metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = grid.predict(X_val_norm)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the training-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the validation-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

### ***Polynomial function***

In [None]:
# Grid search over polynomial degree (1..19) and regularisation strength C
# for a polynomial-kernel SVC, then report train/validation accuracy.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
parameters = {
    'kernel': ['poly'],
    'degree': range(1, 20),
    'C': [1, 5, 10, 15, 20, 50, 100, 500, 1000],
}
model = svm.SVC()
# A single search object, using GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train_norm, Y_train)
print("Parametres of the best model:")
print(grid.best_params_)
# The two prediction arrays are reused by the confusion-matrix cells below.
print("Train accuracy:")
grid_predictions_train = grid.predict(X_train_norm)
print((metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = grid.predict(X_val_norm)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the training-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the validation-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

### ***Sigmoid function***

In [None]:
# Grid search over gamma and regularisation strength C for a sigmoid-kernel
# SVC, then report train/validation accuracy.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
# Candidate gamma values: 0.001, 0.011, ... in steps of 0.01, below 1.
gamma = np.arange(0.001, 1, 0.01).tolist()
parameters = {
    'kernel': ['sigmoid'],
    'gamma': gamma,
    'C': [1, 5, 10, 15, 20, 50, 100, 500, 1000],
}
model = svm.SVC()
# A single search object, using GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train_norm, Y_train)
print("Parametres of the best model:")
print(grid.best_params_)
# The two prediction arrays are reused by the confusion-matrix cells below.
print("Train accuracy:")
grid_predictions_train = grid.predict(X_train_norm)
print((metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = grid.predict(X_val_norm)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the training-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the validation-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

### ***Gaussian function***

In [None]:
# Grid search over gamma and regularisation strength C for an RBF-kernel
# SVC, then report train/validation accuracy.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
# Candidate gamma values: 0.001, 0.011, ... in steps of 0.01, below 1.
gamma = np.arange(0.001, 1, 0.01).tolist()
parameters = {
    'kernel': ['rbf'],
    'gamma': gamma,
    'C': [1, 5, 10, 15, 20, 50, 100, 500, 1000],
}
model = svm.SVC()
# A single search object, using GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train_norm, Y_train)
print("Parametres of the best model:")
print(grid.best_params_)
# The two prediction arrays are reused by the confusion-matrix cells below.
print("Train accuracy:")
grid_predictions_train = grid.predict(X_train_norm)
print((metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = grid.predict(X_val_norm)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the training-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the validation-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

## **KNN classifier**

In [None]:
# Grid search over the number of neighbours (1..99) for a KNN classifier,
# then report train/validation accuracy.
# NOTE(review): this search is fitted and evaluated on the unscaled
# X_train/X_val, unlike the SVM cells, which use the normalised frames —
# confirm this is intentional.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
neignbors = range(1, 100, 1)
parameters = {'n_neighbors': neignbors}

model = KNeighborsClassifier()
# A single search object, using GridSearchCV's default cross-validation settings.
grid = GridSearchCV(estimator=model, param_grid=parameters)
grid.fit(X_train, Y_train)
print("Parametres of the best model:")
print(grid.best_params_)
# The two prediction arrays are reused by the confusion-matrix cells below.
print("Train accuracy:")
grid_predictions_train = grid.predict(X_train)
print((metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = grid.predict(X_val)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the training-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the validation-set predictions, labelled on both axes
# with the classes known to the fitted grid search.
class_labels = grid.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

## **Bayesian classifier**

In [None]:
# Fit a Gaussian Naive Bayes classifier with equal priors for the four
# classes, then report accuracy on the training and validation splits.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
model = GaussianNB(priors=[0.25, 0.25, 0.25, 0.25])
model.fit(X_train_norm, Y_train)
# The prediction arrays keep the grid_predictions_* names because the
# confusion-matrix cells below read them; no grid search is involved here.
print("Train accuracy:")
grid_predictions_train = model.predict(X_train_norm)
print("Accuracy:",(metrics.accuracy_score(Y_train, grid_predictions_train))*100,"%")
print("Validation accuracy:")
grid_predictions_val = model.predict(X_val_norm)
print((metrics.accuracy_score(Y_val, grid_predictions_val))*100,"%")
print("Time:")
print("--- %s seconds ---" % (time.perf_counter() - start_time))

### **Confusion matrix of the train set**

In [None]:
# Confusion matrix of the Naive Bayes training-set predictions.
# Labels come from the fitted Naive Bayes `model` itself; `grid` at this point
# still refers to the earlier KNN search and is unrelated to these predictions.
class_labels = model.classes_
conf_matrix_train = pd.DataFrame(
    metrics.confusion_matrix(Y_train, grid_predictions_train),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_train

### **Confusion matrix of the validation set**

In [None]:
# Confusion matrix of the Naive Bayes validation-set predictions.
# Labels come from the fitted Naive Bayes `model` itself; `grid` at this point
# still refers to the earlier KNN search and is unrelated to these predictions.
class_labels = model.classes_
conf_matrix_val = pd.DataFrame(
    metrics.confusion_matrix(Y_val, grid_predictions_val),
    index=class_labels,
    columns=class_labels,
)
conf_matrix_val

## **Making forecast**

In [None]:
# Load the unlabelled test data and print its column/dtype summary.
TEST_CSV = "../input/mobile-price-classification/test.csv"
data_test = pd.read_csv(TEST_CSV)
data_test.info()

### **Test data normalization**

In [None]:
# Scale the test features with a MinMaxScaler fitted on the TRAINING features.
# The models are trained on train-scaled values, so test rows must go through
# the same min/max mapping; fitting a separate scaler on the test set would
# map identical raw values to different scaled values.
# NOTE(review): assumes test.csv minus `id` has the same feature columns, in
# the same order, as X_train — confirm.
data_test_SVM = data_test.drop(['id'], axis=1)
norm = preprocessing.MinMaxScaler()
norm.fit(X_train)
data_test_norm = pd.DataFrame(
    norm.transform(data_test_SVM),
    index=data_test_SVM.index,
    columns=data_test_SVM.columns,
)
data_test_norm

In [None]:
# Show the (rows, columns) shape of the scaled test features.
print("Our test dataset distribution:")
test_shape = data_test_norm.shape
test_shape

## ***SVM forecast***

In [None]:
# Fit a linear-kernel SVC on the scaled training data and predict the price
# range of every test row.
# NOTE(review): C=500 is hard-coded; presumably the best value reported by
# the linear grid search — confirm.
model = svm.SVC(kernel='linear', C=500)
model.fit(X_train_norm, Y_train)
Price_predict = model.predict(data_test_norm)

# Attach the predictions and the original ids to the unscaled test features.
data_test_SVM["price_range"] = Price_predict.tolist()
data_test_SVM['id'] = data_test['id']

# Move the last column to the front (on a fresh run this is `id`).
cols = data_test_SVM.columns.tolist()
cols.insert(0, cols.pop())
data_test_SVM = data_test_SVM.reindex(columns=cols)
data_test_SVM[['id', 'price_range']]

## **KNN forecast**

In [None]:
# Fit KNN on the unscaled training features and predict the price range of
# every test row.
# NOTE(review): n_neighbors=29 is hard-coded; presumably the best value
# reported by the KNN grid search — confirm.
data_test_KNN = data_test.drop(columns=['id'])
model = KNeighborsClassifier(n_neighbors=29)
model.fit(X_train, Y_train)
Price_predict = model.predict(data_test_KNN)

# Append the predictions and the ids, then rotate the last column (`id`) to the front.
data_test_KNN = data_test_KNN.assign(
    price_range=Price_predict.tolist(),
    id=data_test['id'],
)
cols = data_test_KNN.columns.tolist()
cols = cols[-1:] + cols[:-1]
data_test_KNN = data_test_KNN.reindex(columns=cols)
data_test_KNN[['id', 'price_range']]

## **Bayes forecast**

In [None]:
# Fit Gaussian Naive Bayes (equal priors for the four classes) on the scaled
# training data and predict the price range of every scaled test row.
data_test_bayes = data_test.drop(columns=['id'])
equal_priors = [0.25, 0.25, 0.25, 0.25]
model = GaussianNB(priors=equal_priors)
model.fit(X_train_norm, Y_train)
Price_predict = model.predict(data_test_norm)

# Append the predictions and the ids, then rotate the last column (`id`) to the front.
data_test_bayes["price_range"] = Price_predict.tolist()
data_test_bayes['id'] = data_test['id']
cols = list(data_test_bayes.columns)
cols = cols[-1:] + cols[:-1]
data_test_bayes = data_test_bayes.reindex(columns=cols)
data_test_bayes[['id', 'price_range']]

In [None]:
# Collect the three forecasts side by side: the SVM frame already holds its
# own `price_range`; add the KNN and Naive Bayes predictions as extra columns.
other_forecasts = {
    "price_range_KNN": data_test_KNN,
    "price_range_bayes": data_test_bayes,
}
for column_name, forecast_frame in other_forecasts.items():
    data_test_SVM[column_name] = forecast_frame["price_range"]
data_test_SVM.head()

In [None]:
# Count the test rows for which SVM, KNN and Naive Bayes predict the same class.
svm_forecast = data_test_SVM['price_range']
same_as_knn = svm_forecast == data_test_SVM['price_range_KNN']
same_as_bayes = svm_forecast == data_test_SVM['price_range_bayes']
len(data_test_SVM[same_as_knn & same_as_bayes])

# **Conclusion:**
## The most efficient method for classifying the mobile price range is the SVM with a linear kernel: it shows good accuracy and model training takes about 6 seconds. Overall, all 3 methods give the same prediction for 755 of the 1000 test IDs, which lets us rely on our forecasts.