In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

'''    
**Model will predict a price range, indicating how high the price is, using K-Nearest Neighbors algorithm.**

    
**Column Description**

battery_power         Total energy a battery can store in one time measured  in mAh
clock_speed           The speed at which microprocessor executes instructions
fc                    Front Camera megapixels
int_memory            Internal Memory in Gigabytes
m_dep                 Mobile Depth in cm
mobile_wt             Weight of the mobile phone
n_cores               Number of cores of a processor
pc                    Primary Camera megapixels
px_height             Pixel Resolution Height
px_width              Pixel Resolution Width
ram                   Random Access Memory in MegaBytes
sc_h                  Screen Height of mobile in cm
sc_w                  Screen Width of mobile in cm
talk_time             The longest time that a single battery charge will last 
price_range           This is the target variable with the value of 
                   
                       0(lowcost), 
                       1(medium cost),
                       2(high cost) and
                       3(very high cost).
                    
'''

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
%matplotlib inline

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
df=pd.read_csv('../input/mobile-price-classification/train.csv')
df.head()


In [None]:
# Pairwise relationships between the columns of df, coloured by price_range.
sns.pairplot(df,hue='price_range')

In [None]:
#Battery power vs Price Range

In [None]:
# Box plot of battery_power for each price_range class.
sns.boxplot(x="price_range", y="battery_power", data=df)

In [None]:
# Number of phones vs. camera megapixels (front and primary camera).
# Both histograms are drawn on one shared set of bin edges: without it each
# call derives its own 10 bins from its own column, so the overlaid bars would
# cover different megapixel intervals and could not be compared directly.
camera_bins = np.histogram_bin_edges(pd.concat([df['fc'], df['pc']]).dropna(), bins=10)

plt.figure(figsize=(10,6))
df['fc'].hist(bins=camera_bins, alpha=0.5, color='blue', label='Front camera')
df['pc'].hist(bins=camera_bins, alpha=0.5, color='red', label='Primary camera')
plt.legend()
plt.xlabel('MegaPixels')
plt.ylabel('Number of phones')
plt.title('Front vs. primary camera megapixels');

In [None]:
# Copy the data into a second DataFrame (d1) and display it.
d1=df.copy()

d1

In [None]:
# Correlation of every column with the target, taken as the price_range
# column of the full correlation matrix.
correlation_matrix = d1.corr()
correlation_matrix.loc[:, 'price_range']

In [None]:
# Collect every ordered pair of distinct columns whose correlation is at least
# 0.5, as [row_name, column_name, correlation]. Each pair therefore appears
# twice (once per direction), and negative correlations are not collected.
corr_features = [
    [row_name, column_name, value]
    for row_name, row in df.corr().iterrows()
    for column_name, value in zip(row.index, row.values)
    if row_name != column_name and value >= 0.5
]
corr_features

In [None]:
# Gather the column names of every pair correlated at 0.8 or above; the list
# built so far is printed each time a pair is added.
feat = []
for first, second, strength in corr_features:
    if strength < 0.8:
        continue
    feat.extend([first, second])
    print(feat)

# 'price_range' is the target variable, so it is not dropped.

In [None]:
# Annotated heat map of the pairwise correlations between the columns of d1.
fig = plt.figure(figsize=(10, 5))
correlations = d1.corr()
sns.heatmap(correlations, annot=True, fmt='0.2f', cmap="rocket")

In [None]:
# One box plot per column of d1, used to spot outliers. The column name is
# printed alongside each figure.
for column_name, column_values in d1.items():
    sns.boxplot(x=column_values)
    print(column_name)
    plt.show()

In [None]:
def boxoutlier(var, factor=1.5):
    """Cap outliers in every numeric column of ``var`` at the IQR fences, in place.

    For each numeric column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. Values outside the fences are replaced by the
    nearest fence; values inside them are left unchanged. Non-numeric columns
    are skipped.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame to treat. It is modified in place.
    factor : float, default 1.5
        Multiple of the inter-quartile range that places the fences.

    Returns
    -------
    None
    """
    for x in var.select_dtypes(include="number").columns:
        Q1 = var[x].quantile(0.25)
        Q3 = var[x].quantile(0.75)
        IQR = Q3 - Q1
        Lower = Q1 - (factor * IQR)
        Upper = Q3 + (factor * IQR)

        # Clip to the fences themselves. A fixed +/-1 offset from the fence
        # depends on the column's units: on a narrow column it can move a
        # capped value in among the inliers or even past the opposite fence.
        # The column is assigned as a whole so that a fractional fence value
        # can replace an integer column without an in-place dtype cast.
        var[x] = var[x].clip(lower=Lower, upper=Upper)

# Outlier treatment: cap the outliers of d1's columns (d1 is modified in place).
boxoutlier(d1)

In [None]:
# Cross-check after the outlier treatment: redraw one box plot per column of
# d1, printing the column name alongside each figure.
for column_name, column_values in d1.items():
    sns.boxplot(x=column_values)
    print(column_name)
    plt.show()

In [None]:
from sklearn.model_selection import train_test_split
## import train_test_split() in order to do splitting data

In [None]:
# Feature matrix: every column of d1 except the target. The target is removed
# by name (the same way y is selected) instead of by position, so the features
# do not silently include price_range if it is not the last column.
X = d1.drop(columns=['price_range'])
X.tail()

In [None]:
# Target vector: the price_range column of d1.
y=d1['price_range']
y.tail()
# tail() shows the last 5 rows of the data

Splitting of Data

In [None]:
# Hold out 25% of the rows as the test set (test_size=0.25).
# random_state fixes the shuffle so the same split is produced on every run;
# it can be any integer.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25  ,random_state=4)

In [None]:
# Show the first rows of the training features and target, then, after the
# 'Testing' label, the first rows of the test features and target.
display(X_train.head(),y_train.head(),'Testing',X_test.head(),y_test.head())

In [None]:
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
# Create the K-Nearest-Neighbours classifier with 17 neighbours and the
# Manhattan distance metric.
# Commonly used values for metric: {'manhattan', 'euclidean', 'hamming'}
clf=KNeighborsClassifier(n_neighbors=17,metric='manhattan')

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train,y_train)

In [None]:
# Method to find a value of K: fit one classifier per K in 1..19 and record
# its misclassification rate on the test split.
# NOTE(review): K is tuned here on the same test split that is later used to
# report accuracy, and the sweep passes no metric argument while `clf` uses
# metric='manhattan' -- consider tuning with cross-validation on X_train and
# with the same metric as the final model.
k_values = range(1, 20)
error_rate = []
for i in k_values:
    knn1 = KNeighborsClassifier(n_neighbors=i)
    knn1.fit(X_train, y_train)
    pred_i = knn1.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=5)  # one point per K
plt.title('Error Rate vs. K Value')
plt.xlabel('K')  # label the x-axis
plt.ylabel('Error Rate')  # label the y-axis

# Refit with the K that gave the lowest error rate. Reusing the loop variable
# would score the last model of the sweep (K=19), not the best one.
best_k = k_values[int(np.argmin(error_rate))]
knn1 = KNeighborsClassifier(n_neighbors=best_k)
knn1.fit(X_train, y_train)
y_pred = knn1.predict(X_test)
print('best k:', best_k)
print(accuracy_score(y_test, y_pred)*100)  # accuracy in percent
y_pred

# k=17 returns least error rate

In [None]:
# Predict the price range for the test split.
pred_test=clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the model on the test split.
accuracy_score(y_test,pred_test)

In [None]:
# Predict the price range for the training split.
pred_train=clf.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the model on the training split.
accuracy_score(y_train,pred_train)

**Hence the training accuracy is 93% and the testing accuracy is 95%.**
**The model is neither overfit nor underfit.**

# Evaluation Metrics

In [None]:
from sklearn.metrics import classification_report,confusion_matrix
# Print the classification report for the test-split predictions.
print(classification_report(y_test,pred_test))

In [None]:
# Confusion matrix of the true test labels against the test-split predictions.
matrix=confusion_matrix(y_test,pred_test)
print(matrix)