# Hello everyone! 
**In this notebook we will analyze a dataset of smartphone features and prices. Let's start.**

# 1) Import data and libraries.
**Firstly, import all useful libraries. Secondly, load data.**

In [None]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read both CSV files up front: the train and the test set are explored side by side below.
train_csv = "/kaggle/input/mobile-price-classification/train.csv"
test_csv = "/kaggle/input/mobile-price-classification/test.csv"

data_train, data_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

# 2) Fast looking on data
**Let's look at the head of our data frames, their columns, sizes and summary statistics, and check for NaN/null values in these datasets.**

In [None]:
# Quick overview: size and column names of both datasets, followed by the
# dtype / non-null summary that DataFrame.info() prints.
separator = '-' * 50

# Same two-line size report for each dataset. shape[1] is the exact column
# count, so the message states it plainly.
for name, frame in (("train", data_train), ("test", data_test)):
    n_rows, n_cols = frame.shape
    print(f"The {name} dataset has {n_rows} rows.")
    print(f"And {n_cols} columns.")
    print(separator)

print(f"List of train dataset columns: {data_train.columns}")
print(separator)
print(f"List of test dataset columns: {data_test.columns}")

print(separator)
data_train.info()
print(separator)
data_test.info()

**Information about columns:**
* id: ID
* battery_power: Total energy a battery can store in one time (mAh)
* blue: Support bluetooth or not
* clock_speed: Speed at which microprocessor executes instructions
* dual_sim: Support dual sim or not
* fc: Front Camera mega pixels
* four_g: Support 4G or not
* int_memory: Internal Memory (GB)
* m_dep: Mobile Depth (cm)
* mobile_wt: Weight of mobile phone
* n_cores: Number of cores of processor
* pc: Primary Camera mega pixels
* px_height: Pixel Resolution Height
* px_width: Pixel Resolution Width
* ram: Random Access Memory (MB)
* sc_h: Screen Height of mobile (cm)
* sc_w: Screen Width of mobile (cm)
* talk_time: Time that a single battery charge will last
* three_g: Support 3G or not
* touch_screen: Has touch screen or not
* wifi: Support wifi or not

In [None]:
# Preview the first five rows of the train set.
data_train.head(n=5)

In [None]:
# Summary statistics per column; the quartiles listed are the ones describe() uses by default.
data_train.describe(percentiles=[0.25, 0.5, 0.75])

**Null and NAN values.**

In [None]:
# Fraction of missing entries per column of the train set.
# The mean of a boolean mask equals (number of True values) / (number of rows).
procent_of_null = data_train.isnull().mean()
print(procent_of_null)
print("-" * 20)
# Same measurement through isna(), printed below the separator.
procent_of_nan = data_train.isna().mean()
print(procent_of_nan)

In [None]:
# Fraction of missing entries per column of the test set.
# The mean of a boolean mask equals (number of True values) / (number of rows).
procent_of_null = data_test.isnull().mean()
print(procent_of_null)
print("-" * 20)
# Same measurement through isna(), printed below the separator.
procent_of_nan = data_test.isna().mean()
print(procent_of_nan)

**As we can see, the train dataset has 2000 rows and 21 columns, while the test dataset has 1000 rows and 21 columns. The column names tell us what each feature means. Fortunately, there are no NaN or null values.**

# 3) Cleaning
**Before modelling, we should drop unnecessary columns to reduce the risk of overfitting. To decide which columns to drop, we first build a correlation matrix (you can check my interesting notebook about correlation here: https://www.kaggle.com/artemborzenko/calculating-the-correlation-of-a-youtube-dataset).**

**Let's create it.**  

In [None]:
# Pairwise correlation between all columns of the train set.
correlation_rate = data_train.corr()

# Wide figure so that the annotated cells stay readable.
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(correlation_rate, annot=True, cmap="Greens", ax=ax)

**Here we can see a strong positive correlation between price range and RAM, and weaker positive correlations between price range and battery power and 4G/3G support. The other features are only weakly correlated with price range, so we will try to work with every column.**

# 4) Exploring
**We will explore the RAM, 3G/4G and Battery Power columns of the TRAIN dataset, because these columns are important for our future modelling.**

# *4.1) RAM*

In [None]:
# Headline statistics of the RAM column (the mean is rounded to a whole number).
ram_values = data_train['ram']
ram_max, ram_min = ram_values.max(), ram_values.min()
ram_mean = round(ram_values.mean())

print(f"Max RAM value is: {ram_max} MB")
print(f"Min RAM value is: {ram_min} MB")
print(f"Mean of RAM values is: {ram_mean} MB")

In [None]:
# Distribution of RAM over the whole train set as a horizontal boxplot.
fig, ax = plt.subplots(figsize=(8, 6))

# Pass the column as x= explicitly. Given positionally, newer seaborn releases
# treat it as `data`, which would draw the box vertically and leave the x-axis
# label below on the wrong axis.
sns.boxplot(x=data_train['ram'], linewidth=2.5, ax=ax)
ax.set_xlabel("RAM value")

In [None]:
# RAM values split by price class, one column per class.
# NOTE(review): the Kaggle description of this dataset codes price_range as
# 0 (low), 1 (medium), 2 (high), 3 (very high) - confirm against the data.
# Selecting only 1, 2 and 3 would drop the cheapest phones and shift every
# label by one class.
price_labels = {
    0: "Low Price",
    1: "Medium Price",
    2: "High Price",
    3: "Very High Price",
}

# The selected Series keep their original row index, so the frame is aligned
# on it and holds NaN where a row belongs to another class.
price_ram_data = pd.DataFrame({
    label: data_train.loc[data_train["price_range"] == code, "ram"]
    for code, label in price_labels.items()
})

In [None]:
# One box per column of price_ram_data, i.e. the RAM distribution in each price class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=price_ram_data, linewidth=2.5, ax=ax)
ax.set_ylabel("RAM")

**In these boxplots we can see:**

**1)	In the first graph we can see the minimum, maximum, median and quartiles of the whole RAM column.**

**2)	In the second graph we can see how the amount of RAM compares across the price ranges.**

# *4.2) Battery power*

In [None]:
# Headline statistics of the battery_power column (the mean is rounded to a whole number).
battery_values = data_train['battery_power']
battery_max, battery_min = battery_values.max(), battery_values.min()
battery_mean = round(battery_values.mean())

print(f"Max Battery Power value is: {battery_max} mAh")
print(f"Min Battery Power value is: {battery_min} mAh")
print(f"Mean of Battery Power values is: {battery_mean} mAh")

In [None]:
# Distribution of battery power over the whole train set as a horizontal boxplot.
fig, ax = plt.subplots(figsize=(8, 6))

# Pass the column as x= explicitly. Given positionally, newer seaborn releases
# treat it as `data`, which would draw the box vertically and leave the x-axis
# label below on the wrong axis.
sns.boxplot(x=data_train['battery_power'], linewidth=2.5, ax=ax)
ax.set_xlabel("Battery Power value")

In [None]:
# Battery power values split by price class, one column per class.
# NOTE(review): the Kaggle description of this dataset codes price_range as
# 0 (low), 1 (medium), 2 (high), 3 (very high) - confirm against the data.
# Selecting only 1, 2 and 3 would drop the cheapest phones and shift every
# label by one class.
price_labels = {
    0: "Low Price",
    1: "Medium Price",
    2: "High Price",
    3: "Very High Price",
}

# The selected Series keep their original row index, so the frame is aligned
# on it and holds NaN where a row belongs to another class.
price_bp_data = pd.DataFrame({
    label: data_train.loc[data_train["price_range"] == code, "battery_power"]
    for code, label in price_labels.items()
})

In [None]:
# Battery power distribution in each price class.
fig, ax = plt.subplots(figsize=(8, 6))

# Plot the battery-power frame built in the previous cell. Using
# price_ram_data here would silently repeat the RAM chart from section 4.1.
sns.boxplot(data=price_bp_data, linewidth=2.5, ax=ax)
ax.set_ylabel("Battery Power")

**In these boxplots we can see that:**

**1)	In the first graph we can see the minimum, maximum, median and quartiles of the whole Battery Power column.**

**2)	In the second graph we can see how Battery Power compares across the price ranges. Interestingly, the Battery Power values are fairly similar in every price range.**

# *4.3) 3G/4G*

In [None]:
# Number of phones with and without 3G, in the same order as `labels`.
# value_counts() sorts by frequency, so without reindexing the first slice is
# simply the more common value, whatever it is. Pinning the order to [1, 0]
# keeps the labels correct regardless of which group is larger.
# Assumes the flag is coded 1 = supported, 0 = not supported - TODO confirm.
three_g = (
    data_train["three_g"]
    .value_counts()
    .reindex([1, 0], fill_value=0)
    .values
)

labels = ["3G Supported", "Not Supported"]

In [None]:
# Pie chart of the 3G counts prepared in the previous cell.
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(three_g, labels=labels, shadow=True, startangle=90)
ax.set_title("Is 3G supported?")

In [None]:
# Number of phones with and without 4G, in the same order as `labels`.
# value_counts() sorts by frequency, so without reindexing the first slice is
# simply the more common value, whatever it is. Pinning the order to [1, 0]
# keeps the labels correct regardless of which group is larger.
# Assumes the flag is coded 1 = supported, 0 = not supported - TODO confirm.
four_g = (
    data_train["four_g"]
    .value_counts()
    .reindex([1, 0], fill_value=0)
    .values
)

labels = ["4G Supported", "Not Supported"]

In [None]:
# Pie chart of the 4G counts prepared in the previous cell.
fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(four_g, labels=labels, shadow=True, startangle=90)
ax.set_title("Is 4G supported?")

**In these pie plots we can see that:**

**1)	More than 75% of smartphones support 3G, but only about 50% of smartphones support 4G.**

# 5) Modelling

**Before modelling we have to prepare data. Let’s do this:**

In [None]:
# Features are every column except the target; the target is the price class.
X = data_train.drop(["price_range"], axis = 1)
Y = data_train["price_range"]

# Hold out 25 % of the rows for evaluation. test_size must be the float 0.25:
# an integer is read as an absolute number of samples, so 25 would leave only
# 25 rows in the test split. A fixed random_state makes the split, and every
# score computed on it, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.25, random_state = 42
)

**Well, we can start modelling. Our task is to classify price range, that’s why we have to create classification models.**

# *Logistic Regression*

In [None]:
# Logistic regression on standardised features.
# The raw columns are on very different scales (RAM in MB next to 0/1 flags),
# and the default solver stops at its iteration limit long before converging
# on such data. Scaling inside a pipeline fixes that and ensures the scaler
# is fitted on the training split only.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter = 1000))

log_reg.fit(X_train, Y_train)
print(f"Score is {log_reg.score(X_test, Y_test)}")

**Very bad result, continue.**

# *Decision Tree*

In [None]:
# Decision tree limited to depth 9.
# random_state is fixed so that the fitted tree and its score are the same
# on every Restart & Run All.
tree = DecisionTreeClassifier(max_depth = 9, random_state = 42)

tree.fit(X_train, Y_train)
print(f"Score is {tree.score(X_test, Y_test)}")

**Not bad, but we can try with other models.**

# *Random Forest Classifier*

In [None]:
# Random forest with 300 trees.
# random_state is fixed so that the bootstrap samples, and therefore the
# score, are the same on every Restart & Run All.
forest = RandomForestClassifier(n_estimators = 300, random_state = 42)

forest.fit(X_train, Y_train)
print(f"Score is {forest.score(X_test, Y_test)}")

**Good result, but what about KNN?**

# *KNN*

In [None]:
# K-nearest-neighbours classifier with 15 neighbours; fit() returns the estimator itself.
knn = KNeighborsClassifier(n_neighbors=15).fit(X_train, Y_train)

# Accuracy on the hold-out split.
knn_score = knn.score(X_test, Y_test)
print(f"Score is {knn_score}")

**Great result. Let's make a classification report.**

In [None]:
# Classification report for the KNN predictions on the hold-out split.
prediction = knn.predict(X_test)
report = classification_report(Y_test, prediction)

print(report)

# 6) Conclusion.

In [None]:
# Predict the price class of every phone in the test set with the fitted KNN model.
# The "id" column is dropped because it is an identifier, not a feature
# (presumably the train set has no such column - verify against X.columns).
test_features = data_test.drop(columns = ["id"])
prediction_price = knn.predict(test_features)

# Build a NEW frame with the extra column. `prepared_data = data_test` would
# only alias the original frame, so adding the column would also mutate
# data_test, and re-running this cell would then feed one column too many
# to knn.predict.
prepared_data = data_test.assign(**{"Predicted Price": prediction_price})

prepared_data.head(5)

**Finally, we can see the new dataset with the price ranges predicted by KNN.**

**Thank you to everyone who checked out this notebook. If you like my notebook, please upvote it; if you don't, please write a comment. All of your feedback will help me improve my skills.**