In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training and test splits of the mobile price dataset.
train_csv = '/kaggle/input/mobile-price-classification/train.csv'
test_csv = '/kaggle/input/mobile-price-classification/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# head() shows only the first 5 rows of each dataframe; we can see that both have 21 columns.

In [None]:
# Summary statistics for every column of the training data.
train.describe()

In [None]:
# Count the missing values in each column of the training data

train.isna().sum()

In [None]:
# Count the missing values in each column of the test data

test.isna().sum()

In [None]:
# As we can see, neither dataframe contains any null values, so we can continue.

In [None]:
# Print the DataFrame.info() summary of the training data

train.info()

In [None]:
# Let's reshape our train data a little

In [None]:
# I think we can create one more column, 'resolution' (px_height * px_width), and remove px_height and px_width

In [None]:
# Replace the two pixel-dimension columns with a single 'resolution' feature
# (px_height * px_width), appended as the last column.
# The guard makes the cell safe to re-run: once px_height / px_width have been
# dropped there is nothing left to do, instead of failing on the missing columns.
pixel_cols = ['px_height', 'px_width']
if set(pixel_cols).issubset(train.columns):
    train = (
        train
        .assign(resolution=train['px_height'] * train['px_width'])
        .drop(columns=pixel_cols)
    )

In [None]:
# Transposed preview so every column is visible as a row.
train.head().T

In [None]:
# Now we can build the correlation matrix and look at the relationships in more detail

In [None]:
# Pairwise correlations of all training columns, drawn as an annotated heatmap.
corr_matrix = train.corr()
fig, ax = plt.subplots(figsize=(15,10))
heatmap_style = dict(annot=True, fmt='.2f', linewidth=0.5, cmap='YlGnBu')
ax = sns.heatmap(corr_matrix, ax=ax, **heatmap_style)

In [None]:
# List the distinct values of the target column 'price_range'

train['price_range'].unique()

In [None]:
# Histogram of the 'ram' column to see how it is distributed.

ram_fig, ram_ax = plt.subplots()
ram_ax.hist(train['ram'])
plt.show()

In [None]:
# Median of the 'ram' column.
train['ram'].median()

In [None]:
# Histogram of 'battery_power', followed by its median as the cell output.
battery = train['battery_power']
battery_fig, battery_ax = plt.subplots()
battery_ax.hist(battery)
plt.show()
battery.median()

In [None]:
# Bar chart of how many phones fall into each price_range class.

price_counts = train['price_range'].value_counts()
price_counts.plot.bar(color=['red','blue', 'green', 'yellow'])

In [None]:
# Bar chart of how many phones have each dual_sim value.

dual_sim_counts = train['dual_sim'].value_counts()
dual_sim_counts.plot.bar(color=['green', 'yellow'])

In [None]:
# Count phones with and without 3G / 4G support for the pie charts below.
# value_counts() sorts by frequency, so the order of its values depends on the
# data. The pie charts pair these arrays with fixed labels
# ('...-supported' first, 'Not supported' second), so pin the order explicitly.
# Assumes the flags are coded 1 = supported, 0 = not supported — TODO confirm.
values3G = train['three_g'].value_counts().reindex([1, 0], fill_value=0).values

values4G = train['four_g'].value_counts().reindex([1, 0], fill_value=0).values

In [None]:
# Pie chart of the 3G counts computed earlier.
fig1, ax1 = plt.subplots()
colors = ['yellow', 'grey']
pie_opts_3g = dict(autopct='%1.1f%%', startangle=90, shadow=True, colors=colors)
ax1.pie(values3G, labels=['3G-supported', 'Not supported'], **pie_opts_3g)
plt.show()

In [None]:
# As we can see, most phones support 3G
# Let's check 4G phones in the same way

In [None]:
# Pie chart of the 4G counts computed earlier.
fig2, ax2 = plt.subplots()
colors = ['yellow', 'grey']
pie_opts_4g = dict(autopct='%1.1f%%', startangle=90, shadow=True, colors=colors)
ax2.pie(values4G, labels=['4G-supported','Not supported'], **pie_opts_4g)
plt.show()

In [None]:
# Here the split is almost 50-50.
# Now let's analyze the cameras. We'll check the front camera and the primary camera.

In [None]:
# Overlay the front-camera ('fc') and primary-camera ('pc') distributions.
# Both histograms share one axes, so they are drawn semi-transparent;
# otherwise the second one hides the first wherever they overlap.
plt.figure(figsize=(15,8))
train['fc'].hist(color='blue', alpha=0.6, label='Front camera')
train['pc'].hist(color='green', alpha=0.6, label='Primary camera')
plt.legend()
plt.xlabel('MegaPixels')
plt.ylabel('Number of phones');

In [None]:
# Predictive modelling: first import everything the modelling cells rely on

# Models from Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Model Evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score

# plot_roc_curve was removed in scikit-learn 1.2. Fall back to its replacement,
# RocCurveDisplay.from_estimator, under the same name so this cell imports on
# both old and new versions.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    from sklearn.metrics import RocCurveDisplay
    plot_roc_curve = RocCurveDisplay.from_estimator

In [None]:
# Separate the label column from the model inputs.
target = train['price_range']
features = train.drop(columns='price_range')

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state makes the split (and every score below) reproducible across
# runs; stratify keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Logistic Regression
#
# The features are standardised before fitting: on unscaled inputs (e.g.
# 'resolution', a product of two pixel counts) the default solver typically
# stops at its iteration limit with a ConvergenceWarning. max_iter is raised
# as well to leave headroom. The pipeline exposes the same fit/score interface.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

LogisticRegressionModel = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000),
)
LogisticRegressionModel.fit(X_train, y_train)
LogisticRegressionModel.score(X_test, y_test)

In [None]:
# Linear Regression
# NOTE(review): this is a regressor fitted on class labels, so .score() here
# is the R^2 coefficient, not a classification accuracy.

LinearRegressionModel = LinearRegression().fit(X_train, y_train)
LinearRegressionModel.score(X_test, y_test)

In [None]:
# K-nearest neighbours with default settings: fit, then score on the hold-out split.

KNN = KNeighborsClassifier().fit(X_train, y_train)
KNN.score(X_test, y_test)

In [None]:
# Random Forest Classifier
# random_state fixes the bootstrap / feature sampling so the score is
# reproducible from run to run.

RandomForestClass = RandomForestClassifier(random_state=42)
RandomForestClass.fit(X_train, y_train)
RandomForestClass.score(X_test, y_test)

In [None]:
# Decision Tree
# random_state fixes the tie-breaking between equally good splits so the
# score is reproducible from run to run.

from sklearn.tree import DecisionTreeClassifier

DecisionTree = DecisionTreeClassifier(random_state=42)
DecisionTree.fit(X_train, y_train)
DecisionTree.score(X_test, y_test)

In [None]:
# Hold-out accuracy of every baseline model, keyed by display name.
#
# LinearRegression is a regressor, so its .score() is R^2 rather than accuracy
# and cannot be compared with the classifiers. Its continuous predictions are
# rounded to the nearest integer and clipped to the label range seen in
# training, then scored as accuracy like the other entries.
# Assumes price_range is integer-coded — TODO confirm.
linreg_labels = np.clip(
    np.rint(LinearRegressionModel.predict(X_test)),
    y_train.min(),
    y_train.max(),
)

models = {
    'Logistic Regression' : LogisticRegressionModel.score(X_test, y_test),
    'Linear Regression' : float((linreg_labels == y_test.to_numpy()).mean()),
    'KNN' : KNN.score(X_test, y_test),
    'Random Forest Classifier' : RandomForestClass.score(X_test, y_test),
    'Decision Tree' : DecisionTree.score(X_test, y_test)
}

In [None]:
# Show the hold-out score collected for each baseline model.
models

In [None]:
# Model comparison

In [None]:
# One-row frame of the baseline scores, transposed so each model gets its own bar.
model_compare = pd.DataFrame(models, index=['accuracy'])
model_compare.transpose().plot(kind='bar');

In [None]:
# Hyperparameter tuning
# Let's start with KNN

In [None]:
# Sweep n_neighbors over 1..20, refitting the same estimator each time and
# recording its score on the training and hold-out splits.
# NOTE(review): choosing n_neighbors from these hold-out scores tunes on the
# test split; cross-validating on X_train would avoid that.
neighbors = range(1,21)
train_scores, test_scores = [], []

knn = KNeighborsClassifier()

for k in neighbors:
    knn.set_params(n_neighbors=k).fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# Training-split score for each n_neighbors value tried (1 to 20).
train_scores

In [None]:
# Hold-out score for each n_neighbors value tried (1 to 20).
test_scores

In [None]:
# Plot training vs. hold-out score for every n_neighbors value, then report
# the best hold-out score as a percentage.
knn_fig, knn_ax = plt.subplots()
knn_ax.plot(neighbors, train_scores, label='train score')
knn_ax.plot(neighbors, test_scores, label='test score')
knn_ax.set_xticks(np.arange(1,21,1))
knn_ax.set_xlabel('Number of neighbors')
knn_ax.set_ylabel('Model score')
knn_ax.legend()

best_test_score = max(test_scores)
print(f'Maximum  KNN score on the data: {best_test_score * 100:.2f}%')

In [None]:
# Hyperparameter tuning with RandomizedSearchCV
# We're going to tune the following models using RandomizedSearchCV:
# LogisticRegression()
# RandomForestClassifier()

In [None]:
# Search space for LogisticRegression: 20 log-spaced values of C with the
# liblinear solver.
log_reg_grid = {'C': np.logspace(-4,4,20),
                'solver': ['liblinear']}

# Search space for RandomForestClassifier.
# min_samples_split starts at 2: an integer value of 1 is rejected by
# scikit-learn (a split needs at least two samples), so any candidate drawn
# with it would fail to fit and waste a search iteration.
rf_grid = {'n_estimators' : np.arange(10,1000,20),
           'max_depth' : [None, 3, 5, 10],
           'min_samples_split' : np.arange(2, 20, 2),
           'min_samples_leaf' : np.arange(1,20,2)}

In [None]:
# Tune LogisticRegression() with a randomized search (5-fold CV, 20 candidates).

# The search is created without random_state, so it draws from NumPy's global
# RNG; seeding that here keeps the sampled candidates repeatable.
np.random.seed(42)

log_reg_search_opts = dict(param_distributions=log_reg_grid, n_iter=20, cv=5, verbose=True)
rs_log_reg = RandomizedSearchCV(LogisticRegression(), **log_reg_search_opts)

rs_log_reg.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the logistic-regression search.
rs_log_reg.best_params_

In [None]:
# Score the tuned logistic-regression search on the hold-out split.
rs_log_reg.score(X_test, y_test)

In [None]:
# Let's do the same for RandomForestClassifier

In [None]:
# Tune RandomForestClassifier() with a randomized search (5-fold CV, 20 candidates).
# The search is created without random_state, so it draws from NumPy's global
# RNG; seeding that here keeps the sampled candidates repeatable.
np.random.seed(42)

rf_search_opts = dict(param_distributions=rf_grid, n_iter=20, cv=5, verbose=True)
rs_rf = RandomizedSearchCV(RandomForestClassifier(), **rf_search_opts)

rs_rf.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the random-forest search.
rs_rf.best_params_

In [None]:
# Score the tuned random-forest search on the hold-out split.
rs_rf.score(X_test, y_test)

In [None]:
# Baseline scores again, for comparison with the tuned results.
# This dict is not updated by the tuning cells.
models

In [None]:
# Predict hold-out labels with the tuned random-forest search.
y_preds = rs_rf.predict(X_test)

In [None]:
# Preview only the first predictions instead of dumping the whole array;
# the classification report at the end of the notebook summarises all of them.
y_preds[:10]

In [None]:
# True hold-out labels, for comparison with the predictions.
y_test

In [None]:
# Print the classification report comparing true hold-out labels with the predictions.
print(classification_report(y_test, y_preds))