In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# LOADING OF LIBRARIES

In [None]:
import warnings  
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
# Read the training CSV of the mobile price dataset and preview its first rows.
data = pd.read_csv(
    '../input/mobile-price-classification/train.csv'
)
data.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

# CORRELATION MATRIX

In [None]:
# Pairwise correlation between all columns of the dataset.
corrmat = data.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
# Plot the matrix computed above rather than recomputing it, and use a
# diverging colormap pinned to [-1, 1] so colour encodes both the sign and
# the strength of the correlation (a qualitative palette such as "Accent"
# has no meaningful ordering for continuous values).
fig = sns.heatmap(corrmat, annot=True, cmap="coolwarm", vmin=-1, vmax=1, center=0)

# UNIVARIATE ANALYSIS

In [None]:
# Columns treated as numeric measurements in the univariate plots.
continuous_features = [
    'battery_power', 'fc', 'clock_speed', 'int_memory', 'm_dep',
    'mobile_wt', 'n_cores', 'pc', 'px_height', 'ram',
    'px_width', 'sc_h', 'sc_w', 'talk_time',
]
# Columns treated as categories (drawn with count plots).
categorical_features = [
    'blue', 'dual_sim', 'four_g',
    'three_g', 'touch_screen', 'wifi',
]

# CONTINUOUS VARIABLES

In [None]:
# One filled kernel-density plot per continuous feature.
for feature in continuous_features:
    fig, ax = plt.subplots(figsize=(6, 5))
    # `palette` and `common_norm` only matter when a `hue` variable is given,
    # so they are dropped; the fill colour comes from `color`.
    sns.kdeplot(data=data, x=feature, fill=True, alpha=.5, color='red',
                linewidth=0, ax=ax)
    ax.set_xlabel(feature)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

# ANALYSIS OF CATEGORICAL VARIABLES

In [None]:
# One count plot per categorical feature.
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(6, 5))
    # Name the column with the `x` keyword: a bare positional argument is
    # deprecated in seaborn 0.11 and is read as `data` (not `x`) from 0.12 on.
    sns.countplot(data=data, x=feature, ax=ax)
    ax.set_xlabel(feature)
    plt.show()

# CHECKING NULL VALUES

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Storage type of each column.
data.dtypes

**BASIC STATISTICAL DESCRIPTION OF DATA**

In [None]:
# Summary statistics for the columns of the frame.
data.describe()

In [None]:
# Class balance of the target: number of rows in each price range.
# The column is named with the `x` keyword because newer seaborn releases
# interpret the first positional argument as `data`, not `x`.
sns.countplot(data=data, x='price_range')

**The plot above shows that the classes are balanced: each price range has roughly the same number of samples.**

In [None]:
# Names of all columns in the frame.
data.columns

# RELATIONSHIP BETWEEN VARIABLES

In [None]:
# Box plots of battery power, grouped by the `blue` flag and split by price range.
sns.catplot(
    data=data,
    kind='box',
    x='blue',
    y='battery_power',
    hue='price_range',
)

**Mobile phones that have Bluetooth and high battery power tend to fall in the higher price ranges**

In [None]:
# Box plots of internal memory, grouped by the `wifi` flag and split by price range.
sns.catplot(
    data=data,
    kind='box',
    x='wifi',
    y='int_memory',
    hue='price_range',
)

In [None]:
# Box plots of talk time, grouped by the `dual_sim` flag and split by price range.
sns.catplot(
    data=data,
    kind='box',
    x='dual_sim',
    y='talk_time',
    hue='price_range',
)

**Dual-SIM phones with longer talk time tend to fall in the higher price ranges**

In [None]:
# Box plots of internal memory, grouped by the `four_g` flag and split by price range.
sns.catplot(
    data=data,
    kind='box',
    x='four_g',
    y='int_memory',
    hue='price_range',
)

In [None]:
# Box plots of internal memory, grouped by the `three_g` flag and split by price range.
sns.catplot(
    data=data,
    kind='box',
    x='three_g',
    y='int_memory',
    hue='price_range',
)

In [None]:
# Model inputs: every continuous feature followed by every categorical flag.
# Reusing the two lists defined for the univariate analysis keeps a single
# source of truth; the resulting order matches the previously hard-coded list.
features = continuous_features + categorical_features
label = ['price_range']
X = data[features]
# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn estimators expect y of shape (n_samples,) and otherwise emit a
# DataConversionWarning about a column-vector y.
y = data[label[0]]

**Splitting the dataset into train and test**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation.
# - random_state fixes the shuffle, so a Restart & Run All gives the same split
#   (and therefore comparable scores) every time.
# - stratify=y keeps the price_range proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, shuffle=True, random_state=42, stratify=y
)

**Scaling is required because most of the variables are continuous and lie in very different ranges**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling on the training split only, then apply that same
# transformation to both splits so no test-set statistics are used for fitting.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# From here on, X_train / X_test refer to the scaled arrays.
X_train, X_test = X_train_scaled, X_test_scaled

# BUILDING MODELS

**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# L2-regularised logistic regression with an intercept, fitted on the
# training split.
log_reg = LogisticRegression(fit_intercept=True, penalty='l2')
log_reg.fit(X_train, y_train)

In [None]:
# Logistic-regression predictions for the held-out split.
pred1 = log_reg.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for logistic regression.
# The result is stored under its own name: assigning it back to
# `classification_report` would replace the imported function with a string,
# so re-running this cell (or calling the function later without
# re-importing it) would fail with "'str' object is not callable".
log_reg_report = classification_report(y_test, pred1)
print(log_reg_report)

In [None]:
# Confusion matrix for the logistic-regression predictions (pred1).
print(confusion_matrix(y_test, pred1))

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The forest is built from random
# bootstrap samples and random feature subsets, so random_state is fixed to
# make the reported scores reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out split.
pred2 = rfc.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Per-class precision / recall / F1 for the random forest.
report = classification_report(y_true=y_test, y_pred=pred2)
print(report)

In [None]:
# Confusion matrix for the random forest. This section evaluates `rfc`, so it
# must use pred2; pred1 holds the logistic-regression predictions and would
# just repeat the earlier matrix.
print(confusion_matrix(y_test, pred2))

**XGBoost Classifier**

In [None]:
import xgboost as xgb

# Baseline XGBoost classifier with default hyperparameters, then its
# predictions for the held-out split.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
pred3 = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Per-class precision / recall / F1 for the baseline XGBoost model.
report = classification_report(y_true=y_test, y_pred=pred3)
print(report)

In [None]:
# Confusion matrix for the XGBoost model. This section evaluates `model`, so
# it must use pred3; pred1 holds the logistic-regression predictions.
print(confusion_matrix(y_test, pred3))

# Hyperparameter tuning of XGB using Hyperopt

In [None]:
# Hyperopt search space for the XGBoost classifier.
# hp.quniform draws values on a step-1 grid (returned as floats);
# hp.uniform draws continuous values. The last two entries are fixed
# constants, not searched.
space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
    'n_estimators': 180,
    'seed': 0,
}

**Defining Objective function**

In [None]:
def objective(space):
    """Train one XGBoost classifier for a sampled configuration and score it.

    Parameters
    ----------
    space : dict
        A single configuration sampled by hyperopt. The keys read are
        ``n_estimators``, ``max_depth``, ``gamma``, ``reg_alpha``,
        ``min_child_weight`` and ``colsample_bytree``; ``reg_lambda`` and
        ``seed`` are used when present.

    Returns
    -------
    dict
        ``{'loss': -accuracy, 'status': STATUS_OK}``. hyperopt's ``fmin``
        *minimises* ``loss``, so the accuracy is negated to make the search
        maximise it.

    Notes
    -----
    NOTE(review): X_test / y_test are used both for early stopping and for the
    score being optimised, so the tuned accuracy is optimistic. A separate
    validation split (or cross-validation) would give an unbiased estimate.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        # quniform yields floats such as 7.0; these parameters are cast to int.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # Previously sampled but never passed to the model.
        reg_lambda=space.get('reg_lambda', 1),
        min_child_weight=int(space['min_child_weight']),
        # Must stay a float: int() truncated every sample in [0.5, 1) to 0.
        colsample_bytree=space['colsample_bytree'],
        random_state=space.get('seed', 0),
    )

    evaluation = [(X_train, y_train), (X_test, y_test)]

    # "mlogloss" is the multi-class log loss, which suits the four-class
    # price_range target; "auc" is primarily a binary-classification metric.
    # NOTE(review): XGBoost >= 2.0 expects eval_metric / early_stopping_rounds
    # on the constructor rather than on fit() - confirm the installed version.
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="mlogloss",
            early_stopping_rounds=10, verbose=False)

    # predict() already returns class labels (0-3); thresholding them at 0.5
    # collapsed the four classes into booleans and made the accuracy meaningless.
    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK}

**Optimization Algorithm**

In [None]:
# Run 100 TPE-guided evaluations of `objective` over `space`; `trials`
# records every evaluation, and fmin returns the best configuration found.
trials = Trials()

best_hyperparams = fmin(
    fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials
)

**Printing results**

In [None]:
# Show the configuration returned by fmin.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)