In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file mounted under the Kaggle input directory,
# then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read train.csv of the mobile-price-classification dataset into a DataFrame.
train_csv_path = '/kaggle/input/mobile-price-classification/train.csv'
df_train = pd.read_csv(train_csv_path)

In [None]:
# Let pandas render up to 50 columns before eliding, then preview the first five rows.
pd.options.display.max_columns = 50
df_train.head(5)

In [None]:
# Print the frame summary: column names, non-null counts, dtypes and memory usage.
df_train.info()

In [None]:
# Number of missing values in each column (isna is the canonical name for isnull).
df_train.isna().sum()

## Since the target classes are equally represented, we don't use stratified splits

In [None]:
# How many rows fall into each price_range class.
df_train['price_range'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## Splitting the columns evenly into two datasets to plot swarmplots

In [None]:
# Positional split of the frame into two independent copies:
# columns 0-9 go to the first subset, columns 10-20 to the second.
df_plot_1 = df_train.iloc[:, :10].copy()
df_plot_2 = df_train.iloc[:, 10:21].copy()

In [None]:
# Take the target straight from the source frame, as a single-column DataFrame.
# Reading it positionally from df_plot_2 only works the first time the cell runs:
# once 'price_range' has been dropped, the "last column" is a feature instead.
y = df_train[['price_range']]

# Non-mutating drop that tolerates the column already being absent, so this cell
# can be re-run without raising KeyError.
df_plot_2 = df_plot_2.drop(columns='price_range', errors='ignore')
df_plot_1.head()


In [None]:
# Preview the first rows of the second column subset.
df_plot_2.head()

In [None]:
# (rows, columns) of the second column subset.
df_plot_2.shape

In [None]:
# (rows, columns) of the first column subset.
df_plot_1.shape

## Standardization of the features in the dataset

In [None]:
# Z-score every column of each subset: subtract the column mean, then divide by
# the column standard deviation (pandas' default sample std).
data_std_1 = df_plot_1.sub(df_plot_1.mean()).div(df_plot_1.std())
data_std_2 = df_plot_2.sub(df_plot_2.mean()).div(df_plot_2.std())

# Attach the (unscaled) target to both standardized frames.
for standardized in (data_std_1, data_std_2):
    standardized['price_range'] = y


In [None]:
# Display the first standardized subset, including its price_range column.
data_std_1

In [None]:
# Display the second standardized subset, including its price_range column.
data_std_2

## Pairplot of all the features in the dataset, to better understand how the features line up for the different price ranges

In [None]:
# Pairwise grid over every column, coloured by price range (each panel 2 inches high).
g = sns.pairplot(data=df_train, hue='price_range', height=2)
# Overlay 4-level KDE contours in dark grey on the lower triangle of the grid.
g.map_lower(sns.kdeplot, levels=4, color=".2")

In [None]:
# Show the column labels of the training frame.
df_train.columns

## After analysing the pairplot, plotting only the columns with a clear separation between the target classes (price_range)

In [None]:
# Columns placed on the x axis of the single-row grid; 'ram' is the only y variable.
ram_x_vars = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
    'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
    'touch_screen', 'wifi',
]
g = sns.pairplot(
    data=df_train,
    hue='price_range',
    x_vars=ram_x_vars,
    y_vars=['ram'],
    height=2.5,
    aspect=.5,
)


In [None]:
sns.set(style="dark", palette="bright")

# Reshape to long form: one row per (price_range, feature, standardized value),
# which is the layout swarmplot needs to put every feature on a shared x axis.
data_1 = pd.melt(data_std_2, id_vars="price_range",
                 var_name="features",
                 value_name='value')

plt.figure(figsize=(12, 12))
sns.swarmplot(x="features", y="value", hue="price_range", data=data_1)
plt.title("Standardized feature values by price range (second feature subset)")

# Feature names are long; rotate them so they do not overlap.
# Trailing semicolon suppresses the tick-object repr in the cell output.
plt.xticks(rotation=90);

In [None]:
sns.set(style="dark", palette="bright")

# Long-form view of the first standardized subset: one row per
# (price_range, feature, value) triple.
data_2 = data_std_1.melt(
    id_vars="price_range",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(12, 12))
sns.swarmplot(data=data_2, x="features", y="value", hue="price_range")

# Rotate the feature names so they stay readable.
plt.xticks(rotation=90)

## As visible from the swarmplot, RAM is indeed a very important feature for pricing

## Plotting heatmap of Correlation between features

In [None]:
# Pairwise correlation of every column, drawn as an annotated heatmap
# (values rounded to one decimal) on a large square canvas.
f, ax = plt.subplots(figsize=(18, 18))
corr_matrix = df_train.corr()
sns.heatmap(corr_matrix, ax=ax, annot=True, fmt='.1f', linewidths=.5)

## Some more visualization of battery_power, ram, px_width and px_height

In [None]:
# NOTE(review): displot is a figure-level function sized via height/aspect, so this
# rc figure size presumably does not affect the grids below — confirm. The call is
# kept because sns.set() also (re)applies the seaborn theme.
sns.set(rc={'figure.figsize': (18, 12)})

# One KDE grid per feature, with one panel (and colour) per price range.
kde_grids = [
    sns.displot(df_train, x=feature, hue="price_range", kind="kde", col='price_range')
    for feature in ("battery_power", "ram", "px_width", "px_height")
]
# Echo the last grid as the cell's output value.
kde_grids[-1]



## Checking the cross-validation scores of LogisticRegression, DecisionTree, RandomForest and Support Vector Classifier for multiclass classification, using pipelines

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,f1_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV



In [None]:
def _scaled_pipeline(estimator):
    """Return a Pipeline that standardizes the features and then applies ``estimator``.

    The model step is named 'DT' for every estimator; a later cell looks the
    random forest up as ``RF_pipeline['DT']``, so the name must stay as is.
    """
    return Pipeline(steps=[('scale', StandardScaler()), ('DT', estimator)])


DT_pipeline = _scaled_pipeline(DecisionTreeClassifier(random_state=42))
RF_pipeline = _scaled_pipeline(RandomForestClassifier(random_state=42))
SVM_pipeline = _scaled_pipeline(SVC(random_state=42))
LR_pipeline = _scaled_pipeline(LogisticRegression(random_state=42))

# Features are every column except the last; the last column is the target.
X = df_train.iloc[:, :-1]
Y = df_train.iloc[:, -1]

In [None]:
# 10-fold cross-validation of each pipeline on the full data, evaluated in the order
# decision tree, random forest, SVM, logistic regression. No `scoring` is passed,
# so each array holds the estimator's default score per fold.
DT_CROSS_VAL, RF_CROSS_VAL, SVM_CROSS_VAL, LR_CROSS_VAL = (
    cross_val_score(pipeline, X, Y, cv=10)
    for pipeline in (DT_pipeline, RF_pipeline, SVM_pipeline, LR_pipeline)
)

In [None]:
# Per-fold score arrays, shown in the order: random forest, SVM, logistic regression, decision tree.
RF_CROSS_VAL, SVM_CROSS_VAL, LR_CROSS_VAL, DT_CROSS_VAL

In [None]:
import plotly.express as ex
import plotly.graph_objs as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
# One subplot per model showing its score on every cross-validation fold.
# (trace name, subplot title, per-fold scores) in top-to-bottom order.
cv_results = [
    ('Decision Tree', 'Decision Tree Cross Val Scores', DT_CROSS_VAL),
    ('RandomForest', 'RandomForest Cross Val Scores', RF_CROSS_VAL),
    ('SVM', 'SVM Cross Val Scores', SVM_CROSS_VAL),
    ('Logistic Regression', 'Logistic Regression Cross Val Scores', LR_CROSS_VAL),
]

fig = make_subplots(
    rows=len(cv_results),
    cols=1,
    shared_xaxes=True,
    subplot_titles=tuple(title for _, title, _ in cv_results),
)

for row, (trace_name, _, scores) in enumerate(cv_results, start=1):
    # Each trace uses its own length for the fold index rather than borrowing
    # the decision tree's.
    fig.add_trace(
        go.Scatter(x=list(range(len(scores))), y=scores, name=trace_name),
        row=row, col=1,
    )

# Derive the fold count from the data so the title cannot drift from the `cv`
# value actually used when the scores were computed.
n_folds = len(DT_CROSS_VAL)
fig.update_layout(height=700, width=900,
                  title_text=f"Different Model {n_folds} Fold Cross Validation")
# The scores were computed without a `scoring` argument, so they are the
# classifiers' default score, i.e. accuracy — not F1.
fig.update_yaxes(title_text="Accuracy")
fig.update_xaxes(title_text="Fold #")

fig.show()

## confusion matrix for Logistic regression (Multiclass)

In [None]:
from sklearn.model_selection import cross_val_predict
f, ax = plt.subplots(figsize=(18, 18))
# Out-of-fold predictions: every row is predicted by a model that did not see it.
y_train_pred = cross_val_predict(LR_pipeline, X, Y, cv=3)
conf_mx = confusion_matrix(Y, y_train_pred)
# Cells are integer counts, so format them as integers rather than floats.
sns.heatmap(conf_mx, annot=True, linewidths=.5, fmt='d', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted price_range', ylabel='True price_range',
       title='Logistic Regression: 3-fold cross-validated confusion matrix');

## confusion matrix for RandomForest (Multiclass)

In [None]:
from sklearn.model_selection import cross_val_predict
f, ax = plt.subplots(figsize=(18, 18))
# Out-of-fold predictions: every row is predicted by a model that did not see it.
y_train_pred = cross_val_predict(RF_pipeline, X, Y, cv=3)
conf_mx = confusion_matrix(Y, y_train_pred)
# Cells are integer counts, so format them as integers rather than floats.
sns.heatmap(conf_mx, annot=True, linewidths=.5, fmt='d', ax=ax)
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set(xlabel='Predicted price_range', ylabel='True price_range',
       title='Random Forest: 3-fold cross-validated confusion matrix');

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing. The seed is fixed so the split — and every
# score computed from it — is identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [None]:
# Fixed evaluation order: decision tree, random forest, SVM, logistic regression.
_pipelines = (DT_pipeline, RF_pipeline, SVM_pipeline, LR_pipeline)

# Train every pipeline on the training split.
for _pipeline in _pipelines:
    _pipeline.fit(X_train, y_train)

# Class predictions on the held-out split.
DT_PRED, RF_PRED, SVM_PRED, LR_PRED = (
    _pipeline.predict(X_test) for _pipeline in _pipelines
)
_predictions = (DT_PRED, RF_PRED, SVM_PRED, LR_PRED)

# Confusion matrix of each model against the held-out labels.
DT_CM, RF_CM, SVM_CM, LR_CM = (
    confusion_matrix(y_test, _pred) for _pred in _predictions
)

# Support-weighted average of the per-class F1 scores.
DT_F1, RF_F1, SVM_F1, LR_F1 = (
    f1_score(y_test, _pred, average='weighted') for _pred in _predictions
)

## weighted f1 score on test data

In [None]:
# Bar chart comparing the weighted F1 score of the four models on the held-out split.
model_names = ['Decision Tree', 'Random Forest', 'SVM', 'Logistic Regression']
model_f1_scores = [DT_F1, RF_F1, SVM_F1, LR_F1]

fig = go.Figure(data=[go.Bar(x=model_names, y=model_f1_scores)])
fig.update_layout(
    title='F1 Score Of Our Model On Original Data',
    xaxis_title='Model',
    yaxis_title='F1 Score',
)
fig.show()

## Important features using RandomForest

In [None]:
# The fitted random forest is stored in the pipeline step named 'DT'.
rf_model = RF_pipeline['DT']

# One bar per input feature, with height equal to the forest's importance for it.
fig = go.Figure()
fig.add_trace(go.Bar(x=X_train.columns, y=rf_model.feature_importances_))
# The axes show features and their importances, not models and F1 scores.
fig.update_layout(title='The Importance Of The Original Attributes On Our Prediction',
                  xaxis_title='Feature', yaxis_title='Importance')
fig.show()

In [None]:
from sklearn.feature_selection import RFECV

# Recursive feature elimination, dropping one feature per step and scoring every
# candidate feature set with 5-fold cross-validated accuracy (the fraction of
# correct classifications).
# The forest is seeded so the selected feature set is the same on every run.
clf_rf = RandomForestClassifier(random_state=42)
rfecv = RFECV(estimator=clf_rf, step=1, cv=5, scoring='accuracy')   #5-fold cross-validation
rfecv = rfecv.fit(X_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_train.columns[rfecv.support_])

## Tuning Logistic Regression with GridSearchCV, since it had the highest F1 score

In [None]:
# Find the best logistic-regression hyperparameters by cross-validated ROC AUC.
from sklearn.model_selection import GridSearchCV
log_clf = LogisticRegression(random_state = 42)

# Standardize inside the pipeline so each CV fold is scaled using only its own
# training part, and so the search sees scaled features like the final model does.
search_pipeline = Pipeline(steps=[('scale', StandardScaler()), ('clf', log_clf)])

param_grid = {'clf__class_weight' : ['balanced', None],
              'clf__penalty' : ['l2','l1'],
              'clf__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              # Single-valued entries, listed here so they end up in best_params_:
              # 'saga' supports both l1 and l2 penalties (the default solver rejects
              # l1), and it needs a higher iteration cap to converge.
              'clf__solver' : ['saga'],
              'clf__max_iter' : [5000]}

# The target is multiclass, so the binary-only 'roc_auc' scorer cannot be used;
# 'roc_auc_ovr' averages the one-vs-rest AUC over the classes.
grid = GridSearchCV(estimator = search_pipeline, param_grid = param_grid,
                    scoring = 'roc_auc_ovr', verbose = 1, n_jobs = -1)

grid.fit(X,Y)

# Strip the 'clf__' step prefix so the dict holds plain LogisticRegression keyword
# arguments and can be unpacked directly into its constructor.
best_parameters = {name.split('__', 1)[1]: value
                   for name, value in grid.best_params_.items()}

print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(best_parameters))

In [None]:
# Learn per-feature mean and standard deviation from all of X, then apply them to X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Build the classifier from the tuned hyperparameters and train it on the
# standardized features; fit() returns the estimator itself.
log_clf = LogisticRegression(**best_parameters).fit(X_scaled, Y)
log_clf


In [None]:
# log_clf was trained on standardized features (X_scaled), so the hold-out rows must
# pass through the same fitted scaler before prediction; raw values would be on a
# different scale from anything the model has seen.
# NOTE(review): log_clf was fitted on all of X, which contains these X_test rows, so
# y_pred is not an unbiased estimate of generalization — confirm intent.
y_pred = log_clf.predict(scaler.transform(X_test))


In [None]:
# Load the unlabeled test set and remove its 'id' column so only features remain.
x_test_data = pd.read_csv('/kaggle/input/mobile-price-classification/test.csv').drop(columns='id')

# Reuse the scaler fitted on the training features. Fitting a new scaler on the test
# set would standardize it with different means and variances from the ones the model
# was trained with, shifting every feature relative to the learned coefficients.
scaling_test = scaler
x_test_scaled = scaling_test.transform(x_test_data)

In [None]:
# Predict a price_range class for every row of the scaled test features.
y_prediction= log_clf.predict(x_test_scaled)

In [None]:
# Count how many test rows were assigned to each predicted class,
# shown as a {class: count} mapping.
unique, counts = np.unique(y_prediction, return_counts=True)
{label: count for label, count in zip(unique, counts)}