# Importing necessary libraries

In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from IPython.display import Image
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_tree, plot_importance
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
import warnings
warnings.filterwarnings("ignore")


# Loading data and wrangling
 

In [None]:
# Load the two splits of the mobile-price-classification dataset.
df_train = pd.read_csv("../input/mobile-price-classification/train.csv", header=0)
# The test frame previously re-read train.csv, which made df_test a copy of the
# training data.  NOTE(review): assumes test.csv sits next to train.csv -- confirm.
df_test = pd.read_csv("../input/mobile-price-classification/test.csv", header=0)

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Summarise the training frame's columns, dtypes and non-null counts.
df_train.info()

In [None]:
# Descriptive statistics for the training columns.
df_train.describe()

In [None]:
# Number of missing values per training column.
df_train.isna().sum()

In [None]:
# Report how many training rows are exact duplicates of an earlier row.
n_duplicated_rows = df_train.duplicated().sum()
print(f'train data duplicates = {n_duplicated_rows}')

## The data is clean: it has no null values and no duplicated rows

# Correlation

In [None]:
# Correlation heatmap of the raw training frame.
# np.triu keeps the upper triangle (diagonal included); seaborn hides every
# cell whose mask entry is truthy, so only the lower triangle is drawn.
corr_raw = df_train.corr()
matrix = np.triu(corr_raw)
sns.set_style("white")
f, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    corr_raw,
    mask=matrix,
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    linewidth=0.2,
    linecolor="white",
    ax=ax,
)
plt.xticks(rotation=70)
plt.yticks(rotation=0)
plt.title('Correlation Map', size=14)
plt.show()

*Correlation with the target:*
>* `ram` is highly correlated with `price_range`
* there is a correlation between `price_range` and each of `battery_power`, `px_height` and `px_width`

*Correlation between features:*
>* `three_g` is correlated with `four_g`
* `sc_w` is correlated with `sc_h`
* `px_width` is correlated with `px_height`
* *front camera* is correlated with *back camera*



# Reducing dependencies

In [None]:
# Work on copies so the feature engineering below leaves df_train / df_test untouched.
mydf_train = df_train.copy()
mydf_test = df_test.copy()

In [None]:
# Add the pixel-resolution diagonal and the screen diagonal to both frames:
# diagonal = sqrt(height**2 + width**2).
for frame in (mydf_train, mydf_test):
    frame['diag_px'] = np.sqrt(frame['px_height']**2 + frame['px_width']**2)
    frame['diag_sc'] = np.sqrt(frame['sc_h']**2 + frame['sc_w']**2)


In [None]:
# The height/width pairs are now summarised by diag_px / diag_sc, so drop them.
replaced_columns = ['px_height', 'px_width', 'sc_h', 'sc_w']
mydf_train = mydf_train.drop(columns=replaced_columns)
mydf_test = mydf_test.drop(columns=replaced_columns)

In [None]:
# Correlation heatmap of the reduced frame (diag_px / diag_sc instead of the
# height/width pairs).  The np.triu result is used as the mask, so only the
# lower triangle is drawn.
corr_reduced = mydf_train.corr()
matrix = np.triu(corr_reduced)
sns.set_style("white")
f, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    corr_reduced,
    mask=matrix,
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    linewidth=0.2,
    linecolor="white",
    ax=ax,
)
plt.xticks(rotation=70)
plt.yticks(rotation=0)
plt.title('Correlation Map', size=14)
plt.show()

## **`diag_px` and `diag_sc` added to the data**
Each one is the diagonal of a rectangle, $d = \sqrt{h^2 + w^2}$, following this [relation](https://www.omnicalculator.com/math/diagonal-of-rectangle).



In [None]:
# Optional profiling report, left disabled.
# NOTE(review): `data_train` is not defined in the visible cells -- presumably
# `df_train` was meant; confirm before re-enabling.
#data_train.profile_report()

# Cleaning the data

In [None]:
# Distribution of `ram` within each price range.
ax = sns.boxplot(data=df_train, x="price_range", y="ram", fliersize=5, palette="Set3")

The box plot shows outliers in `ram` within the price ranges, so I will try to handle them with a transformation.

In [None]:
# Three candidate transforms of `ram`, added as temporary columns for comparison:
#   lnram       - natural logarithm
#   regularized - centred on the mean and divided by the range (max - min)
#   standard    - centred on the mean and divided by the population std (ddof=0)
ram = df_train['ram']
ram_centered = ram - ram.mean()
df_train['lnram'] = np.log(ram)
df_train['regularized'] = ram_centered / (ram.max() - ram.min())
df_train['standard'] = ram_centered / ram.std(ddof=0)

In [None]:
# One box plot per `ram` variant (raw, log, mean-normalised, standardised),
# stacked vertically so they can be compared by price range.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(10, 10))
ram_variants = ("ram", "lnram", "regularized", "standard")
for column, axis in zip(ram_variants, (ax1, ax2, ax3, ax4)):
    sns.boxplot(data=df_train, x="price_range", y=column,
                palette="Set3", fliersize=5, ax=axis)

The transformations did not remove the outliers, but I will choose standardization; it seems good enough to reduce the variation.

In [None]:
# Remove the temporary comparison columns so df_train holds only the original features again.
df_train = df_train.drop(columns=["lnram", "regularized", "standard"])

In [None]:
# Standardise `ram` using statistics estimated on the training frame only and
# reuse them for the test frame, so both frames share one scale.  The test
# frame was previously standardised with its own mean/std, which would put the
# two `ram_stand` columns on different scales whenever the frames differ.
ram_mean = np.mean(mydf_train.ram)
ram_std = np.std(mydf_train.ram)
mydf_train['ram_stand'] = (mydf_train.ram - ram_mean) / ram_std
mydf_test['ram_stand'] = (mydf_test.ram - ram_mean) / ram_std

# Visualization

In [None]:
# Kernel-density estimate of `ram`, one filled curve per price range.
# `displot` is a figure-level function that creates its own figure and does not
# accept `ax=`; the `ax` that used to be passed was a leftover from an earlier
# cell, so it is removed.
sns.displot(data=df_train,
            x="ram",
            hue="price_range",
            kind="kde",
            fill=True,
            palette="tab10");

In [None]:
# Distribution of `battery_power` within each price range on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=df_train, x="price_range", y="battery_power",
            palette="Set3", fliersize=5, ax=ax);

In [None]:
# Same box plot on the natural log of `battery_power`.
# The axes created here are now passed explicitly instead of relying on the
# implicit "current axes", and the y label says that the values are logged
# (the Series keeps the name `battery_power`, which mislabelled the axis).
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(x="price_range",
            y=np.log(df_train.battery_power),
            data=df_train,
            palette="Set3",
            fliersize=5,
            ax=ax)
ax.set_ylabel("log(battery_power)");

In [None]:
# Kernel-density estimate of `battery_power`, one filled curve per price range.
sns.displot(data=df_train, x='battery_power', hue="price_range",
            kind="kde", fill=True, palette="tab10");


In [None]:
# `ram` against `battery_power`, coloured by price range, on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x='battery_power', y='ram', hue='price_range',
                data=df_train, palette="dark", alpha=0.65, ax=ax);

In [None]:
# Lower-triangle correlation heatmap of the reduced frame.
corr = mydf_train.corr()
# 1.0 on and above the diagonal, 0.0 below: seaborn hides the truthy cells.
mask = np.triu(np.ones_like(corr))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 15))
    ax = sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
                     vmin=-1, vmax=1, cmap="YlGnBu",
                     linewidth=0.2, linecolor="white", ax=ax)

In [None]:
# Lower-triangle correlation heatmap of the original training frame.
corr = df_train.corr()
# 1.0 on and above the diagonal, 0.0 below: seaborn hides the truthy cells.
mask = np.triu(np.ones_like(corr))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(15, 15))
    ax = sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
                     vmin=-1, vmax=1, cmap="YlGnBu",
                     linewidth=0.2, linecolor="white", ax=ax)

In [None]:
# Mean of the `three_g` flag per price range.
ax = sns.barplot(data=df_train, x="price_range", y="three_g")

In [None]:
# Mean of the `four_g` flag per price range.
ax = sns.barplot(data=df_train, x="price_range", y="four_g")

In [None]:
# Mean `ram` per price range, split by number of cores.
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_train, x="price_range", y="ram", hue='n_cores',
                 palette="dark", alpha=.7)

In [None]:
# List the column names that are about to be used as features / target.
df_train.columns

In [None]:
# Features are every column except the target `price_range`.
x = df_train.drop(columns="price_range")
y = df_train["price_range"]

# First hold out 20% of the rows, then split that hold-out 75/25 into a
# validation part and a final test part (15% / 5% of all rows).
X_train, X_test_1, Y_train, Y_test_1 = train_test_split(
    x, y, test_size=0.2, random_state=42)
X_valid, X_test, Y_valid, Y_test = train_test_split(
    X_test_1, Y_test_1, test_size=0.25, random_state=42)

In [None]:
# Seed constant.  NOTE(review): no visible cell reads `random_state`; the
# splits and estimators pass the literal 42 instead.
random_state = 42

# Earlier single 80/20 split, left disabled.
#X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)

In [None]:
# Sanity-check the sizes of the three splits (features first, then labels).
feature_shapes = (X_train.shape, X_valid.shape, X_test.shape)
label_shapes = (Y_train.shape, Y_valid.shape, Y_test.shape)
print(*feature_shapes)
print("\n")
print(*label_shapes)

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to the validation and test splits.  The validation split used to
# call fit_transform, which re-fitted the scaler on validation data and caused
# X_test to be scaled with validation statistics instead of training ones.
Strander = StandardScaler()
X_train = Strander.fit_transform(X_train)
X_valid = Strander.transform(X_valid)
X_test = Strander.transform(X_test)

In [None]:
# Baseline estimators keyed by display name; every estimator that accepts a
# seed gets the same one.
seeded = {"random_state": 42}
models = {
    'GaussianNB': GaussianNB(),
    'LogisticRegression': LogisticRegression(**seeded),
    'RandomForestClassifier': RandomForestClassifier(**seeded),
    'SupportVectorMachineLinear': SVC(kernel='linear', gamma='auto', **seeded),
    'SupportVectorMachineRbf': SVC(kernel='rbf', gamma='auto', **seeded),
    'MultiLinearPrecptron': MLPClassifier(**seeded),
    'DecisionTreeClassifier': DecisionTreeClassifier(**seeded),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(**seeded),
}

In [None]:
# Fit every baseline model once and collect, per model: train / validation /
# test accuracy (in percent) and the mean 10-fold cross-validation accuracy on
# the training split (in percent).
modelNames = list(models)  # model names in the same order as the `models` dict
cv_results_acc = []
trainScores = []
validationScores = []
testScores = []
best_estimators = []

for name, model in models.items():
    model.fit(X_train, Y_train)
    print("Model: {}".format(name))

    train_score = model.score(X_train, Y_train)
    print('Train score of trained model: {}'.format(train_score*100))
    trainScores.append(train_score*100)

    validation_score = model.score(X_valid, Y_valid)
    print('Validation score of trained model: {}'.format(validation_score*100))
    validationScores.append(validation_score*100)

    test_score = model.score(X_test, Y_test)
    print('Test score of trained model: {}'.format(test_score*100))
    testScores.append(test_score*100)
    print(" ")

    # Predict once and reuse.  scikit-learn metrics take (y_true, y_pred); the
    # arguments used to be swapped, which transposed the confusion matrix and
    # exchanged precision with recall in the report.
    predictions = model.predict(X_test)
    conf_matrix = confusion_matrix(Y_test, predictions)

    print('Confussion Matrix: \n{}\n'.format(conf_matrix))

    print("")
    print('Classification Report: \n{}\n'.format(classification_report(Y_test, predictions)))
    print("")

    # cross_val_score clones the estimator, so the fitted `model` is untouched.
    cv_score = cross_val_score(model, X_train, Y_train, scoring="accuracy", cv=10)
    cv_results_acc.append(cv_score.mean()*100)
    print("Cross Validation Accuracy: {}:{}".format(name, cv_score.mean()))

    print("===================================================================================")
    print("")
    print("")
    print("")

In [None]:
# Tabulate the baseline results, one row per model, in the order the models were fitted.
models_results = pd.DataFrame(
    {
        "Models": [
            'GaussianNB',
            'LogisticRegression',
            'RandomForestClassifier',
            'SupportVectorMachineLinear',
            'SupportVectorMachineRbf',
            'MultiLinearPrecptron',
            'DecisionTreeClassifier',
            'KNeighborsClassifier',
            'GradientBoostingClassifier',
        ],
        "Test_Accuracy": testScores,
        "Cross_Validation_Accuracy": cv_results_acc,
    }
)


In [None]:
# Rank the models by cross-validation accuracy and plot them as horizontal bars.
models_results = models_results.sort_values("Cross_Validation_Accuracy", ascending=False)
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=models_results["Cross_Validation_Accuracy"],
            y=models_results["Models"],
            palette="dark",
            alpha=.8,
            ax=ax)
ax.set_xlabel("Mean Accuracy")
ax.set_title("Cross Validation Scores");

## I will choose the first 5 algorithms and tune their hyperparameters to find the best model

In [None]:
# Re-split 80/20 without a separate validation part (the grid search below
# cross-validates inside the training split), then scale with statistics
# fitted on the training split only.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

Strander = StandardScaler().fit(X_train)
X_train = Strander.transform(X_train)
X_test = Strander.transform(X_test)

In [None]:
# Estimators to tune.  The order must match `classifier_param` and
# `models_best_Names`, which are indexed in parallel by the tuning loops.
max_itr = 200
# One seed for every estimator: the random forest previously used 4 while
# every other seed in the notebook is 42, which looks like a typo.
seed = 42
models_best = [LogisticRegression(random_state=seed, max_iter=max_itr),
               RandomForestClassifier(random_state=seed),
               SVC(random_state=seed, probability=True),
               MLPClassifier(random_state=seed),
               GradientBoostingClassifier(random_state=seed)]

In [None]:
# Hyper-parameter grids, one per estimator in `models_best`.

svc_param_grid = {"kernel": ["rbf", "linear"],
                  "gamma": [0.001, 0.01, 0.1, 1],
                  "C": [0.1, 1, 10, 50, 100, 200, 300, 1000]}

rf_param_grid = {"max_features": [1, 3, 10],
                 "min_samples_split": [2, 3, 10],
                 "min_samples_leaf": [1, 3, 10],
                 "bootstrap": [False],
                 "n_estimators": [100, 300],
                 "criterion": ["gini"]}

# "l1" was removed from the penalties: the LogisticRegression in `models_best`
# keeps the default lbfgs solver, which does not support an l1 penalty, so
# every l1 candidate failed to fit and only wasted search time.
# NOTE(review): newer scikit-learn releases spell the unpenalised option
# `None` instead of "none" -- adjust if the environment is upgraded.
logreg_param_grid = {"C": np.logspace(-4, 4, 20),
                     "penalty": ["l2", "none"]}

gbc_param_grid = {"learning_rate": [0.05, 0.1, 0.2],
                  "min_samples_split": [2, 3, 10],
                  "min_samples_leaf": [1, 3, 10]}

MLP_pram_grid = {"activation": ["logistic", "relu"],
                 "solver": ["adam", "sgd"]}

# Same order as `models_best`: logistic regression, random forest, SVC, MLP,
# gradient boosting.
classifier_param = [logreg_param_grid,
                    rf_param_grid,
                    svc_param_grid,
                    MLP_pram_grid,
                    gbc_param_grid]

In [None]:
# Display names for the tuned estimators, index-aligned with `models_best`.
models_best_Names = ['LogisticRegression', 'RandomForestClassifier',
                     'SupportVectorMachine', 'MultiLinearPrecptron',
                     'GradientBoostingClassifier']

In [None]:
# Tune each estimator in `models_best` with a 10-fold stratified grid search
# and report cross-validation accuracy plus hold-out metrics for the winner of
# each search.
models_best_Names = [
                     'LogisticRegression',
                     'RandomForestClassifier',
                     'SupportVectorMachine',
                     'MultiLinearPrecptron',
                     'GradientBoostingClassifier'
                      ]

cv_result = []
best_estimators = []
mean_squared_errors = []
roc_auc_scores = []
recall_scores = []
precision_scores = []
f1_scores = []
# Start empty so this run's test scores are not appended behind whatever an
# earlier cell left in `testScores`.
testScores = []

for i in range(len(models_best)):
    print("---------------------------------------------------------------------------")
    model = GridSearchCV(models_best[i],
                         param_grid=classifier_param[i],
                         cv=StratifiedKFold(n_splits=10),
                         scoring="accuracy",
                         n_jobs=-1, verbose=2)

    model.fit(X_train, Y_train)

    # Predict once with the search's best estimator and reuse the result for
    # every metric below.
    predictions = model.predict(X_test)

    cv_result.append(model.best_score_)
    mean_squared_errors.append(mean_squared_error(Y_test, predictions))
    recall_scores.append(recall_score(Y_test, predictions, average='weighted'))
    precision_scores.append(precision_score(Y_test, predictions, average='weighted'))
    f1_scores.append(f1_score(Y_test, predictions, average='weighted'))
    best_estimators.append(model.best_estimator_)

    print("")
    print("")
    print("Model: {} \n".format(models_best_Names[i]))
    print("Accuracy: %{} ".format(round(cv_result[i]*100,2)))
    print("MSE: {} ".format(mean_squared_errors[i]))
    print("Recall: {} ".format(recall_scores[i]))
    print("Precision: {} ".format(precision_scores[i]))
    print("F1-Score: {} \n".format(f1_scores[i]))

    test_score = model.score(X_test, Y_test)
    print('Test score of trained model: {}'.format(test_score*100))
    testScores.append(test_score*100)
    print(" ")

    # scikit-learn metrics take (y_true, y_pred); the arguments used to be
    # swapped, which transposed the matrix and exchanged precision with recall.
    conf_matrix = confusion_matrix(Y_test, predictions)

    print('Confussion Matrix: \n{}\n'.format(conf_matrix))

    print("")
    print('Classification Report: \n{}\n'.format(classification_report(Y_test, predictions)))
    print("")

    print("Best Estimator: {} \n".format(model.best_estimator_))
print("---------------------------------------------------------------------------")

In [None]:
# Tabulate the tuned results, one row per model.
# "Test_Accuracy" used to be the scalar `test_score`, i.e. the LAST model's
# score broadcast to every row.  Use the per-model list instead; taking the
# trailing len(cv_result) entries keeps this correct even when `testScores`
# still starts with scores appended by an earlier cell.
n_tuned = len(cv_result)
models_results = pd.DataFrame(
    {
        "Models": [
            'LogisticRegression',
            'RandomForestClassifier',
            'SupportVectorMachine',
            'MultiLinearPrecptron',
            'GradientBoostingClassifier',
        ],
        "Test_Accuracy": testScores[-n_tuned:],
        "Cross_Validation_Accuracy": cv_result,
    },
    columns=['Models', 'Test_Accuracy', 'Cross_Validation_Accuracy'],
)


In [None]:
# Rank the tuned models by cross-validation accuracy and plot them as horizontal bars.
models_results = models_results.sort_values("Cross_Validation_Accuracy", ascending=False)
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=models_results["Cross_Validation_Accuracy"],
            y=models_results["Models"],
            palette="dark",
            alpha=.8,
            ax=ax)
ax.set_xlabel("Mean Accuracy")
ax.set_title("Cross Validation Scores");

## Results for the best algorithm

In [None]:
# Report the tuned estimator with the highest cross-validation accuracy.
# Index 0 used to be hard-coded as "best", and the classification report was
# computed from `model` (the last grid search of the loop) rather than from
# `best_alg`, so the two printed reports could describe different models.
best_idx = int(np.argmax(cv_result))
best_alg = best_estimators[best_idx]
print("Best Algorithem: {} \n".format(best_alg))
print("Accuracy: %{} ".format(round(cv_result[best_idx]*100,2)))

# scikit-learn metrics take (y_true, y_pred).
predictions = best_alg.predict(X_test)
conf_matrix = confusion_matrix(Y_test, predictions)

print('Confussion Matrix: \n{}\n'.format(conf_matrix))

print("")
print('Classification Report: \n{}\n'.format(classification_report(Y_test, predictions)))

## Evaluating my feature-reduction solution

In [None]:
# Same 80/20 split and scaling as before, but on the reduced frame `mydf_train`.
x = mydf_train.drop(columns="price_range")
y = mydf_train["price_range"]
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only and reuse it for the test split.
Strander = StandardScaler().fit(X_train)
X_train = Strander.transform(X_train)
X_test = Strander.transform(X_test)

In [None]:
# Repeat the 10-fold stratified grid search on the reduced feature set.
models_best_Names = [
                     'LogisticRegression',
                     'RandomForestClassifier',
                     'SupportVectorMachine',
                     'MultiLinearPrecptron',
                     'GradientBoostingClassifier'
                      ]

cv_result = []
best_estimators = []
testScores = []
# Reset as well: without this the list still held the previous run's values,
# so `mean_squared_errors[i]` below printed the OLD run's MSE.
mean_squared_errors = []

for i in range(len(models_best)):
    print("---------------------------------------------------------------------------")
    model = GridSearchCV(models_best[i],
                         param_grid=classifier_param[i],
                         cv=StratifiedKFold(n_splits=10),
                         scoring="accuracy",
                         n_jobs=-1, verbose=2)

    model.fit(X_train, Y_train)

    # Predict once with the search's best estimator and reuse the result.
    predictions = model.predict(X_test)

    cv_result.append(model.best_score_)
    mean_squared_errors.append(mean_squared_error(Y_test, predictions))
    best_estimators.append(model.best_estimator_)

    print("")
    print("")
    print("Model: {} \n".format(models_best_Names[i]))
    print("Accuracy: %{} ".format(round(cv_result[i]*100,2)))
    print("MSE: {}\n ".format(mean_squared_errors[i]))

    test_score = model.score(X_test, Y_test)
    print('Test score of trained model: {}'.format(test_score*100))
    testScores.append(test_score*100)
    print(" ")

    # scikit-learn metrics take (y_true, y_pred); the arguments used to be
    # swapped, which transposed the matrix and exchanged precision with recall.
    conf_matrix = confusion_matrix(Y_test, predictions)

    print('Confussion Matrix: \n{}\n'.format(conf_matrix))

    print("")
    print('Classification Report: \n{}\n'.format(classification_report(Y_test, predictions)))
    print("")

    print("Best Estimator: {} \n".format(model.best_estimator_))
print("---------------------------------------------------------------------------")

In [None]:
# Tabulate the reduced-feature results, rank them by cross-validation accuracy
# and plot them as horizontal bars.
models_results = pd.DataFrame(
    {
        "Models": [
            'LogisticRegression',
            'RandomForestClassifier',
            'SupportVectorMachine',
            'MultiLinearPrecptron',
            'GradientBoostingClassifier',
        ],
        "Test_Accuracy": testScores,
        "Cross_Validation_Accuracy": cv_result,
    }
).sort_values("Cross_Validation_Accuracy", ascending=False)

fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x=models_results["Cross_Validation_Accuracy"],
            y=models_results["Models"],
            palette="dark",
            alpha=.8,
            ax=ax)
ax.set_xlabel("Mean Accuracy")
ax.set_title("Cross Validation Scores");

In [None]:
# Report the tuned estimator with the highest cross-validation accuracy on the
# reduced feature set.  Index 0 used to be hard-coded as "best", and the
# classification report was computed from `model` (the last grid search of the
# loop) rather than from `best_alg`.
best_idx = int(np.argmax(cv_result))
best_alg = best_estimators[best_idx]
print("Best Algorithem: {} \n".format(best_alg))
print("Accuracy: %{} ".format(round(cv_result[best_idx]*100,2)))

# scikit-learn metrics take (y_true, y_pred).
predictions = best_alg.predict(X_test)
conf_matrix = confusion_matrix(Y_test, predictions)

print('Confussion Matrix: \n{}\n'.format(conf_matrix))

print("")
print('Classification Report: \n{}\n'.format(classification_report(Y_test, predictions)))

# Conclusion
>* Logistic Regression is the best model for the data and its best estimator is
` LogisticRegression(C=545.5594781168514, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=200, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) `
>* The best split of the data is 80% for the training set and 20% for the testing set, without a separate validation set; validation is done inside the training set through cross-validation
>* Reducing features by replacing the pixel height and width with their diagonal, and likewise for the screen, is not effective: it reduces the accuracy of the model, although it also reduces the variation between models in classification.
>* Standardization scaling of the data is significantly effective at improving classification accuracy