# Preprocessing and machine-learning model for the Devices-Price-Classification-System project

In [1]:
# Mount Google Drive at /content/drive (Colab only); the CSV files read below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## importing libraries

In [2]:
import os
import pickle
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pyplot import figure
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer

In [3]:
# Make runs reproducible: seed the random number generators the notebook relies on.
def set_random_seed(seed=7):
    """Seed Python's ``random`` module and NumPy's global generator.

    Seeding both with the same value makes the random numbers they produce
    repeat from one run to the next, so results can be reproduced.

    Args:
        seed (int): Seed to be used. Defaults to 7.

    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Notebook-wide seed: applied to the global generators here and passed as
# random_state to the imputer and the classifier further down.
seed=7
set_random_seed(seed=seed)

## load data

In [4]:
# Training data: contains the feature columns and the "price_range" label.
df = pd.read_csv("/content/drive/MyDrive/maids.cc/data/train - train.csv")
# Separate test file (has an "id" column); predictions for it are written to submission.csv at the end.
test_data = pd.read_csv("/content/drive/MyDrive/maids.cc/data/test - test.csv")

split the label column from the feature columns

In [5]:
# label column naming it (y)
y = df["price_range"]
#feature columns naming them x
x = df.drop("price_range", axis=1)

# number of features i will use
k = 20  # i tested many values, but almost all values less than 20 lead to poor results

#### Use `train_test_split` to split the training data into a training set and a validation set

In [6]:
# Hold out 10% of the rows as a validation set (xtest / ytest). The split is
# stratified on the label and reuses the notebook-wide seed, so it is reproducible
# and stays in step with every other random_state in the notebook.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=seed, stratify=y, shuffle=True)

In [7]:
#we Will test the top k important features and train the model on them to get the best features for getting higher accuracy
most_important = SelectKBest(mutual_info_classif, k=k)
most_important.fit(xtrain.fillna(0), ytrain)
most_important_cols = xtrain.columns[most_important.get_support()]
#most_important_cols.scores_

In [8]:
# use only the best K featues and drop the other to reduce curse of dimentionality
xtrain = xtrain[most_important_cols]
xtest = xtest[most_important_cols]
# tried to increase the data, but it doesn't provide any gain to the performance
#xtrain = pd.concat([xtrain]*7, ignore_index=True)
#ytrain = pd.concat([ytrain]*7, ignore_index=True)

In [9]:
# All preprocessing is expressed as column transformers so that it can live inside
# the model pipeline. This has two advantages: the preprocessing takes part in the
# fine-tuning (cross-validation) process, and the same steps do not have to be
# repeated by hand for the validation / test data.

# Step 1: rescale "ram" to the range [0, 3]; every other column is passed through
# unchanged. ColumnTransformer places the transformed column(s) first, so "ram"
# is expected to be column 0 of the output.
preprocessor_1 = make_column_transformer((MinMaxScaler(feature_range=(0, 3)), ["ram"]), remainder="passthrough")

# Step 2: fill in missing values across the first k columns.
preprocessor_2 = make_column_transformer(
    #(KNNImputer(), slice(0, k)),  # this choice was tested, but it led to poor results
     (IterativeImputer(random_state=seed, tol=1e-5, max_iter=500), slice(0, k)),
     remainder="passthrough")

# Step 3: standardize columns 1..k-1; column 0 is passed through as is
# (presumably because it holds the "ram" values already rescaled in step 1).
preprocessor_3 = make_column_transformer((StandardScaler(), slice(1, k)),
                                         remainder="passthrough")

#### machine learning is a search problem

My guess was that a linear model would perform well, since linear models tend to do so when the amount of training data is small.

In [10]:
# Several algorithms were tested; the rejected ones are kept here as comments:
#   clf = KNeighborsClassifier()    # led to poor results
#   clf = SVC(random_state=seed)    # led to poor results
#   clf = GaussianNB()              # led to poor results
# Logistic regression performed best and is the one that is used.
clf = LogisticRegression(
    penalty=None,
    solver='lbfgs',  # solver='sag' was tried as an alternative
    tol=1e-7,
    max_iter=500,
    n_jobs=-1,
    random_state=seed,
)

# Full model: the three preprocessing steps followed by the classifier.
pipe = make_pipeline(preprocessor_1, preprocessor_2, preprocessor_3, clf)

# Many parameter values were tried; only the best ones were kept (they are set
# directly on the classifier above). Log of the grids that were explored -- the
# trailing number is the score the author noted, presumably accuracy in %:
#   "logisticregression__penalty": ['l1', 'l2', 'elasticnet', None]
#   "logisticregression__solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'saga', 'sag']
#   "logisticregression__C": [0.1, 10., 1.]
#   "logisticregression__multi_class": ['auto', 'ovr', 'multinomial']
#   solver ['lbfgs'],           penalty ['l2', None]                      -> 96.23
#   solver ['liblinear'],       penalty ['l2', 'l1']                      -> 82
#   solver ['newton-cg'],       penalty ['l2', None]                      -> 95.7
#   solver ['newton-cholesky'], penalty ['l2', None]                      -> 83
#   solver ['sag'],             penalty ['l2', None]                      -> 96.7
#   solver ['saga'],            penalty ['l2', None, 'elasticnet', 'l1']  -> 96
# The grid is left empty, so the search only cross-validates the pipeline exactly
# as configured above.
param_grid = {}
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=33,
    refit=True,
    return_train_score=True,
)

In [11]:
# to show up the parameters names in case i need to test any of them
#pipe.get_params().keys()

In [12]:
# train the model and then print the training accuracy
search.fit(xtrain, ytrain)
score = search.best_score_
score

0.9677685950413224

In [13]:
# cross-validation score of the best candidate (the same attribute displayed by the previous cell)
search.best_score_

0.9677685950413224

In [14]:
# the best pipeline found by the search (kept on the search object because refit=True)
search.best_estimator_

In [15]:
# all parameters of the best pipeline, including those of its nested steps (deep=True)
search.best_estimator_.get_params(deep=True)

{'memory': None,
 'steps': [('columntransformer-1',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('minmaxscaler',
                                    MinMaxScaler(feature_range=(0, 3)), ['ram'])])),
  ('columntransformer-2',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('iterativeimputer',
                                    IterativeImputer(max_iter=500, random_state=7,
                                                     tol=1e-05),
                                    slice(0, 20, None))])),
  ('columntransformer-3',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('standardscaler', StandardScaler(),
                                    slice(1, 20, None))])),
  ('logisticregression',
   LogisticRegression(max_iter=500, n_jobs=-1, penalty=None, random_state=7,
                      tol=1e-07))],
 'verbose': False,
 'columntransformer-1': ColumnTransformer(remainder='passthrough',
 

#### model performance

In [16]:
# Evaluate the best pipeline on the held-out validation split (xtest / ytest).
best_model = search.best_estimator_
# predictions for the validation rows
ypreds = best_model.predict(xtest)
# accuracy: share of predictions that match the true label
acc = accuracy_score(ytest, ypreds)
# classification report as printable text (per-class precision, recall, f1-score and support)
clf_rport = classification_report(ytest, ypreds)
# f1 score = 2*recall*precision/(recall+precision); average=None gives one value per class
f1score = f1_score(ytest, ypreds, average=None)
# Note: the f1-score metric is useful when dealing with an imbalanced dataset
print(f" accuracy = {acc}, \nf1score = {f1score}, \n\nclassification_report = {clf_rport}")

 accuracy = 0.99, 
f1score = [0.99009901 0.98989899 0.99009901 0.98989899], 

classification_report =               precision    recall  f1-score   support

           0       0.98      1.00      0.99        50
           1       1.00      0.98      0.99        50
           2       0.98      1.00      0.99        50
           3       1.00      0.98      0.99        50

    accuracy                           0.99       200
   macro avg       0.99      0.99      0.99       200
weighted avg       0.99      0.99      0.99       200



Although predicting price_range is easy here (the problem is simple and clear, and the output column is balanced), the training set is small. I suggest increasing the amount of training data to improve the accuracy further.

In [17]:
# Predict the price range for the separate test file. Only the feature columns the
# pipeline was trained on are passed in, so the extra "id" column is never handed
# to the model; it is kept aside to label the rows of the submission.
ypred = best_model.predict(test_data[most_important_cols])
index = test_data["id"]
my_submission = pd.DataFrame({'id': index, "price_range": ypred})
# written to the current working directory
my_submission.to_csv('submission.csv', index=False)

In [18]:
# to change to this directory
%cd /content/drive/MyDrive/maids.cc/data/savings/
#################################################################################
#save and load and use model using joblib
#################################################################################
# # # #save the trained model to disk
LogReg_filename = 'LogisticRegressionModel.joblib'
# # # save the model
#joblib.dump(best_model, LogReg_filename)
# # # to load the model from disk
best_model = joblib.load(LogReg_filename)
############
# # # inference process
labels = best_model.predict(xtest)
print(f"accuracy = {float(sum(labels==ytest)/len(ytest))*100:.2f}%")
#################################################################################
#################################################################################


#################################################################################
#save and load and use model using pickle
#################################################################################
# # # another way to save the trained model
LogReg_filename = 'LogisticRegressionModel.pkl'
# # # save the model
#pickle.dump(best_model, open(LogReg_filename, 'wb'))
# # # # to load the model from disk
best_model = pickle.load(open(LogReg_filename, 'rb'))
############################
# # # inference process
labels = best_model.predict(xtest)
print(f"accuracy = {float(sum(labels==ytest)/len(ytest))*100:.2f}%")

/content/drive/MyDrive/maids.cc/data/savings
accuracy = 99.00%
accuracy = 99.00%
