In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scikeras.wrappers import KerasClassifier

from sklearn import datasets, linear_model
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from keras.models import Sequential
from keras.layers import Dense, Dropout


In [50]:
# Load the dataset

train_df = pd.read_csv('dataset/train.csv')
test_df = pd.read_csv('dataset/test.csv')

In [3]:
train_df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
train_df.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [5]:
X = train_df.drop(columns=['price_range'])
y = train_df['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

# Logistic Regression

In [74]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear']
}

# Create a GridSearchCV object
lr_model = GridSearchCV(estimator=linear_model.LogisticRegression(max_iter=1000), param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
lr_model.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", lr_model.best_params_)
print("Best accuracy found: ", lr_model.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'C': 100, 'solver': 'lbfgs'}
Best accuracy found:  0.9650000000000001


In [75]:
lr_y_pred = lr_model.predict(X_test)

print('Accuracy: ', accuracy_score(y_test, lr_y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, lr_y_pred))

class_report = classification_report(y_test, lr_y_pred)
print('Classification Report:\n', class_report)

Accuracy:  0.96
Confusion Matrix: 
 [[101   4   0   0]
 [  0  91   0   0]
 [  0   7  85   0]
 [  0   0   5 107]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98       105
           1       0.89      1.00      0.94        91
           2       0.94      0.92      0.93        92
           3       1.00      0.96      0.98       112

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400



In [51]:
trueTest = test_df.drop(columns=['id'])

In [52]:
trueTest.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1043,1,1.8,1,14,0,5,0.1,193,3,16,226,1412,3476,12,7,2,0,1,0
1,841,1,0.5,1,4,1,61,0.8,191,5,12,746,857,3895,6,0,7,1,0,0
2,1807,1,2.8,0,1,0,27,0.9,186,3,4,1270,1366,2396,17,10,10,0,1,1
3,1546,0,0.5,1,18,1,25,0.5,96,8,20,295,1752,3893,10,0,7,1,1,0
4,1434,0,1.4,0,11,1,49,0.5,108,6,18,749,810,1773,15,8,7,1,0,1


In [53]:
trueTest = StandardScaler().fit_transform(trueTest)

In [59]:
tt_lr_y_pred = lr_model.predict(trueTest)

In [68]:
# Add the y_pred results as a new column to the trueTest dataframe
test_df_with_target_lr = test_df.copy()
test_df_with_target_lr['predicted_price_range'] = tt_lr_y_pred

# Display the updated dataframe
test_df_with_target_lr.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,predicted_price_range
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,226,1412,3476,12,7,2,0,1,0,3
1,2,841,1,0.5,1,4,1,61,0.8,191,...,746,857,3895,6,0,7,1,0,0,3
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,1270,1366,2396,17,10,10,0,1,1,2
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,295,1752,3893,10,0,7,1,1,0,3
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,749,810,1773,15,8,7,1,0,1,1


# SVC

In [57]:
# Define the parameter grid for SVC
param_grid_svc = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Create a GridSearchCV object for SVC
svc_model = GridSearchCV(estimator=SVC(), param_grid=param_grid_svc, cv=5, n_jobs=-1, verbose=1)

# Fit the grid search to the data
svc_model.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", svc_model.best_params_)
print("Best accuracy found: ", svc_model.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters found:  {'C': 100, 'gamma': 1, 'kernel': 'linear'}
Best accuracy found:  0.9625


In [71]:
svc_y_pred = svc_model.predict(X_test)

In [73]:
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, svc_y_pred)
print('Confusion Matrix:\n', conf_matrix)

# Calculate classification report
class_report = classification_report(y_test, svc_y_pred)
print('Classification Report:\n', class_report)

Confusion Matrix:
 [[103   2   0   0]
 [  0  91   0   0]
 [  0   9  82   1]
 [  0   0   5 107]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99       105
           1       0.89      1.00      0.94        91
           2       0.94      0.89      0.92        92
           3       0.99      0.96      0.97       112

    accuracy                           0.96       400
   macro avg       0.96      0.96      0.96       400
weighted avg       0.96      0.96      0.96       400



In [69]:
tt_svc_y_pred = lr_model.predict(trueTest)

In [70]:
# Add the y_pred results as a new column to the trueTest dataframe
test_df_with_target_svc = test_df.copy()
test_df_with_target_svc['predicted_price_range'] = tt_svc_y_pred

# Display the updated dataframe
test_df_with_target_svc.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,predicted_price_range
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,226,1412,3476,12,7,2,0,1,0,3
1,2,841,1,0.5,1,4,1,61,0.8,191,...,746,857,3895,6,0,7,1,0,0,3
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,1270,1366,2396,17,10,10,0,1,1,2
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,295,1752,3893,10,0,7,1,1,0,3
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,749,810,1773,15,8,7,1,0,1,1


# Neural Network

In [None]:
# Define the model
def create_model(optimizer='adam', dropout_rate=0.2):
    model = Sequential()
    model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, activation='softmax'))  # Output layer with softmax for 4 classes
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Create the KerasClassifier
model = KerasClassifier(model=create_model, verbose=0)

# Define the parameter grid for the neural network
param_grid_nn = {
    'model__optimizer': ['adam', 'rmsprop'],
    'model__dropout_rate': [0.2, 0.5],
    'batch_size': [32, 64],
    'epochs': [10, 20]
}

# Create GridSearchCV object for the neural network
nn_model = GridSearchCV(estimator=model, param_grid=param_grid_nn, cv=3, verbose=1, n_jobs=-1)

In [None]:
# Fit the grid search to the data
nn_model.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", nn_model.best_params_)
print("Best accuracy found: ", nn_model.best_score_)

In [None]:
# Predict on the test set
nn_y_pred = nn_model.predict(X_test)

# Evaluate the model
print('Accuracy: ', accuracy_score(y_test, nn_y_pred))
print('Confusion Matrix: \n', confusion_matrix(y_test, nn_y_pred))
print('Classification Report:\n', classification_report(y_test, nn_y_pred))

In [None]:
# Predict on the true test set
tt_nn_y_pred = nn_model.predict(trueTest)

# Add the predictions to the true test DataFrame
test_df_with_target_nn = test_df.copy()
test_df_with_target_nn['predicted_price_range'] = tt_nn_y_pred

# Display the updated DataFrame
test_df_with_target_nn.head()