In [12]:
import pandas as pd
import numpy as np

In [13]:
# Read the mobile price dataset from a CSV file in the working directory
csv_path = 'mobile_price.csv'
raw_data = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame
raw_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [14]:
# Report the dimensions of the raw frame as (rows, columns)
n_rows, n_cols = raw_data.shape
print((n_rows, n_cols))


(2000, 21)


In [15]:
# Summarise the raw frame: column names, non-null counts and dtypes
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [16]:
# Work on an independent (deep) copy so raw_data stays untouched
data = raw_data.copy()

In [17]:
# Store the target column with object dtype; all other columns keep their dtypes
data = data.astype({'price_range': 'object'})

In [18]:
# Tally the null entries in each column
missing_values = data.isna().sum()

# Show the per-column totals
print(missing_values)

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64


In [19]:
# Preview the working copy after the dtype change on 'price_range'
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [20]:
from sklearn.model_selection import train_test_split

# Predictors are every column except the target; the target is cast to int64
X = data.drop(columns=['price_range'])
y = data['price_range'].astype('int64')

# Hold out 40% of the rows for testing and keep 60% for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Settings for the random forest used to rank features
forest_params = {'n_estimators': 10000, 'random_state': 0, 'n_jobs': -1}
clf = RandomForestClassifier(**forest_params)

# Fit the forest on the training split (last expression, so the cell shows the estimator)
clf.fit(X_train, y_train)


RandomForestClassifier(n_estimators=10000, n_jobs=-1, random_state=0)

In [22]:
# Column names of the predictor frame, in column order
feat_labels = X.columns.values

# Collect (feature name, importance) pairs from the fitted forest
feature_importance = list(zip(feat_labels, clf.feature_importances_))

In [23]:
from sklearn.feature_selection import SelectFromModel

# Selector that uses the random forest's importances with a 0.01 threshold
# to decide which features to keep
sfm = SelectFromModel(estimator=clf, threshold=0.01)

# Fit the selector on the training split
# NOTE(review): prefit is not set, so this presumably trains the forest again
# rather than reusing the already-fitted clf -- confirm against the sklearn docs
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=10000, n_jobs=-1,
                                                 random_state=0),
                threshold=0.01)

In [24]:
# Translate the column positions kept by the selector into feature names
kept_positions = sfm.get_support(indices=True)
selected_features = [feat_labels[pos] for pos in kept_positions]

# Restrict the full dataset to the kept columns and preview the result
data_selected = data[selected_features]
data_selected.head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time
0,842,2.2,1,7,0.6,188,2,2,20,756,2549,9,7,19
1,1021,0.5,0,53,0.7,136,3,6,905,1988,2631,17,3,7
2,563,0.5,2,41,0.9,145,5,6,1263,1716,2603,11,2,9
3,615,2.5,0,10,0.8,131,6,9,1216,1786,2769,16,8,11
4,1821,1.2,13,44,0.6,141,2,14,1208,1212,1411,8,2,15


In [25]:
# Display the names of the features kept by the selector
selected_features

['battery_power',
 'clock_speed',
 'fc',
 'int_memory',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram',
 'sc_h',
 'sc_w',
 'talk_time']

In [29]:
# Export the selected feature columns to 'scale.csv' without the positional
# row index. index=False does this directly; promoting 'battery_power' to the
# index instead would raise KeyError if that column were ever not selected and
# would move it to the first CSV column regardless of its original position.
data_selected.to_csv('scale.csv', index=False)

In [26]:
# Min-max scaling of the selected features (a rescaling, not z-score standardisation)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler on every row of the selected features.
# NOTE(review): this fit happens before the later train/test split, so rows
# that end up in the test set also contribute to the scaler -- confirm this is intended.
scaler.fit(X=data_selected)

MinMaxScaler()

In [27]:
# Fit the min-max scaler on data_selected again (it was already fitted when it
# was created) and keep the scaled values of all rows for modelling
data_standardised = scaler.fit_transform(data_selected)

In [32]:
from sklearn.model_selection import train_test_split

# Modelling inputs: the scaled selected features and the int64 target
X = data_standardised
y = data['price_range'].astype('int64')

# Keep 30% of the rows aside for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=10
)

# Print the shape of each of the four splits, one per line
split_shapes = (
    ("X_train ", X_train),
    ("X_test ", X_test),
    ("y_train ", y_train),
    ("y_test ", y_test),
)
for label, part in split_shapes:
    print(label, part.shape)

X_train  (1400, 14)
X_test  (600, 14)
y_train  (1400,)
y_test  (600,)


In [33]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and 'ovo' decision function shape
svm = SVC(decision_function_shape='ovo', kernel="linear")

# Train on the scaled training split (last expression, so the cell shows the estimator)
svm.fit(X_train, y_train)

SVC(decision_function_shape='ovo', kernel='linear')

In [34]:
# Predict the labels of the held-out test rows
y_pred_svm = svm.predict(X_test)

from sklearn import metrics

# Precision, recall and F1 are macro-averaged over the classes
macro = {"average": "macro"}
svm_scores = {
    'Model': "SVM",
    'Precision Score': metrics.precision_score(y_test, y_pred_svm, **macro),
    'Recall Score': metrics.recall_score(y_test, y_pred_svm, **macro),
    'Accuracy Score': metrics.accuracy_score(y_test, y_pred_svm),
    'f1-score': metrics.f1_score(y_test, y_pred_svm, **macro),
}

# One labelled entry per metric, in the order listed above
svm_metrics = pd.Series(svm_scores)

In [35]:
# Create the results table from the SVM metrics: a single column with one row per metric
result_tabulation = svm_metrics.to_frame()

# Display the results table
result_tabulation

Unnamed: 0,0
Model,SVM
Precision Score,0.956965
Recall Score,0.957732
Accuracy Score,0.956667
f1-score,0.957261


In [36]:
import pickle

# Persist the fitted SVM classifier to 'Mobile_price.pkl'. The context manager
# closes the file handle even if pickling raises, so the handle is never leaked
# and the bytes are flushed to disk.
# NOTE(review): only the classifier is saved. It was trained on min-max scaled
# features, and the scaler is not part of this file, so inputs must be scaled
# the same way before calling predict on the loaded model.
with open('Mobile_price.pkl', 'wb') as model_file:
    pickle.dump(svm, model_file)