In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("..\Data Cleaning\smartphones_feature_selection_cleaned.csv")

In [3]:
df.head(10)

Unnamed: 0,brand_name,price,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,79990,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,16990,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,23799,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,16999,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,10490,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8
5,lenovo,59999,1,1,snapdragon,2.84,16,512,5000,120,2,6.5,FHD,144,android,0,64,20
6,samsung,23990,1,1,snapdragon,2.4,8,128,5000,25,3,6.7,FHD,120,android,1024,64,32
7,micromax,5790,0,0,tiger,1.8,3,32,5000,24,1,6.52,HD,60,android,256,8,5
8,sony,39999,1,1,snapdragon,2.2,6,128,5000,21,3,6.1,FHD,60,android,0,48,8
9,xiaomi,13458,0,0,helio,2.0,6,128,6000,18,4,6.5,FHD,90,android,512,50,8


In [4]:
X = df.drop(columns=['price'])
y = df['price']

In [5]:
# Applying the log1p transformation to the variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [6]:
columns_to_encode = ['brand_name','has_5g','has_nfc','fast_charging','processor_brand','resolution','os']

In [7]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [8]:
preprocessor = ColumnTransformer(
    transformers = [
        ("num",StandardScaler(),['processor_speed','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ("cat",OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),columns_to_encode)
    ],
    remainder='passthrough'
)

In [9]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor',LinearRegression())
])

In [10]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [11]:
scores.mean(), scores.std()

(0.8355291237167648, 0.022872578479514075)

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [13]:
pipeline.fit(X_train,y_train)

In [14]:
y_pred = pipeline.predict(X_test)

In [15]:
y_pred = np.expm1(y_pred)

In [16]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

8248.818173530139

In [17]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [18]:
from sklearn.linear_model import Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [19]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [20]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [21]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [22]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.9098,5839.043528
6,extra trees,0.906215,5920.10683
10,xgboost,0.916762,6105.192028
7,gradient boosting,0.905757,6217.243863
1,svr,0.858525,7651.295602
8,adaboost,0.833894,7798.761975
4,decision tree,0.826421,8016.265773
2,ridge,0.835534,8248.169903
0,linear_reg,0.835529,8248.818174
9,mlp,0.838375,8534.40766


### OneHotEncoding

In [23]:
X.head()

Unnamed: 0,brand_name,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8


In [24]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='infrequent_if_exist'), ['brand_name','processor_brand','resolution','os'])
    ], 
    remainder='passthrough'
)

In [25]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [26]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [27]:
scores.mean()

0.8873522353490957

In [28]:
scores.std()

0.015220176178572947

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [30]:
pipeline.fit(X_train,y_train)

In [31]:
y_pred = pipeline.predict(X_test)



In [32]:
y_pred = np.expm1(y_pred)

In [33]:
mean_absolute_error(np.expm1(y_test),y_pred)

6557.490304378323

In [34]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [35]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [36]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [37]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.913348,5849.357002
10,xgboost,0.91786,5890.106124
6,extra trees,0.911761,5982.978041
7,gradient boosting,0.907429,6169.372203
0,linear_reg,0.887352,6557.490304
2,ridge,0.888689,6584.313334
1,svr,0.872373,7257.019611
9,mlp,0.866486,7262.330748
4,decision tree,0.837512,7312.043304
8,adaboost,0.839537,7839.968461


### OneHotEncoding with PCA

In [52]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed','ram_capacity','internal_memory','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'), ['brand_name','processor_brand','os'])
    ], 
    remainder='passthrough'
)

In [53]:
from sklearn.decomposition import PCA

In [54]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [55]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [56]:
scores.mean()

0.6544060287826597

In [57]:
scores.std()

0.03667427435803322

In [58]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [59]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [60]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [61]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [62]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.856788,7342.901826
5,random forest,0.841762,7793.463217
7,gradient boosting,0.831369,7988.270146
10,xgboost,0.847834,8013.393462
1,svr,0.795002,8421.314512
4,decision tree,0.718149,9965.538206
8,adaboost,0.683833,10377.420516
0,linear_reg,0.654406,10841.128469
2,ridge,0.654407,10841.273555
9,mlp,0.568027,13193.567602


### Target Encoder

In [63]:
from sklearn.preprocessing import TargetEncoder
import category_encoders as ce

In [64]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed','ram_capacity','internal_memory','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False), ['processor_brand','os']),
        ('target_enc', ce.TargetEncoder(), ['brand_name','fast_charging'])
    ], 
    remainder='passthrough'
)

In [65]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [66]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [67]:
scores.mean(),scores.std()

(0.8660520009439073, 0.017033265538299296)

In [68]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [69]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [70]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [71]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.911447,5742.090764
10,xgboost,0.917132,5772.499431
5,random forest,0.916277,5879.592382
7,gradient boosting,0.907804,6024.019136
9,mlp,0.84867,7244.569486
0,linear_reg,0.866052,7283.205903
2,ridge,0.866115,7315.576896
8,adaboost,0.840938,7433.754677
4,decision tree,0.851404,7511.156064
1,svr,0.823602,9042.939015


### Hyperparamter Tuning

In [72]:
from sklearn.model_selection import GridSearchCV

In [92]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__bootstrap' : [True],
    'regressor__max_samples': [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['log2', 'sqrt']
}

In [93]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed','ram_capacity','internal_memory','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False), ['processor_brand','os']),
        ('target_enc', ce.TargetEncoder(), ['brand_name','fast_charging'])
    ], 
    remainder='passthrough'
)

In [94]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [95]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [96]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [97]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


Parameters: { "bootstrap", "max_features", "max_samples" } are not used.



In [98]:
final_pipe = search.best_estimator_

In [99]:
search.best_params_

{'regressor__bootstrap': True,
 'regressor__max_depth': None,
 'regressor__max_features': 'log2',
 'regressor__max_samples': 0.1,
 'regressor__n_estimators': 100}

In [100]:
search.best_score_

0.9171317962589249

### Extracting the Model

In [208]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['processor_speed','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['brand_name','has_5g','has_nfc','fast_charging','processor_brand','resolution','os']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='infrequent_if_exist'), ['brand_name','processor_brand','resolution','os'])
    ], 
    remainder='passthrough'
)

In [209]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [210]:
pipeline.fit(X,y_transformed)

In [211]:
import pickle
with open('..\pipeline.pkl','wb') as file:
    pickle.dump(pipeline,file)

In [222]:

df.head(10)

Unnamed: 0,brand_name,price,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,79990,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,16990,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,23799,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,16999,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,10490,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8
5,lenovo,59999,1,1,snapdragon,2.84,16,512,5000,120,2,6.5,FHD,144,android,0,64,20
6,samsung,23990,1,1,snapdragon,2.4,8,128,5000,25,3,6.7,FHD,120,android,1024,64,32
7,micromax,5790,0,0,tiger,1.8,3,32,5000,24,1,6.52,HD,60,android,256,8,5
8,sony,39999,1,1,snapdragon,2.2,6,128,5000,21,3,6.1,FHD,60,android,0,48,8
9,xiaomi,13458,0,0,helio,2.0,6,128,6000,18,4,6.5,FHD,90,android,512,50,8


In [220]:
with open('..\whole_df.pkl','wb') as file:
    pickle.dump(df,file)

### Trying out the predictions

In [91]:
X.columns

Index(['brand_name', 'rating', 'has_5g', 'has_nfc', 'processor_brand',
       'processor_speed', 'ram_capacity', 'internal_memory',
       'battery_capacity', 'fast_charging', 'num_rear_cameras', 'screen_size',
       'resolution', 'refresh_rate', 'os', 'extended_upto',
       'primary_camera_rear', 'primary_camera_front'],
      dtype='object')

In [112]:
X.iloc[0].values

array(['oneplus', 'best', 1, 1, 'dimensity', 3.2, 12, 256, 4500, 67, 2,
       6.78, 'FHD', 120, 'other', 0, 50, 32], dtype=object)

In [126]:
X.sample(5)

Unnamed: 0,brand_name,rating,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
1691,motorola,better,1,1,snapdragon,3.2,8,128,4450,125,3,6.67,FHD,144,android,0,200,60
1464,xiaomi,best,0,0,helio,2.05,8,128,5000,33,4,6.43,FHD,90,android,512,108,16
2091,realme,best,1,0,dimensity,2.4,8,128,5000,33,3,6.5,FHD,90,android,1024,64,16
745,lava,better,0,0,helio,2.0,3,32,5000,10,3,6.52,HD+,60,android,512,13,8
1757,samsung,Excellent,0,0,helio,2.2,8,128,5000,25,3,6.5,FHD,60,android,1024,50,13


In [117]:
X.columns

Index(['brand_name', 'rating', 'has_5g', 'has_nfc', 'processor_brand',
       'processor_speed', 'ram_capacity', 'internal_memory',
       'battery_capacity', 'fast_charging', 'num_rear_cameras', 'screen_size',
       'resolution', 'refresh_rate', 'os', 'extended_upto',
       'primary_camera_rear', 'primary_camera_front'],
      dtype='object')

In [148]:
data = [['realme','best',1,0,'dimensity',2.4,6,128,5000,33,3,6.5,'FHD',90,'android',1024,64,16]]
columns = ['brand_name', 'rating', 'has_5g', 'has_nfc', 'processor_brand',
       'processor_speed', 'ram_capacity', 'internal_memory',
       'battery_capacity', 'fast_charging', 'num_rear_cameras', 'screen_size',
       'resolution', 'refresh_rate', 'os', 'extended_upto',
       'primary_camera_rear', 'primary_camera_front']

one_df = pd.DataFrame(data,columns = columns)

In [149]:
one_df

Unnamed: 0,brand_name,rating,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,realme,best,1,0,dimensity,2.4,6,128,5000,33,3,6.5,FHD,90,android,1024,64,16


In [150]:
np.expm1(pipeline.predict(one_df))

array([17311.05], dtype=float32)

In [156]:
X['brand_name'].unique()

array(['oneplus', 'tecno', 'xiaomi', 'realme', 'lenovo', 'samsung',
       'micromax', 'sony', 'poco', 'vivo', 'motorola', 'huawei', 'honor',
       'itel', 'oppo', 'asus', 'gionee', 'infinix', 'apple', 'iqoo',
       'nokia', 'nubia', 'lava', 'ikall', 'google', 'nothing', 'lg',
       'meizu', 'blackberry', 'htc', 'jio', 'tesla'], dtype=object)

In [160]:
np.ceil(X['processor_speed'].max())

4.0

In [168]:
df['brand_name'].str.capitalize().unique()

array(['Oneplus', 'Tecno', 'Xiaomi', 'Realme', 'Lenovo', 'Samsung',
       'Micromax', 'Sony', 'Poco', 'Vivo', 'Motorola', 'Huawei', 'Honor',
       'Itel', 'Oppo', 'Asus', 'Gionee', 'Infinix', 'Apple', 'Iqoo',
       'Nokia', 'Nubia', 'Lava', 'Ikall', 'Google', 'Nothing', 'Lg',
       'Meizu', 'Blackberry', 'Htc', 'Jio', 'Tesla'], dtype=object)

In [169]:
X['internal_memory'].value_counts()

internal_memory
128     1084
256      534
64       414
32       175
512       79
16        31
1024      10
8          6
258        1
Name: count, dtype: int64

In [188]:
X['internal_memory'].value_counts().index.sort_values().to_list()

[8, 16, 32, 64, 128, 256, 258, 512, 1024]

In [190]:
X['battery_capacity'].value_counts().index.sort_values().to_list()

[1600,
 1715,
 1821,
 1900,
 2000,
 2018,
 2050,
 2100,
 2110,
 2200,
 2227,
 2230,
 2350,
 2400,
 2470,
 2500,
 2510,
 2600,
 2650,
 2691,
 2730,
 2750,
 2800,
 2815,
 2915,
 2942,
 2980,
 3000,
 3010,
 3020,
 3040,
 3060,
 3080,
 3095,
 3100,
 3110,
 3140,
 3200,
 3240,
 3260,
 3274,
 3279,
 3300,
 3334,
 3340,
 3349,
 3400,
 3410,
 3430,
 3440,
 3450,
 3500,
 3505,
 3600,
 3687,
 3700,
 3760,
 3765,
 3800,
 3900,
 4000,
 4020,
 4025,
 4030,
 4045,
 4050,
 4100,
 4150,
 4190,
 4200,
 4230,
 4235,
 4250,
 4260,
 4290,
 4300,
 4310,
 4315,
 4320,
 4325,
 4350,
 4352,
 4355,
 4383,
 4385,
 4400,
 4410,
 4422,
 4430,
 4440,
 4450,
 4460,
 4470,
 4500,
 4520,
 4575,
 4600,
 4610,
 4620,
 4700,
 4730,
 4750,
 4800,
 4805,
 4810,
 4815,
 4820,
 4821,
 4830,
 4850,
 4860,
 4870,
 4880,
 4900,
 4950,
 4980,
 5000,
 5003,
 5020,
 5050,
 5060,
 5065,
 5080,
 5100,
 5150,
 5160,
 5180,
 5200,
 5240,
 5300,
 5400,
 5450,
 5500,
 5580,
 5600,
 5800,
 6000,
 6500,
 6700,
 7000,
 7100]

In [192]:
X['fast_charging'].value_counts().index.sort_values().to_list()

[8,
 10,
 14,
 15,
 18,
 19,
 20,
 21,
 22,
 24,
 25,
 27,
 30,
 33,
 35,
 38,
 40,
 44,
 45,
 50,
 55,
 57,
 60,
 65,
 66,
 67,
 68,
 73,
 80,
 88,
 90,
 100,
 120,
 125,
 135,
 150,
 165,
 180,
 200,
 210,
 240,
 250,
 260]

In [193]:
X['num_rear_cameras'].unique()

array([2, 3, 1, 4], dtype=int64)

In [195]:
X['screen_size'].value_counts().index.sort_values().to_list()

[4.0,
 4.5,
 4.7,
 5.0,
 5.2,
 5.3,
 5.4,
 5.42,
 5.45,
 5.5,
 5.7,
 5.8,
 5.83,
 5.9,
 5.92,
 5.99,
 6.0,
 6.08,
 6.09,
 6.1,
 6.12,
 6.14,
 6.18,
 6.2,
 6.21,
 6.22,
 6.26,
 6.28,
 6.3,
 6.34,
 6.35,
 6.36,
 6.38,
 6.39,
 6.4,
 6.41,
 6.43,
 6.44,
 6.47,
 6.49,
 6.5,
 6.51,
 6.52,
 6.53,
 6.55,
 6.56,
 6.57,
 6.58,
 6.59,
 6.6,
 6.61,
 6.62,
 6.64,
 6.66,
 6.67,
 6.68,
 6.69,
 6.7,
 6.71,
 6.72,
 6.73,
 6.74,
 6.75,
 6.76,
 6.77,
 6.78,
 6.79,
 6.8,
 6.81,
 6.82,
 6.83,
 6.9,
 6.92,
 6.95,
 7.0,
 7.09,
 7.1,
 7.11,
 7.2,
 7.3,
 7.4,
 7.56,
 7.6,
 7.63,
 7.71,
 7.8,
 7.82,
 7.85,
 7.9,
 7.92,
 8.0,
 8.01,
 8.02,
 8.03,
 8.2,
 8.3]

In [196]:
X['resolution'].unique()

array(['FHD', 'HD', 'HD+', 'QHD', 'FHD+', 'UHD'], dtype=object)

In [197]:
X['refresh_rate'].unique()

array([120,  90, 144,  60, 165], dtype=int64)

In [198]:
X['os'].unique()

array(['other', 'android', 'ios'], dtype=object)

In [200]:
X['extended_upto'].value_counts().index.sort_values().to_list()

[0, 32, 64, 128, 256, 400, 512, 1024, 2048]

In [202]:
X['primary_camera_rear'].value_counts().index.sort_values().to_list()

[2,
 5,
 8,
 12,
 13,
 16,
 19,
 20,
 21,
 24,
 25,
 32,
 40,
 48,
 50,
 54,
 64,
 100,
 108,
 160,
 180,
 200]

In [214]:
X.sample(30)

Unnamed: 0,brand_name,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
193,infinix,0,0,tiger,1.82,4,64,5000,33,2,6.7,FHD,90,android,1024,13,8
530,realme,1,0,dimensity,2.5,6,128,5000,33,3,6.4,FHD,90,android,0,48,16
2020,vivo,0,0,dimensity,2.4,8,128,4000,33,3,6.44,FHD,90,android,0,64,44
2152,lenovo,1,1,snapdragon,3.0,12,256,5500,125,2,6.92,FHD,144,android,0,64,44
2045,vivo,0,0,helio,2.0,6,128,5000,44,2,6.64,FHD,60,android,0,50,8
2323,xiaomi,1,1,snapdragon,3.2,8,128,5000,120,3,6.67,QHD,120,android,0,108,20
1469,samsung,1,1,snapdragon,3.2,12,512,4400,25,3,7.6,QHD,120,android,0,50,10
1832,apple,1,1,bionic,3.22,6,256,3279,25,2,6.1,HD,60,ios,0,12,12
21,itel,0,0,unisoc,1.3,2,32,4000,18,3,6.6,HD+,60,android,32,8,8
727,realme,0,0,sc9863a,1.6,4,64,5000,33,1,6.52,HD,60,android,0,8,5


In [223]:
df.head()

Unnamed: 0,brand_name,price,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,79990,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,16990,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,23799,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,16999,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,10490,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8


In [None]:
new_df = df.drop(columns=['brand_name','price'])
new_df

In [226]:
new_df[['processor_brand','resolution','os']]

Unnamed: 0,processor_brand,resolution,os
0,dimensity,FHD,other
1,helio,FHD,android
2,dimensity,FHD,android
3,dimensity,FHD,android
4,snapdragon,FHD,android
...,...,...,...
2329,snapdragon,FHD,android
2330,exynos,FHD,android
2331,helio,HD,android
2332,snapdragon,FHD,android
