In [61]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

In [28]:
# Load the cleaned smartphone dataset. The relative path uses forward slashes,
# which work on Windows and POSIX alike and avoid the invalid "\D" / "\s"
# escape sequences that a backslash-separated literal produces.
df = pd.read_csv("../Data Cleaning/smartphones_feature_selection_cleaned.csv")

In [29]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,brand_name,price,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,79990,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,16990,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,23799,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,16999,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,10490,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8


In [30]:
# Planned preprocessing per feature type (price is the regression target, not an input):
# Numerical - price, processor_speed, internal_memory, fast_charging, screen_size, refresh_rate, primary_camera_rear, primary_camera_front
# Ordinal - resolution, ram_capacity, battery_capacity
# One-hot encoding - brand_name, processor_brand, os

In [133]:
# Separate the regression target (price) from the predictor columns.
y = df['price']
X = df.drop('price', axis=1)

In [143]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
import pandas as pd

# Manual preprocessing of the feature frame X into X_scaled.
# Each transformer is fitted on all rows of X and its array output is wrapped
# back into a DataFrame that reuses X.index, so the final concat aligns on the
# original row labels rather than on a fresh 0..n-1 index.
# NOTE(review): the transformers are fitted before cross-validation, so each
# CV fold is scaled/encoded with statistics that include its held-out rows.
# Consider a Pipeline + ColumnTransformer if unbiased CV scores are required.

# Numerical features (standardised below)
numerical_features = ['processor_speed','battery_capacity','ram_capacity','internal_memory','num_rear_cameras','screen_size','refresh_rate','extended_upto','primary_camera_rear','primary_camera_front']
X_numerical = X[numerical_features]

# Features passed through the ordinal encoder.
# NOTE(review): brand_name, processor_brand, resolution and os also appear in
# one_hot_features, so X_scaled carries both an integer code and dummy columns
# for each of them. Confirm that this duplication is intended.
ordinal_features = ['brand_name','has_5g','has_nfc','fast_charging','processor_brand','resolution','os']
X_ordinal = X[ordinal_features]

# Categorical features for one-hot encoding
one_hot_features = ['brand_name','processor_brand','resolution','os']
X_one_hot = X[one_hot_features]

# Apply StandardScaler to numerical features, keeping column names and row index
scaler = StandardScaler()
X_numerical_scaled = pd.DataFrame(
    scaler.fit_transform(X_numerical),
    columns=numerical_features,
    index=X.index,
)

# Apply OrdinalEncoder; categories unseen at fit time are encoded as -1
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_ordinal_encoded = pd.DataFrame(
    ordinal_encoder.fit_transform(X_ordinal),
    columns=ordinal_features,
    index=X.index,
)

# Apply OneHotEncoder (first level of each feature dropped, dense output)
one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='infrequent_if_exist')
X_one_hot_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(X_one_hot),
    columns=one_hot_encoder.get_feature_names_out(one_hot_features),
    index=X.index,
)

# Concatenate all the transformed features column-wise
X_scaled = pd.concat([X_numerical_scaled, X_ordinal_encoded, X_one_hot_encoded], axis=1)

# X_scaled is the preprocessed feature matrix to use for model training


In [144]:
# Preview the first five rows of the preprocessed feature matrix.
X_scaled.head(n=5)

Unnamed: 0,processor_speed,battery_capacity,ram_capacity,internal_memory,num_rear_cameras,screen_size,refresh_rate,extended_upto,primary_camera_rear,primary_camera_front,brand_name,has_5g,has_nfc,fast_charging,processor_brand,resolution,os,brand_name_asus,brand_name_blackberry,brand_name_gionee,brand_name_google,brand_name_honor,brand_name_htc,brand_name_huawei,brand_name_ikall,brand_name_infinix,brand_name_iqoo,brand_name_itel,brand_name_jio,brand_name_lava,brand_name_lenovo,brand_name_lg,brand_name_meizu,brand_name_micromax,brand_name_motorola,brand_name_nokia,brand_name_nothing,brand_name_nubia,brand_name_oneplus,brand_name_oppo,brand_name_poco,brand_name_realme,brand_name_samsung,brand_name_sony,brand_name_tecno,brand_name_tesla,brand_name_vivo,brand_name_xiaomi,processor_brand_dimensity,processor_brand_exynos,processor_brand_fusion,processor_brand_google,processor_brand_helio,processor_brand_intel,processor_brand_jlq,processor_brand_kirin,processor_brand_mediatek,processor_brand_sc9863a,processor_brand_snapdragon,processor_brand_spreadtrum,processor_brand_tiger,processor_brand_unisoc,resolution_FHD+,resolution_HD,resolution_HD+,resolution_QHD,resolution_UHD,os_ios,os_other
0,1.564643,-0.438184,1.62283,0.891281,-0.858658,0.568101,0.834963,-0.763061,-0.043744,1.320423,22.0,1.0,1.0,25.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.448708,0.306656,0.349979,0.891281,0.430987,0.269146,0.834963,-0.763061,0.343309,1.320423,28.0,0.0,1.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.356632,0.306656,0.349979,0.891281,0.430987,0.269146,0.834963,-0.763061,-0.043744,-0.058411,31.0,1.0,0.0,25.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.448708,0.306656,-0.922871,-0.225341,0.430987,-0.192876,-0.160544,1.306535,-0.099038,-0.058411,25.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.851378,0.00872,-0.922871,-0.783653,0.430987,-0.192876,-0.160544,1.306535,-0.099038,-0.747829,31.0,1.0,0.0,4.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
# Model the target on the log(1 + price) scale.
y_log = y.pipe(np.log1p)

In [146]:
# 10-fold cross-validation of a plain linear regression on the log-price target.
# Shuffling with a fixed random_state keeps the fold assignment reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=LinearRegression(),
    X=X_scaled,
    y=y_log,
    scoring='r2',
    cv=kfold,
)

In [147]:
# Mean and standard deviation of the per-fold R^2 scores.
(np.mean(scores), np.std(scores))

(0.8872505511357607, 0.015317669145214513)

In [148]:
# Linear model to be fitted on the full dataset (intercept included, the default).
lr = LinearRegression(fit_intercept=True)

In [149]:
# Fit on every row of the preprocessed features against the log-price target.
lr.fit(X=X_scaled, y=y_log)

In [150]:
# Shape of the fitted coefficient array (one entry per feature column).
np.shape(lr.coef_)

(69,)

In [151]:
# Tabulate the fitted coefficients next to their feature names.
# Built directly from the column labels and the flattened coefficient array,
# so it keeps working when the number of features changes (no hard-coded 69).
coef_df = pd.DataFrame({
    'features': list(X_scaled.columns),
    'coef': np.ravel(lr.coef_),
})

In [152]:
# Display the feature / coefficient table of the fitted linear model.
coef_df

Unnamed: 0,features,coef
0,processor_speed,0.145443
1,battery_capacity,-0.114276
2,ram_capacity,0.114576
3,internal_memory,0.069788
4,num_rear_cameras,0.077708
5,screen_size,0.122028
6,refresh_rate,0.005482
7,extended_upto,-0.045148
8,primary_camera_rear,0.042114
9,primary_camera_front,0.017647


In [58]:
# Mean battery capacity on the original (unscaled) data.
df.loc[:, 'battery_capacity'].mean()

4794.146529562982

In [84]:
# Sample standard deviation of processor speed on the original (unscaled) data.
df.loc[:, 'processor_speed'].std()

0.49679086920049714

In [157]:
# Scratch calculation: multiplies the hard-coded value 0.344097 by std(price),
# divides by the std of the brand_name_google dummy column, then subtracts
# 6557.490304.
# NOTE(review): the origin of both constants is not shown in this notebook —
# presumably a coefficient and an offset copied from an earlier run; confirm.
# NOTE(review): the model is fitted on log1p(price), so rescaling by the std
# of raw price may not give a meaningful back-transformation — verify intent.
(0.344097 * df['price'].std())/X_scaled['brand_name_google'].std()- 6557.490304

105030.02292862264

In [161]:
# Scratch calculation: multiplies the hard-coded value 0.344097 by the ratio
# std(y) / std(ordinal-encoded brand_name column), then adds 2557.490304.
# NOTE(review): the origin of both constants is not shown in this notebook,
# and the offset differs in sign and size from the similar brand_name_google
# calculation — confirm which, if either, is intended.
(0.344097 * (y.std()/X_scaled['brand_name'].std())) + 2557.490304

1171.9538025919596

In [132]:
# Display the raw (un-encoded) feature frame, i.e. df without the price column.
X

Unnamed: 0,brand_name,has_5g,has_nfc,processor_brand,processor_speed,ram_capacity,internal_memory,battery_capacity,fast_charging,num_rear_cameras,screen_size,resolution,refresh_rate,os,extended_upto,primary_camera_rear,primary_camera_front
0,oneplus,1,1,dimensity,3.2,12,256,4500,67,2,6.78,FHD,120,other,0,50,32
1,tecno,0,1,helio,2.2,8,256,5000,33,3,6.67,FHD,120,android,0,64,32
2,xiaomi,1,0,dimensity,2.6,8,256,5000,67,3,6.67,FHD,120,android,0,50,16
3,realme,1,0,dimensity,2.2,4,128,5000,18,3,6.5,FHD,90,android,1024,48,16
4,xiaomi,1,0,snapdragon,2.0,4,64,4800,18,3,6.5,FHD,90,android,1024,48,8
5,lenovo,1,1,snapdragon,2.84,16,512,5000,120,2,6.5,FHD,144,android,0,64,20
6,samsung,1,1,snapdragon,2.4,8,128,5000,25,3,6.7,FHD,120,android,1024,64,32
7,micromax,0,0,tiger,1.8,3,32,5000,24,1,6.52,HD,60,android,256,8,5
8,sony,1,1,snapdragon,2.2,6,128,5000,21,3,6.1,FHD,60,android,0,48,8
9,xiaomi,0,0,helio,2.0,6,128,6000,18,4,6.5,FHD,90,android,512,50,8


## Regression Analysis

In [119]:
import statsmodels.api as sm

# Ordinary least squares on the log-price target for inference (p-values,
# confidence intervals). statsmodels does not add an intercept by itself,
# so a constant column is prepended to the design matrix first.
X_with_const = sm.add_constant(X_scaled)
model = sm.OLS(endog=y_log, exog=X_with_const).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.898
Model:                            OLS   Adj. R-squared:                  0.895
Method:                 Least Squares   F-statistic:                     320.9
Date:                Mon, 19 Feb 2024   Prob (F-statistic):               0.00
Time:                        02:08:33   Log-Likelihood:                -152.44
No. Observations:                2334   AIC:                             430.9
Df Residuals:                    2271   BIC:                             793.5
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               