In [276]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2

In [277]:
sales_data = pd.read_csv('sales_data.csv')

In [278]:
sales_data.head()

Unnamed: 0,Order_ID,Branch,Order_Date,Order_Priority,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,...,GPU,OS,OS_Version,Weight,Price,Quantity,Discount,Total_Price,Profit,Ship_Duration
0,0,Hamedan,1396-10-26,H,Dell,Vostro 3568,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,...,AMD Radeon R5 M420,Windows,10,2.18kg,3450000,1,0,3450000.0,250000,3
1,1,Kermanshah,1394-04-21,H,Lenovo,Yoga 720-13IKB,2 in 1 Convertible,"13.3""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,...,Intel HD Graphics 620,Windows,10,1.3kg,3400000,1,0,3400000.0,380000,2
2,2,Rasht,1402-05-24,M,Lenovo,IdeaPad Y700-15ISK,Notebook,"15.6""",IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,...,Nvidia GeForce GTX 960M,Windows,10,2.6kg,44900000,1,0,44900000.0,5630000,5
3,3,Kermanshah,1397-06-17,H,Dell,Inspiron 5370,Ultrabook,"13.3""",IPS Panel Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,...,AMD Radeon 530,Windows,10,1.4kg,12550000,1,0,12550000.0,1130000,3
4,4,Karaj,1392-09-30,L,Acer,Aspire E5-576G,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,...,Nvidia GeForce 940MX,Windows,10,2.23kg,2050000,1,0,2050000.0,180000,8


In [279]:
sales_data['Order_Date'] = sales_data['Order_Date'].apply(lambda x: int(''.join(x.split('-'))))
sales_data['Screen_Size'] = sales_data['Screen_Size'].apply(lambda x: float(x.split('"')[0]))
sales_data['RAM'] = sales_data['RAM'].apply(lambda x: int(x.split('GB')[0]))
sales_data['Weight'] = sales_data['Weight'].apply(lambda x: float(x.split('kg')[0]))

In [280]:
sales_data['Screen_Size'] = sales_data['Screen_Size'].apply(lambda x: int(x*10))
sales_data['Weight'] = sales_data['Weight'].apply(lambda x: int(x*100))
sales_data['Total_Price'] = sales_data['Total_Price'].apply(lambda x: int(x*10))

In [281]:
sales_data.drop(columns=['Order_ID', 'Branch', 'Order_Date', 'Order_Priority', 'Quantity', 'Total_Price'], inplace=True)

In [282]:
sales_data.columns

Index(['Manufacturer', 'Model_Name', 'Category', 'Screen_Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'OS', 'OS_Version', 'Weight', 'Price',
       'Discount', 'Profit', 'Ship_Duration'],
      dtype='object')

In [283]:
sales_data.head()

Unnamed: 0,Manufacturer,Model_Name,Category,Screen_Size,Screen,CPU,RAM,Storage,GPU,OS,OS_Version,Weight,Price,Discount,Profit,Ship_Duration
0,Dell,Vostro 3568,Notebook,156,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,AMD Radeon R5 M420,Windows,10,218,3450000,0,250000,3
1,Lenovo,Yoga 720-13IKB,2 in 1 Convertible,133,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,Windows,10,130,3400000,0,380000,2
2,Lenovo,IdeaPad Y700-15ISK,Notebook,156,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8,1TB HDD,Nvidia GeForce GTX 960M,Windows,10,260,44900000,0,5630000,5
3,Dell,Inspiron 5370,Ultrabook,133,IPS Panel Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8,256GB SSD,AMD Radeon 530,Windows,10,140,12550000,0,1130000,3
4,Acer,Aspire E5-576G,Notebook,156,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,4,256GB SSD,Nvidia GeForce 940MX,Windows,10,223,2050000,0,180000,8


In [315]:
# number of each screen_size
sales_data['Screen_Size'].value_counts()
# sales_data['Screen_Size'].describe

Screen_Size
156    543817
140    152895
133    145485
173    110889
125     26470
116     11392
120      6751
135      4681
139      3642
123      3102
101      1935
150      1855
130      1682
113      1179
154       615
170       501
141       114
184        76
Name: count, dtype: int64

In [284]:
X_encoded = sales_data.copy()

label_encoders = {}
categorical_cols = ['Manufacturer', 'Model_Name', 'Category', 'Screen', 'CPU', 'Storage', 'GPU', 'OS', 'OS_Version']
for col in categorical_cols:
    le = LabelEncoder()
    X_encoded[col + "_en"] = le.fit_transform(X_encoded[col])
    label_encoders[col] = le

X_encoded.drop(columns=['Manufacturer', 'Model_Name', 'Category', 'Screen', 'CPU', 'Storage', 'GPU', 'OS', 'OS_Version'],inplace=True)

In [285]:
X_encoded.columns

Index(['Screen_Size', 'RAM', 'Weight', 'Price', 'Discount', 'Profit',
       'Ship_Duration', 'Manufacturer_en', 'Model_Name_en', 'Category_en',
       'Screen_en', 'CPU_en', 'Storage_en', 'GPU_en', 'OS_en',
       'OS_Version_en'],
      dtype='object')

In [286]:
y = X_encoded['Profit']
X = X_encoded.drop(columns=['Profit'])

In [272]:
selector = VarianceThreshold(threshold=0.4)
X_new = selector.fit_transform(X)

In [273]:
# selector = SelectKBest(score_func=chi2, k=1)
# X_new = selector.fit_transform(X, y)

In [274]:
for i in X_encoded.columns:
    if X_encoded[i][0] == X_new[0][0]:
        print(i)
# X_new

Screen_Size


In [275]:
X_new

array([[156,   8, 218, ...,  22,   5,   0],
       [133,   8, 130, ...,  53,   5,   0],
       [156,   8, 260, ...,  89,   5,   0],
       ...,
       [156,   8, 198, ...,  53,   2,   4],
       [133,   8, 134, ...,  53,   5,   0],
       [156,   4, 229, ...,  24,   5,   0]])

In [287]:
model = RandomForestRegressor() 
model.fit(X, y)  
importance = model.feature_importances_
indices = np.argsort(importance)[::-1]

In [306]:
indices

array([ 3,  5,  7,  2, 10, 12, 11,  4,  9,  6,  0,  1,  8, 14, 13])

In [307]:
ni = -1
for i in range(len(indices)):
    ni += 1
    if indices[i] == min(indices):
        most_eff_ind = ni

In [308]:
X_encoded.iloc[:,indices[most_eff_ind]]
        

0          156
1          133
2          156
3          133
4          156
          ... 
1017076    156
1017077    156
1017078    156
1017079    133
1017080    156
Name: Screen_Size, Length: 1017081, dtype: int64

In [292]:
indices[10]

0