In [1]:
import numpy as np 
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/laptop-price-dataset-april-2024/raw_ebay.csv
/kaggle/input/laptop-price-dataset-april-2024/cleaned.csv


In [2]:
!pip install wolta

Collecting wolta
  Downloading wolta-0.2.3-py3-none-any.whl.metadata (23 kB)
Collecting imblearn (from wolta)
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading wolta-0.2.3-py3-none-any.whl (17 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn, wolta
Successfully installed imblearn-0.0 wolta-0.2.3


# Data Init and First Look

In [3]:
df = pd.read_csv('/kaggle/input/laptop-price-dataset-april-2024/raw_ebay.csv')

In [4]:
df.head()

Unnamed: 0,Brand,Product_Description,Screen_Size,RAM,Processor,GPU,GPU_Type,Resolution,Condition,Price
0,Lenovo,Lenovo ThinkPad L15 Gen 2 15.6” FHD Laptop Cor...,15.6,16.0,Intel Core i5-1135G7,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080,Very Good - Refurbished,344.99
1,Lenovo,Lenovo ThinkPad 14” HD Laptop PC Computer Core...,14.0,16.0,Intel Core i5 7th Gen.,Intel HD Graphics 520,Integrated/On-Board Graphics,,Very Good - Refurbished,189.99
2,Lenovo,"Lenovo ThinkPad E14 14"" Laptop Core i5 11th Ge...",14.0,16.0,Intel Core i5-1135G7,Intel Iris Xe Graphics,Integrated/On-Board Graphics,1920 x 1080p,Good - Refurbished,289.99
3,Dell,"Dell Latitude 15.6"" Laptop Intel Core i5 64GB ...",,,Intel Core i5 8th Gen.,Intel UHD Graphics 620,Integrated/On-Board Graphics,1920 x 1080,Very Good - Refurbished,349.99
4,ASUS,"🔥NEW Asus Vivobook 15.6"" TouchScreen Laptop In...",15.6,16.0,Intel Core i7-1255U,Intel Iris Xe Graphics,,1920 x 1080,New,538.95


In [5]:
df.shape

(3981, 10)

In [6]:
from wolta.data_tools import col_types

types = col_types(df, print_columns=True)

Brand: str
Product_Description: str
Screen_Size: str
RAM: str
Processor: str
GPU: str
GPU_Type: str
Resolution: str
Condition: str
Price: float64


In [7]:
from wolta.data_tools import seek_null

seeked = seek_null(df, print_columns=True)

Brand has 32 null values
Product_Description has 1 null values
Screen_Size has 196 null values
RAM has 251 null values
Processor has 101 null values
GPU has 734 null values
GPU_Type has 723 null values
Resolution has 661 null values
Price has 1 null values


In [8]:
from wolta.data_tools import unique_amounts

unique_amounts(df)

{'Brand': 47,
 'Product_Description': 3981,
 'Screen_Size': 64,
 'RAM': 28,
 'Processor': 336,
 'GPU': 305,
 'GPU_Type': 20,
 'Resolution': 77,
 'Condition': 7,
 'Price': 1528}

# Dealing With Null Values

here is the list of the actions that we will take:

| feature | action |
| --- | --- |
| Brand | nan to unknown |
| Product_Description | nan to 0 |
| Screen Size | nan to mean |
| Ram | nan to unknown |
| Processor | nan to unknown |
| GPU | nan to unknown |
| GPU_Type | nan to unknown |
| Resolution | nan to unknown |
| Price | delete sample |

In [9]:
df['Brand'] = df['Brand'].replace({np.nan: 'unknown'})
df['RAM'] = df['RAM'].replace({np.nan: 'unknown'})
df['Processor'] = df['Processor'].replace({np.nan: 'unknown'})
df['GPU'] = df['GPU'].replace({np.nan: 'unknown'})
df['GPU_Type'] = df['GPU_Type'].replace({np.nan: 'unknown'})
df['Resolution'] = df['Resolution'].replace({np.nan: 'unknown'})

df['Product_Description'] = df['Product_Description'].replace({np.nan: 0})

in order to calculate and replace with mean value, we need to convert column from string to float

In [10]:
total = 0
count = 0

for i in range(df.shape[0]):
    if df['Screen_Size'].values[i] == 'Does' or df['Screen_Size'].values[i] == 'N\A' or df['Screen_Size'].values[i] == 'Not' or df['Screen_Size'].values[i] == 'Unknown':
        df['Screen_Size'].values[i] = np.nan
         
    elif str(type(df['Screen_Size'].values[i])).__contains__('str'):
        df['Screen_Size'].values[i] = float(df['Screen_Size'].values[i].replace('"', '').replace('in.', '').replace('in', '').replace('\'\'', '').replace('-ch', '').replace('-', '').replace('and', ''))
        
        total += df['Screen_Size'].values[i]
        count += 1

        
mean = total / count
print('total {}'.format(str(total)))
print('count {}'.format(str(count)))
print('mean {}'.format(str(mean)))

total 54951.54999999927
count 3763
mean 14.603122508636531


In [11]:
df['Screen_Size'] = df['Screen_Size'].replace({np.nan: mean})

  df['Screen_Size'] = df['Screen_Size'].replace({np.nan: mean})


In [12]:
tf = df['Price'].isna()
that = -1

for i in range(len(tf)):
    if tf[i] == True:
        that = i
        print(i)

3412


In [13]:
df = df.drop(that)

In [14]:
from wolta.data_tools import seek_null

seeked = seek_null(df, print_columns=True)

# Data Manipulation

| feature | action |
| --- | --- | 
| Brand | make categorical |
| Product_Description | calculate length of strings |
| RAM | make categorical |
| Processor | make categorical |
| GPU | make categorical |
| GPU_Type | make categorical |
| Resolution | make categorical |
| Condition | make categorical |

In [15]:
for i in range(df.shape[0]):
    if str(type(df['Product_Description'].values[i])).__contains__('str'):
        df['Product_Description'].values[i] = len(df['Product_Description'].values[i])

In [16]:
print(df['Brand'].unique())

['Lenovo' 'Dell' 'ASUS' 'HP' 'Acer' 'Microsoft' 'Razer' 'MSI' 'Apple'
 'Samsung' 'Panasonic' 'LG' 'Geo' 'unknown' 'DELL' 'LENOVO' 'Gateway'
 'LG Electronics' 'Huawei' 'Getac' 'MICROSOFT' 'Google' 'Dell Inc.' 'Asus'
 'ThinkPad' 'acer' 'Chuwi' 'Sony' 'Unbranded' 'VAIO' 'ByteSpeed'
 'Dell gaming games game' 'Eurocom' 'Sager' 'GIGABYTE' 'Alienware' 'AVITA'
 'Hewlett Packard' 'Intel' 'Dell Latitude' 'HP Commercial Remarketing'
 'Dell Commercial' 'Ruggon' 'Lenovo Idea' 'AORUS' 'Microsoft Surface'
 'SAMSUNG']


In [17]:
df['Brand'] = df['Brand'].replace({
    'Lenovo': 'lenovo',
    'Dell': 'dell',
    'ASUS': 'asus',
    'HP': 'hp',
    'Acer': 'acer',
    'Microsoft': 'microsoft',
    'Razer': 'razer',
    'MSI': 'msi',
    'Apple': 'apple',
    'Samsung': 'samsung',
    'Panasonic': 'panasonic',
    'LG': 'lg',
    'Geo': 'geo',
    'DELL': 'dell',
    'LENOVO': 'lenovo',
    'Gateway': 'gateway',
    'LG Electronics': 'lg',
    'Huawei': 'huawei',
    'Getac': 'getac',
    'MICROSOFT': 'microsoft',
    'Google': 'google',
    'Dell Inc.': 'dell',
    'Asus': 'asus',
    'ThinkPad': 'thinkpad',
    'Chuwi': 'chuwi',
    'Sony': 'sony',
    'Unbranded': 'unknown',
    'VAIO': 'vaio',
    'ByteSpeed': 'bytespeed',
    'Dell gaming games game': 'dell',
    'Eurocom': 'eurocom',
    'Sager': 'sager',
    'GIGABYTE': 'gigabyte',
    'Alienware': 'alienware',
    'AVITA': 'avita',
    'Dell Latitude': 'dell',
    'HP Commercial Remarketing': 'hp',
    'Dell Commercial': 'dell',
    'Lenovo Idea': 'lenovo',
    'Microsoft Surface': 'microsoft',
    'SAMSUNG': 'samsung'
})

In [18]:
print(df['Brand'].unique())

['lenovo' 'dell' 'asus' 'hp' 'acer' 'microsoft' 'razer' 'msi' 'apple'
 'samsung' 'panasonic' 'lg' 'geo' 'unknown' 'gateway' 'huawei' 'getac'
 'google' 'thinkpad' 'chuwi' 'sony' 'vaio' 'bytespeed' 'eurocom' 'sager'
 'gigabyte' 'alienware' 'avita' 'Hewlett Packard' 'Intel' 'Ruggon' 'AORUS']


In [19]:
print(df['RAM'].unique())

['16' 'unknown' 'Up' '8' '32' '16gb' '8gb' '16GB' '24' '4' '8GB' '4GB'
 '40' '64' '128' '32gb' '32GB' '12' '4GB,' 'up' '512' '8GB,' '64gb' '2050'
 'upto' '20' '256' '16GB,']


In [20]:
df['RAM'] = df['RAM'].replace({
    'Up': 'unknown',
    '16gb': '16',
    '8gb': '8',
    '16GB': '16',
    '8GB': '8',
    '4GB': '4',
    '32gb': '32',
    '32GB': '32',
    '4GB': '4',
    'up': 'unknown',
    '8GB': '8',
    '64gb': '64',
    'upto': 'unknown',
    '16GB,': '16',
    '4GB,': '4',
    '8GB,': '8'
})

In [21]:
print(df['RAM'].unique())

['16' 'unknown' '8' '32' '24' '4' '40' '64' '128' '12' '512' '2050' '20'
 '256']


In [22]:
from wolta.data_tools import make_numerics

df['Processor'] = make_numerics(df['Processor'])
df['GPU'] = make_numerics(df['GPU'])
df['Resolution'] = make_numerics(df['Resolution'])
df['Condition'] = make_numerics(df['Condition'])
df['GPU_Type'] = make_numerics(df['GPU_Type'])
df['RAM'] = make_numerics(df['RAM'])
df['Brand'] = make_numerics(df['Brand'])

In [23]:
df.head()

Unnamed: 0,Brand,Product_Description,Screen_Size,RAM,Processor,GPU,GPU_Type,Resolution,Condition,Price
0,0,80,15.6,0,0,0,0,0,0,344.99
1,0,79,14.0,0,1,1,0,1,0,189.99
2,0,77,14.0,0,0,0,0,2,1,289.99
3,1,78,14.603123,1,2,2,0,0,0,349.99
4,2,77,15.6,0,3,0,1,0,2,538.95


# Last Analysis

In [24]:
df.describe()

Unnamed: 0,Brand,Screen_Size,RAM,Processor,GPU,GPU_Type,Resolution,Condition,Price
count,3980.0,3980.0,3980.0,3980.0,3980.0,3980.0,3980.0,3980.0,3980.0
mean,2.260804,14.603123,1.454271,46.161558,36.027638,0.535176,4.29196,2.13593,521.743198
std,2.982958,1.247801,1.600191,72.258293,68.707357,1.177838,9.541075,1.270963,241.283545
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,14.0,0.0,7.0,0.0,0.0,0.0,1.0,324.5625
50%,1.0,14.0,2.0,16.0,3.0,0.0,0.0,2.0,498.85
75%,3.0,15.6,2.0,38.0,22.0,1.0,4.0,3.0,700.0
max,31.0,39.6,13.0,335.0,304.0,19.0,76.0,5.0,999.99


In [25]:
from wolta.data_tools import stat_sum

stat_sum(df,
        ['max', 'min', 'width', 'med', 'var'])

Brand
max: 31
min: 0
width: 31
median: 1.0
variance: 8.895800358576803
***
Product_Description
max: 80
min: 15
width: 65
median: 78.0
variance: 45.96316658670244
***
Screen_Size
max: 39.6
min: 0.0
width: 39.6
median: 14.0
variance: 1.5566157815385722
***
RAM
max: 13
min: 0
width: 13
median: 2.0
variance: 2.5599691926971544
***
Processor
max: 335
min: 0
width: 335
median: 16.0
variance: 5219.949024708972
***
GPU
max: 304
min: 0
width: 304
median: 3.0
variance: 4719.514814019847
***
GPU_Type
max: 19
min: 0
width: 19
median: 0.0
variance: 1.3869536122825181
***
Resolution
max: 76
min: 0
width: 76
median: 0.0
variance: 91.00923183757985
***
Condition
max: 5
min: 0
width: 5
median: 2.0
variance: 1.6149402161561575
***
Price
max: 999.99
min: 0.0
width: 999.99
median: 498.85
variance: 58203.121517030944
***


In [28]:
# Save the Pandas DataFrame to a CSV file
output_path = "/kaggle/working/preprocessed_laptop_data.csv"

# Use the to_csv method to write the DataFrame to a CSV file
df.to_csv(output_path, index=False)

print(f"Preprocessed data saved to {output_path}")

Preprocessed data saved to /kaggle/working/preprocessed_laptop_data.csv
