In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Load the laptop listings from the CSV file, relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="dataset/laptop.csv")

In [3]:
# Peek at three randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(n=3)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
806,806,Toshiba,Notebook,15.6,1366x768,Intel Core i5 6200U 2.3GHz,4GB,500GB HDD,Intel HD Graphics 520,Windows 10,2.1kg,41558.4
8,8,Asus,Ultrabook,14.0,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,16GB,512GB SSD,Nvidia GeForce MX150,Windows 10,1.3kg,79653.6
536,536,Dell,Notebook,15.6,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,AMD Radeon 530,Windows 10,2.2kg,42486.0048


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Company', 'TypeName', 'Inches', 'ScreenResolution',
       'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight', 'Price'],
      dtype='object')

In [5]:
# Print the per-column non-null counts and dtypes, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


# Data Preprocessing

### 1. Removing column 'Unnamed: 0'.

In [6]:
# Remove the redundant 'Unnamed: 0' column.
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the already-removed column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Spot-check two random rows after the column drop.
df.sample(n=2)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1040,Lenovo,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.8kg,45128.16
97,Dell,Notebook,15.6,Full HD 1920x1080,Intel Core i3 6006U 2GHz,4GB,256GB SSD,AMD Radeon R5 M430,Linux,2.2kg,25840.8


#### Checking for null values and duplicate rows

In [8]:
def count_data(data):
    """Summarise missing values and duplicate rows of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the columns:

        * ``'Count Missing'`` - number of null cells in that column.
        * ``'Percent Missing'`` - the same as a percentage of the row count.
        * ``'Count Duplicate Rows'`` - number of fully duplicated rows in the
          whole frame (first occurrences are not counted).
        * ``'Percent Duplicate Rows'`` - the same as a percentage of the row
          count.

        The two duplicate figures are frame-wide values, so they are repeated
        unchanged on every row of the result. For a frame with zero rows both
        percentages are reported as 0.0 instead of dividing by zero.
    """
    n_rows = data.shape[0]

    # Compute each statistic once; both the count and the percentage reuse it.
    missing_per_column = data.isnull().sum()
    duplicate_rows = data.duplicated().sum()

    def _as_percent(count):
        # Guard the empty-frame case: there is nothing to take a share of.
        if n_rows == 0:
            return count * 0.0
        return count * 100 / n_rows

    return pd.DataFrame(
        {
            'Count Missing': missing_per_column,
            'Percent Missing': _as_percent(missing_per_column),
            'Count Duplicate Rows': duplicate_rows,
            'Percent Duplicate Rows': _as_percent(duplicate_rows),
        }
    )

In [9]:
# Missing-value and duplicate-row summary before any de-duplication.
count_data(data=df)

Unnamed: 0,Count Missing,Percent Missing,Count Duplicate Rows,Percent Duplicate Rows
Company,0,0.0,29,2.225633
TypeName,0,0.0,29,2.225633
Inches,0,0.0,29,2.225633
ScreenResolution,0,0.0,29,2.225633
Cpu,0,0.0,29,2.225633
Ram,0,0.0,29,2.225633
Memory,0,0.0,29,2.225633
Gpu,0,0.0,29,2.225633
OpSys,0,0.0,29,2.225633
Weight,0,0.0,29,2.225633


There are 29 duplicate rows (about 2.2% of the 1,303 rows in the dataset).

In [10]:
# keep=False flags every member of a duplicated group, so `all_rows` counts the
# rows involved in duplication (first occurrences included), not the dataset size.
all_rows = df.duplicated(keep=False).sum()
# The default keep='first' flags only the repeats that follow a first occurrence.
duplicate_rows = df.duplicated().sum()
# What remains are the first occurrences that have at least one repeat.
count_non_duplicate_rows = all_rows - duplicate_rows

print(f"Out of {all_rows} rows, {count_non_duplicate_rows} are original and {duplicate_rows} are duplicate rows")

Out of 43 rows, 14 are original and 29 are duplicate rows


### Removing duplicate rows

In [14]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Reassigning instead of inplace=True avoids silently mutating the frame that
# earlier cells displayed. Surviving rows keep their original index labels, so
# the index has gaps after this step.
df = df.drop_duplicates()
print("Duplicate Rows: ", df.duplicated().sum())
df.shape

Duplicate Rows:  0


(1274, 11)

In [16]:
# Re-run the summary to confirm no missing values or duplicate rows remain.
count_data(data=df)

Unnamed: 0,Count Missing,Percent Missing,Count Duplicate Rows,Percent Duplicate Rows
Company,0,0.0,0,0.0
TypeName,0,0.0,0,0.0
Inches,0,0.0,0,0.0
ScreenResolution,0,0.0,0,0.0
Cpu,0,0.0,0,0.0
Ram,0,0.0,0,0.0
Memory,0,0.0,0,0.0
Gpu,0,0.0,0,0.0
OpSys,0,0.0,0,0.0
Weight,0,0.0,0,0.0


- The 'Unnamed: 0' column has been removed.
- Found 29 duplicate rows and removed them successfully.
- There are now no duplicate rows or null values left in the dataset.