In [2]:
import pandas as pd
import re
from sklearn.pipeline import  Pipeline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Read the raw laptop listings; the file is decoded as latin-1.
df = pd.read_csv(
    "../data/laptop_price.csv",
    encoding="latin-1",
)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,2537.45
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,1803.6


<h4>shape of dataframe</h4>

In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1303, 13)

<h3>Describe dataframe</h3>

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,laptop_ID,Inches,Price_euros
count,1303.0,1303.0,1303.0
mean,660.155794,15.017191,1123.686992
std,381.172104,1.426304,699.009043
min,1.0,10.1,174.0
25%,331.5,14.0,599.0
50%,659.0,15.6,977.0
75%,990.5,15.6,1487.88
max,1320.0,18.4,6099.0


<h3>INFO</h3>

In [5]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


<h4>checking for null values</h4>

In [6]:
# Count the missing entries in every column.
df.isna().sum()

laptop_ID           0
Company             0
Product             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price_euros         0
dtype: int64

<h5>^ The dataframe has no null values</h5>
<h3 style= "color: green;">  find categorical and numerical features</h3>

In [7]:
# Partition the column names by dtype: "object" columns are treated as
# categorical, every other dtype as numerical.
column_dtypes = df.dtypes
numerical_features = [
    name for name, kind in column_dtypes.items() if kind != "object"
]
categorical_features = [
    name for name, kind in column_dtypes.items() if kind == "object"
]
print("numerical_features :", numerical_features)
print("categorical_features :", categorical_features)

numerical_features : ['laptop_ID', 'Inches', 'Price_euros']
categorical_features : ['Company', 'Product', 'TypeName', 'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight']


<h3> Columns of dataframe</h3>

In [8]:
# Show the column labels currently present in the frame.
df.columns

Index(['laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
       'ScreenResolution', 'Cpu', 'Ram', 'Memory', 'Gpu', 'OpSys', 'Weight',
       'Price_euros'],
      dtype='object')

<h3> Data cleaning </h3>
<h5 style= "color : #099456;"> ScreenResolution</h5>

In [9]:



def filter_string(string):
    """Extract the pixel resolution token from a screen description.

    Parameters
    ----------
    string : str
        Free-text screen description, e.g.
        ``"IPS Panel Retina Display 2560x1600"``.

    Returns
    -------
    str or None
        The first ``<width>x<height>`` token found (e.g. ``"2560x1600"``),
        or ``None`` when the input is not a string or holds no such token.
    """
    # Missing cells (None / NaN) cannot be searched; report "no resolution"
    # instead of letting re.search raise TypeError.
    if not isinstance(string, str):
        return None

    # Accept 3- or 4-digit widths so the extraction agrees with the
    # (\d{3,4})x(\d{3,4}) pattern used by height_picker / width_picker.
    match = re.search(r'(\d{3,4}x\d{3,4})', string)

    # Explicit None on a miss rather than falling off the end of the function.
    return match.group(1) if match else None

# Build a 'Resolution' column by running filter_string on every
# 'ScreenResolution' value.
df['Resolution'] = df['ScreenResolution'].apply(filter_string)
# Display the new column to inspect the extracted values.
df["Resolution"]
     

0       2560x1600
1        1440x900
2       1920x1080
3       2880x1800
4       2560x1600
          ...    
1298    1920x1080
1299    3200x1800
1300     1366x768
1301     1366x768
1302     1366x768
Name: Resolution, Length: 1303, dtype: object

In [10]:


def height_picker(string):
    """Return the height (second number) of a ``"<width>x<height>"`` string.

    Parameters
    ----------
    string : str
        Resolution text that starts with ``<width>x<height>``, e.g.
        ``"1920x1080"``.

    Returns
    -------
    int or None
        The height in pixels, or ``None`` when the input is not a string or
        does not start with a resolution.
    """
    # The resolution extractor yields None when nothing was found; passing
    # that (or NaN) to re.match would raise TypeError, so pass it through.
    if not isinstance(string, str):
        return None

    match = re.match(r'(\d{3,4})x(\d{3,4})', string)
    if match is None:
        return None

    # Group 2 is the number after the "x".
    return int(match.group(2))
 
def width_picker(string):
    """Return the width (first number) of a ``"<width>x<height>"`` string.

    Parameters
    ----------
    string : str
        Resolution text that starts with ``<width>x<height>``, e.g.
        ``"1920x1080"``.

    Returns
    -------
    int or None
        The width in pixels, or ``None`` when the input is not a string or
        does not start with a resolution.
    """
    # The resolution extractor yields None when nothing was found; passing
    # that (or NaN) to re.match would raise TypeError, so pass it through.
    if not isinstance(string, str):
        return None

    match = re.match(r'(\d{3,4})x(\d{3,4})', string)
    if match is None:
        return None

    # Group 1 is the number before the "x".
    return int(match.group(1))

# Split each 'Resolution' string into integer 'height' and 'width' columns.
df["height"] = df['Resolution'].apply(height_picker)
df["width"] = df["Resolution"].apply(width_picker)

<h5 style="color:#f98819;">Cpu</h5>

In [11]:
def cpu_clock_picker(string):
    """Extract the clock speed, in GHz, from the end of a CPU description.

    Parameters
    ----------
    string : str
        CPU description ending in ``<number>GHz``, e.g.
        ``"Intel Core i5 7200U 2.5GHz"``.

    Returns
    -------
    float
        The clock speed in GHz.

    Raises
    ------
    ValueError
        If the description does not end with a ``<number>GHz`` token.
    """
    # Match the trailing "<number>GHz" explicitly rather than chopping the
    # last three characters: blind slicing would silently accept other units
    # (e.g. "2300MHz" -> 2300.0) and fail obscurely on "3 GHz".
    match = re.search(r'(\d+(?:\.\d+)?)\s*GHz\s*$', string, flags=re.IGNORECASE)
    if match is None:
        raise ValueError(f"no trailing GHz clock speed in {string!r}")
    return float(match.group(1))

# Store the float returned by cpu_clock_picker for each 'Cpu' value in 'Clock'.
df["Clock"] = df['Cpu'].apply(cpu_clock_picker)
 

<h3 style = "color:#359732;">RAM</h3>

In [12]:
def ram_picker(string):
    """Convert a RAM label such as ``"8GB"`` into its integer amount.

    The last two characters of ``string`` are dropped and the remainder is
    parsed with ``int``.
    """
    amount_text = string[:-2]
    return int(amount_text)

# Store the integer returned by ram_picker for each 'Ram' value under 'Memory'.
# NOTE(review): 'Memory' already exists in the loaded CSV and holds storage
# descriptions (e.g. "128GB SSD"); this assignment overwrites it, so the
# storage information is lost from here on. A distinct column name would be
# safer, but the later column selection expects 'Memory' - confirm intent.
df["Memory"] = df['Ram'].apply(ram_picker)

<h4 style = "color : #119834;"> Weight </h4>

In [13]:
def weight_picker(string):
    """Convert a weight label such as ``"1.37kg"`` into a float.

    The last two characters of ``string`` are dropped and the remainder is
    parsed with ``float``.
    """
    number_text = string[:-2]
    return float(number_text)
# Store the float returned by weight_picker for each 'Weight' value in 'Weight_in_kg'.
df["Weight_in_kg"] = df["Weight"].apply(weight_picker)

<h1> Selecting columns</h1>

In [14]:
# Columns carried over into the cleaned dataset: the original descriptive
# fields plus the numeric features derived in the cleaning cells.
selected_columns = [
    "Company",
    "Product",
    "Inches",
    "Memory",
    "Gpu",
    "OpSys",
    "Price_euros",
    "height",
    "width",
    "Clock",
    "Weight_in_kg",
]

# Keep only the chosen columns, in the order listed above, and display them.
sdf = df[selected_columns]
sdf

Unnamed: 0,Company,Product,Inches,Memory,Gpu,OpSys,Price_euros,height,width,Clock,Weight_in_kg
0,Apple,MacBook Pro,13.3,8,Intel Iris Plus Graphics 640,macOS,1339.69,1600,2560,2.3,1.37
1,Apple,Macbook Air,13.3,8,Intel HD Graphics 6000,macOS,898.94,900,1440,1.8,1.34
2,HP,250 G6,15.6,8,Intel HD Graphics 620,No OS,575.00,1080,1920,2.5,1.86
3,Apple,MacBook Pro,15.4,16,AMD Radeon Pro 455,macOS,2537.45,1800,2880,2.7,1.83
4,Apple,MacBook Pro,13.3,8,Intel Iris Plus Graphics 650,macOS,1803.60,1600,2560,3.1,1.37
...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,14.0,4,Intel HD Graphics 520,Windows 10,638.00,1080,1920,2.5,1.80
1299,Lenovo,Yoga 900-13ISK,13.3,16,Intel HD Graphics 520,Windows 10,1499.00,1800,3200,2.5,1.30
1300,Lenovo,IdeaPad 100S-14IBR,14.0,2,Intel HD Graphics,Windows 10,229.00,768,1366,1.6,1.50
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,15.6,6,AMD Radeon R5 M330,Windows 10,764.00,768,1366,2.5,2.19


<h4> Save dataframe</h4>

<h3>...............The End.............</h3>

In [15]:
 # Write the selected columns to disk without the row index.
 sdf.to_csv("cleaned.csv", index=False)