In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Laptop price initial data cleaning

In [None]:
# Read the laptop price CSV (decoded as latin-1) into the working frame and
# leave it as the last expression so the notebook renders it for inspection.
df = pd.read_csv(
    filepath_or_buffer='../input/laptop-price/laptop_price.csv',
    encoding='latin-1',
)
df

In [None]:
# Start with the simplest column: RAM.
# Strip the 'GB' unit from each value and store the remainder as a number.
# A literal (non-regex) replace is used on purpose: the pattern '[GB]' is a
# regex character class that deletes any single 'G' or 'B' character rather
# than the unit 'GB' as a whole.
df['Ram'] = pd.to_numeric(df['Ram'].str.replace('GB', '', regex=False))
# The unit now lives in the column name instead of in every value.
df = df.rename(columns={'Ram': 'Ram (GB)'})

# Next column: Weight.
# Strip the 'kg' unit from each value and store the remainder as a number.
# A literal (non-regex) replace is used on purpose: the pattern '[kg]' is a
# regex character class that deletes any single 'k' or 'g' character rather
# than the unit 'kg' as a whole.
df['Weight'] = pd.to_numeric(df['Weight'].str.replace('kg', '', regex=False))
# The unit now lives in the column name instead of in every value.
df = df.rename(columns={'Weight': 'Weight (Kg)'})

# Memory (the storage) column.
# Each value holds the size, in either GB or TB, followed by the storage type
# (SSD, HDD or Flash Storage). The plan is:
#   1. split the text at the first space into size and type,
#   2. keep the size in 'Memory' and put the type in a new 'Storage Type' column,
#   3. drop the unit from the size and convert it to a number.

# Split once, at the first space. `n` is passed by keyword because newer
# pandas releases accept only `pat` positionally in str.split().
split_mem = df['Memory'].str.split(' ', n=1, expand=True)
df['Storage Type'] = split_mem[1]
df['Memory'] = split_mem[0]

# The size ends in either GB or TB, so a single fixed suffix cannot simply be
# replaced with an empty string. Instead the text is split on its first run of
# digits; because the pattern is a capturing group, the digits themselves are
# kept and end up in column 1 of the result. The unit is dropped for now and
# the TB sizes are converted to GB in the next step.
# The pattern is a raw string so that '\d' is not treated as a string escape.
df_mem = df['Memory'].str.split(r'(\d+)', expand=True)

# Write the numeric size back; the column name records that units are still mixed.
df['Memory'] = pd.to_numeric(df_mem[1])
df = df.rename(columns={'Memory': 'Memory (GB or TB)'})


# Last step for storage: bring the TB sizes onto the GB scale.
# Begin by looking at the distinct sizes that occur.
df['Memory (GB or TB)'].unique()
# Recorded result: [128, 256, 512, 500,   1,  32,  64,   2,  16, 180, 240,   8, 508]

# A size with more than one digit has to be in GB; otherwise it is in TB.
# The one doubtful value is 8: is that 8GB or 8TB of storage?
# Pull out the matching rows to check.
df.loc[df['Memory (GB or TB)'].eq(8)]
# It is an 8GB SSD, so 16 must be in GB as well.
# Only the sizes 1 and 2 are in TB; they become 1024 and 2048 GB respectively.
def mem(x):
    """Convert one storage size to GB.

    Parameters
    ----------
    x : number
        A storage size in which 1 and 2 stand for terabytes and every other
        value is already expressed in gigabytes.

    Returns
    -------
    number
        1024 for 1, 2048 for 2, and ``x`` unchanged for anything else.
    """
    if x == 1:
        return 1024
    if x == 2:
        return 2048
    # Sizes other than 1 and 2 are already in GB. Returning them explicitly
    # matters: falling off the end of the function would return None and wipe
    # out every GB value if this helper were applied to the column.
    return x
# Rewrite the two TB sizes in GB in a single pass; every other size is kept.
df['Memory (GB or TB)'] = df['Memory (GB or TB)'].apply(
    lambda size: 1024 if size == 1 else (2048 if size == 2 else size)
)
# All sizes are in GB now, so the column name can say so.
df = df.rename(columns={'Memory (GB or TB)': 'Memory (GB)'})

# Display the frame as it stands after the storage clean-up.
df

In [None]:
# CPU column.
# Each value packs three pieces of information, for example
# 'Intel Core i7 6500U 2.5GHz' or 'Intel Celeron Dual Core N3050 1.6GHz':
#   1. the vendor (either Intel or AMD),
#   2. the model (e.g. 'Core i7 6500U', 'Celeron Dual Core N3050'),
#   3. the CPU speed in GHz (2.5GHz, 1.6GHz).
# They are peeled off one at a time so that no step gets confusing.

# Step 1: split at the first space; the leading word is the vendor and the
# remainder goes back into 'Cpu'. `n` is passed by keyword because newer
# pandas releases accept only `pat` positionally in str.split().
split_vendor = df['Cpu'].str.split(' ', n=1, expand=True)
df['Cpu Vendor'] = split_vendor[0]
df['Cpu'] = split_vendor[1]

# Step 2: split the remainder at its first space again; the leading word is
# stored as the CPU type and the rest (which still ends with the speed) stays
# in 'Cpu'.
split_type = df['Cpu'].str.split(' ', n=1, expand=True)
df['Cpu Type'] = split_type[0]
df['Cpu'] = split_type[1]

# Step 3: take out the CPU speed, this time without expand=True.
# A plain str.split() returns one list of tokens per row instead of a new
# DataFrame, and an element of those lists is read with .str.get() rather
# than with [].
cpu_tokens = df['Cpu'].str.split()
df['Cpu Speed'] = cpu_tokens.str.get(-1)

# Everything in front of the speed token stays in 'Cpu', re-joined with single
# spaces. Slicing builds a new list per row, so the token lists are never
# modified in place, and a row with no tokens yields '' instead of raising.
df['Cpu'] = [" ".join(tokens[:-1]) for tokens in cpu_tokens]


# Turn the speed text into a number by stripping the 'GHz' unit.
# A literal (non-regex) replace is used on purpose: the pattern '[GHz]' is a
# regex character class that deletes any single 'G', 'H' or 'z' character
# rather than the unit 'GHz' as a whole.
df['Cpu Speed'] = pd.to_numeric(df['Cpu Speed'].str.replace('GHz', '', regex=False))
# The unit now lives in the column name instead of in every value.
df = df.rename(columns={'Cpu Speed': 'Cpu Speed (GHz)'})

# Examine the frame again.
df

In [None]:
# Separate the CPU series from the CPU model, e.g. 'i5 7200U' has series 'i5'
# and model '7200U'.
# expand=True is not used here: the number of tokens varies from row to row
# (some rows hold a single token, others contain names with spaces such as
# 'Dual Core'), so each row is handled as its own token list.
#
# Ryzen is a bit different: a lone '1600' means Ryzen 5 1600 and a lone '1700'
# means Ryzen 7 1700. Only Ryzen 5 and 7 occur, no Ryzen 3.
# (A check for hyphenated APU names was drafted at this point but is disabled.)
cpu_split = list(df['Cpu'].str.split())

# Model rule: the last token is the model, except that a row with a single
# token that is not one of the Ryzen numbers has no separate model and is
# recorded as 'Unspecified'. A lone Ryzen number is its own last token, so it
# is covered by the general case.
cpu_model_true = [
    'Unspecified'
    if len(tokens) == 1 and tokens[0] not in ['1600', '1700']
    else tokens[-1]
    for tokens in cpu_split
]
df['Cpu Model'] = cpu_model_true



# CPU series: what remains of each token list once the model is taken away.
#
# Heads-up on Ryzen: those rows carry no series, so one is added here.
# A lone '1600' means Ryzen 5 1600 and a lone '1700' means Ryzen 7 1700;
# only Ryzen 5 and 7 occur, no Ryzen 3.
#
# Open point on APUs: some are written like 'A9-9420', where 9420 is the model
# and the name would have to be split at '-'. That split is not done yet.
ryzen_series = {'1600': ['5'], '1700': ['7']}

cpu_series = []
for tokens in cpu_split:
    if len(tokens) == 1:
        # A lone Ryzen number maps to its series; any other lone token is
        # itself kept as the series.
        cpu_series.append(ryzen_series.get(tokens[0], tokens))
    else:
        # Drop the trailing model token. A slice is used instead of
        # `del tokens[-1]` so the lists inside cpu_split are left untouched.
        cpu_series.append(tokens[:-1])

# Re-join the series tokens with single spaces and store them.
df['Cpu Series'] = [" ".join(series) for series in cpu_series]



# The raw 'Cpu' column has been broken down into its parts, so remove it.
# `columns=` is used because newer pandas releases accept only `labels`
# positionally in DataFrame.drop(), so `df.drop('Cpu', 1)` no longer works.
df = df.drop(columns='Cpu')

# Rearrange the columns so the derived CPU and storage features sit where the
# original columns used to be.
column_order = [
    'laptop_ID',
    'Company',
    'Product',
    'TypeName',
    'Inches',
    'ScreenResolution',
    'Cpu Vendor',
    'Cpu Type',
    'Cpu Series',
    'Cpu Model',
    'Cpu Speed (GHz)',
    'Ram (GB)',
    'Storage Type',
    'Memory (GB)',
    'Gpu',
    'OpSys',
    'Weight (Kg)',
    'Price_euros',
]
df = df[column_order]
df

In [None]:
# The CPU columns are done for the time being. Next:
#   - 'ScreenResolution' is split into the resolution and the panel type; when
#     no panel type is given (e.g. the description only says 'Full HD') the
#     panel is recorded as 'Unspecified'.
#   - 'Gpu' needs to be split by GPU vendor, model, series, and so on.

# The resolution is the last space-separated token of the description.
df['Reso'] = df['ScreenResolution'].str.split(" ").str.get(-1)

# Split the resolution at 'x' into width and height and store both as
# numbers, in line with the other numeric features (Ram, Weight, Cpu Speed).
# pd.to_numeric raises if a part is not numeric, which exposes a malformed
# resolution instead of silently keeping it as text.
df[['Screen Width', 'Screen Height']] = (
    df['Reso'].str.split('x', expand=True).apply(pd.to_numeric)
)

# Screen type: 'IPS' when the description mentions it, otherwise 'Unspecified'.
df['Screen Type'] = df['ScreenResolution'].map(
    lambda description: 'IPS' if 'IPS' in description else 'Unspecified'
)

# Open question: should there be a 'Screen Quality' feature
# (HD, Full HD, Quad HD, Ultra HD)? Some resolutions do not obviously belong
# to a category: 1920x1080 is clearly Full HD, but which category does
# 2736x1824 fall into? Even https://en.wikipedia.org/wiki/Display_resolution
# does not explain it.
# Screen resolution is therefore left here for the time being, with the
# features Reso, Screen Width, Screen Height and Screen Type.

# Move on to the GPU column.
# Tokenise each description on whitespace once and reuse the tokens.
gpu_tokens = df['Gpu'].str.split()

# GPU vendor: the first token.
df['Gpu Vendor'] = gpu_tokens.str.get(0)

# GPU model: every token after the vendor, re-joined with single spaces.
df['Gpu Model'] = [" ".join(tokens[1:]) for tokens in gpu_tokens]

# Put the columns into their final order, keeping each derived feature next to
# the column it was taken from, then display the result.
final_columns = [
    'laptop_ID', 'Company', 'Product', 'TypeName', 'Inches',
    'ScreenResolution', 'Reso', 'Screen Width', 'Screen Height', 'Screen Type',
    'Cpu Vendor', 'Cpu Type', 'Cpu Series', 'Cpu Model', 'Cpu Speed (GHz)',
    'Ram (GB)', 'Storage Type', 'Memory (GB)',
    'Gpu', 'Gpu Vendor', 'Gpu Model',
    'OpSys', 'Weight (Kg)', 'Price_euros',
]
df = df[final_columns]
df

In [None]:
#Okay, done with data cleaning for the time being.
#I should add this to Kaggle and ask for advice on the problems I encountered:
#1. The AMD CPUs still need cleaning, but I am not sure how to do it.
#2. The GPU model still needs to be separated further.
#3. Do I need to add a screen category such as HD, FHD and so on? There are still screen resolutions that I
#cannot assign to a category, e.g. which category should 2736x1824 fall into? I checked the wiki but found no answer.