In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the laptop-price CSV from the Kaggle input directory into a DataFrame
# and preview the first rows.
# NOTE(review): encoding='unicode_escape' is presumably a workaround for bytes in
# the file that are not valid UTF-8 — confirm; this codec also interprets
# backslash escape sequences, so an explicit codec (e.g. 'latin-1') may be safer.
data = pd.read_csv("/kaggle/input/laptop-price/laptop_price.csv", encoding= 'unicode_escape')
data.head()

The dataset used for this notebook is the "Laptop Price" dataset on Kaggle, 
which can be found at the following link: 
https://www.kaggle.com/muhammetvarl/laptop-price

At the above link, there is also a description of the variables, which will be repeated below.

1 Company- String -Laptop Manufacturer

2 Product -String -Brand and Model

3 TypeName -String -Type (Notebook, Ultrabook, Gaming, etc.)

4 Inches -Numeric- Screen Size

5 ScreenResolution -String- Screen Resolution

6 Cpu- String -Central Processing Unit (CPU)

7 Ram -String- Laptop RAM

8 Memory -String- Hard Disk / SSD Memory

9 Gpu -String- Graphics Processing Unit (GPU)

10 OpSys -String- Operating System

11 Weight -String- Laptop Weight

12 Price_euros -Numeric- Price (Euro)

## **Goal**:  Predict the price of a laptop from the other variables.

Note that this is a regression problem, where the target variable is Price_euros.

# Hypothesis Generation

Here we will try to generate some hypotheses about what could affect the price of a laptop.

1. Laptops made by well-known companies like Apple and Microsoft might be more expensive.

2. Laptops with a bigger screen size might be more expensive.

3. Laptops with a high screen resolution might be more expensive.

4. Laptops with a more powerful CPU might be more expensive.

5. Laptops with a higher RAM might be more expensive.

6. Laptops with more storage (the Memory variable: hard disk / SSD) might be more expensive.

7. Laptops with a more powerful GPU might be more expensive.

8. Laptops without an operating system might be cheaper.

9. Lighter laptops might be more expensive than heavier laptops
   because they could be newer, with more advanced technology.

In [3]:
# Here we will import the required packages.

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Quick Analysis

In [4]:
data.shape

In [5]:
data.dtypes

In [6]:
# Duplicate check on laptop_ID: the number of rows minus the number of
# distinct IDs is printed; 0 means no ID occurs more than once.

idsTotal = len(data.index)
idsUnique = len(set(data['laptop_ID']))
idsdupe = idsTotal - idsUnique
print(idsdupe)

In [7]:
# No duplicate IDs were found, so the laptop_ID column is removed
# from `data` (the frame is modified in place).

data.drop(columns=['laptop_ID'], inplace=True)

In [8]:
data.shape

In [9]:
data.head()

# Step 2: Split the data into training and test sets.

In [10]:
# We will put 80% of the data in the training set and the remaining 20% in the test set.
# random_state is fixed so that the same split is produced on every run.
# Each piece is copied explicitly: later cells rename and overwrite columns on
# `train` and `test`, and doing that on frames derived from `data` can raise
# pandas' SettingWithCopyWarning. The copies are independent DataFrames.

train, test = (part.copy() for part in train_test_split(data, train_size=0.8, random_state=0))

In [11]:
# Report the shape of each split, one line per split.
for label, frame in (("Shape of training set:", train), ("Shape of test set:", test)):
    print(label, frame.shape)

In [12]:
train.head()

In [13]:
test.head()

# Step 3: Data Exploration

In [14]:
train.head(10)

In [15]:
train.tail(10)

In [16]:
test.head(10)

In [17]:
test.tail(10)

In [18]:
train.dtypes

In [19]:
test.dtypes

In [20]:
train.info()

In [21]:
test.info()

In [22]:
train.describe()

In [23]:
test.describe()

Let us look at some descriptive statistics for the remaining categorical variables.


In [24]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train.describe(include=[object])

In [25]:
# Summary statistics (count / unique / top / freq) for the object-dtype columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
test.describe(include=[object])

After looking at the descriptive statistics for Weight, 
it seems Weight should be a numeric variable instead.
First, we need to check if each value of weight is 
of the form "some value"kg.

In [26]:
# Count the training rows whose Weight string ends with 'kg'. Every value has
# that form only if this count equals the number of rows in `train`.
train['Weight'].str.endswith('kg').sum()

In [27]:
# Count the test rows whose Weight string ends with 'kg'. Every value has
# that form only if this count equals the number of rows in `test`.
test['Weight'].str.endswith('kg').sum()

These counts equal the number of rows in the training and test sets, respectively, so each value of Weight is of the form "some value"kg.

In [28]:
# Rename the column to carry its unit. The renamed frame is bound back to
# `train` instead of renaming in place: `rename` returns a new DataFrame, so
# the column assignment in the following cells works on an independent object
# rather than on a frame derived from `data` (which can emit
# SettingWithCopyWarning). Re-running the cell is harmless because `rename`
# ignores labels that are not present by default.
train = train.rename(columns={'Weight': 'Weight (kg)'})
train.head()

In [29]:
# Convert the weight strings to numbers: drop the last two characters
# (the "kg" suffix) and parse what remains as float.
train['Weight (kg)'] = (
    train['Weight (kg)']
    .str.slice(stop=-2)
    .astype(float)
)
train['Weight (kg)']

In [30]:
# Rename the column to carry its unit. The renamed frame is bound back to
# `test` instead of renaming in place: `rename` returns a new DataFrame, so
# the column assignment in the following cell works on an independent object
# rather than on a frame derived from `data` (which can emit
# SettingWithCopyWarning). Re-running the cell is harmless because `rename`
# ignores labels that are not present by default.
test = test.rename(columns={'Weight': 'Weight (kg)'})
test.head()

In [31]:
# Convert the weight strings to numbers: drop the last two characters
# (the "kg" suffix) and parse what remains as float.
test['Weight (kg)'] = (
    test['Weight (kg)']
    .str.slice(stop=-2)
    .astype(float)
)
test['Weight (kg)']

In [32]:
train.dtypes

In [33]:
test.dtypes

In [34]:
train.info()

In [35]:
test.info()

In [36]:
train.describe()

In [37]:
test.describe()

In [38]:
# Summary statistics for the object-dtype columns that remain after the
# Weight conversion. The builtin `object` is used because the `np.object`
# alias was deprecated in NumPy 1.20 and removed in 1.24.
train.describe(include=[object])

In [39]:
# Summary statistics for the object-dtype columns that remain after the
# Weight conversion. The builtin `object` is used because the `np.object`
# alias was deprecated in NumPy 1.20 and removed in 1.24.
test.describe(include=[object])

First, we will look at the numerical variables.  
We will use the value_counts method to get more information about each variable.
Then we will look at distribution plots of each variable in the training set.

In [40]:
# Collect the names of the numeric-dtype columns of the training set as a plain list.
num_feats = list(
    train.select_dtypes(include='number').columns
)
num_feats

In [41]:
train['Inches'].value_counts()

In [42]:
test['Inches'].value_counts()

In [43]:
# Distribution of screen size in the training set.
# `sns.distplot` is deprecated (since seaborn 0.11) and slated for removal;
# `histplot` with kde=True and stat='density' draws the same kind of figure:
# a density-normalised histogram with a KDE curve on top.
sns.histplot(x=train['Inches'], kde=True, stat='density')
plt.xlabel('Inches')
plt.show()

In [44]:
train['Weight (kg)'].value_counts()

In [45]:
test['Weight (kg)'].value_counts()

In [46]:
# Distribution of laptop weight in the training set.
# `sns.distplot` is deprecated (since seaborn 0.11) and slated for removal;
# `histplot` with kde=True and stat='density' draws the same kind of figure:
# a density-normalised histogram with a KDE curve on top.
sns.histplot(x=train['Weight (kg)'], kde=True, stat='density')
plt.xlabel('Weight (kg)')
plt.show()

In [47]:
train['Price_euros'].value_counts()

In [48]:
test['Price_euros'].value_counts()

In [49]:
# Distribution of the target variable (price in euros) in the training set.
# `sns.distplot` is deprecated (since seaborn 0.11) and slated for removal;
# `histplot` with kde=True and stat='density' draws the same kind of figure:
# a density-normalised histogram with a KDE curve on top.
sns.histplot(x=train['Price_euros'], kde=True, stat='density')
plt.xlabel('Price_euros')
plt.show()

Now, we will look at the categorical variables.  
We will use the value_counts method to get more 
information about each variable.  For each categorical 
variable, we will also check that the set of values 
in the test set is a subset of or equal to the set 
of values in the training set.  This is to make sure 
that we are careful when we create dummy variables later on. 


In [50]:
train['Company'].value_counts()

In [51]:
test['Company'].value_counts()

Note that for the Company variable, the set of values
in the training set and the set of values in the test set 
are not the same because there are 19 unique values in the 
training set and 16 unique values in the test set.  We will 
check that the set of values in the test set is a subset of 
the set of values in the training set.

In [52]:
set(test['Company']) <= set(train['Company'])

In [53]:
train['Product'].value_counts()

In [54]:
test['Product'].value_counts()

Note that for the Product variable, the set of values
in the training set and the set of values in the test set 
are not the same because there are 523 unique values in the 
training set and 193 unique values in the test set.  We will 
check that the set of values in the test set is a subset of 
the set of values in the training set.

In [55]:
set(test['Product']) <= set(train['Product'])

This means that for the Product variable, there are values in the test set 
that are not in the training set.

In [56]:
train['TypeName'].value_counts()

In [57]:
test['TypeName'].value_counts()

In [59]:
set(test['TypeName']) == set(train['TypeName'])

For the TypeName variable, the set of values in the test set 
is equal to the set of values in the training set.

In [60]:
train['ScreenResolution'].value_counts()

In [61]:
test['ScreenResolution'].value_counts()

Note that for the ScreenResolution variable, the set of values
in the training set and the set of values in the test set 
are not the same because there are 40 unique values in the 
training set and 24 unique values in the test set.  We will 
check that the set of values in the test set is a subset of 
the set of values in the training set.

In [62]:
set(test['ScreenResolution']) <= set(train['ScreenResolution'])

In [None]:
train['Cpu'].value_counts()

In [None]:
test['Cpu'].value_counts()

In [None]:
train['Ram'].value_counts()

In [None]:
test['Ram'].value_counts()

In [None]:
train['Memory'].value_counts()

In [None]:
test['Memory'].value_counts()

In [None]:
train['Gpu'].value_counts()

In [None]:
test['Gpu'].value_counts()

In [None]:
train['OpSys'].value_counts()

In [None]:
test['OpSys'].value_counts()

For each categorical variable, we will check that the set of values 
in the test set is a subset of or equal to the set of values in the training set.  
This is to make sure that we are careful when we create dummy variables later on.

# Step 4: Outlier Detection

In [None]:
sns.boxplot(x=train['Price_euros'])

In [None]:
sns.boxplot(x=train['Inches'])