In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re

In [None]:
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): a blanket 'ignore' also hides pandas chained-assignment
# warnings raised further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Load the laptop listings from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/laptop-price-dataset/laptop.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Names of all columns in the raw data.
df.columns

In [None]:
# (rows, columns) of the raw data.
df.shape

## Feature engineering 

In [None]:
# Remove the 'Unnamed: 0' column, which is not needed for the analysis
# (presumably a row index saved into the CSV — TODO confirm).
df = df.drop(columns='Unnamed: 0')

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Derive "Brand" from the first whitespace-separated token of the model name.
df['Brand'] = df['Model'].map(lambda model_name: model_name.split()[0])

In [None]:
# Normalise the Price column: drop the first character (the currency symbol)
# and the thousands separators, then cast the remaining digits to int.
price_digits = df['Price'].map(lambda raw_price: raw_price[1:].replace(',', ''))
df['Price'] = price_digits.astype(int)
df.head(5)

In [None]:
# Convert prices to USD, rounded to one decimal place.
# NOTE: re-running this cell divides the already converted prices again.
exchange_rate = 88  # rate as of the day the notebook was written
df['Price'] = (df['Price'] / exchange_rate).round(1)

In [None]:
# Split the RAM description into its first token (the numeric size) and its
# second token, each stored in a separate column.
ram_parts = df['Ram'].apply(lambda ram_text: ram_text.split())
df['Ram_numeric'] = ram_parts.apply(lambda parts: parts[0])
df['Ram_value'] = ram_parts.apply(lambda parts: parts[1])
df.head()

In [None]:
# Find rows where the first token of 'Ram' is 'Storage:' instead of a number.
df[df['Ram_numeric']=='Storage:']

In [None]:
# Remove the malformed row(s) whose RAM field starts with 'Storage:'.
# Filtering on the condition (rather than dropping a hard-coded index label)
# keeps the cell correct if the row order changes and makes it safe to re-run;
# every such row must go anyway before 'Ram_numeric' can be cast to int.
df = df[df['Ram_numeric'] != 'Storage:']

In [None]:
# Cast the RAM size to int; this raises if any non-numeric token is still present.
df['Ram_numeric'] = df['Ram_numeric'].astype(int)

In [None]:
# Frequency of each raw warranty string.
df['Warranty'].value_counts()

In [None]:
# Drop rows whose 'OS', 'Ram' or 'Warranty' field holds a value that belongs
# to a different column.
invalid_os = [
    'Backlit Keyboard',
    '15.6\u2009inches, 1920\u2009x\u20091080\u2009pixels',
    '1 Year Warranty',
    '2 Year Warranty',
    'Intel Iris Xe Graphics',
]
invalid_warranty = ['Backlit Keyboard', '1 USB 3.0 Ports']
valid_rows = (
    ~df['OS'].isin(invalid_os)
    & (df['Ram'] != '64 GB Hard Disk')
    & ~df['Warranty'].isin(invalid_warranty)
)
df = df[valid_rows]

In [None]:
# Keep only the first token of the warranty text (presumably the number of
# years — TODO confirm) and rename the column accordingly.
df['Warranty'] = df['Warranty'].map(lambda warranty_text: warranty_text.split()[0])
df = df.rename(columns={'Warranty': 'Warranty_years'})

# Keep the column as a generic object column so it is treated as categorical.
df['Warranty_years'] = df['Warranty_years'].astype(object)

In [None]:
# The first token of 'Display' is parsed as a float and stored as 'Diagonal'.
diagonal_text = df['Display'].map(lambda display_text: display_text.split()[0])
df['Diagonal'] = diagonal_text.astype(float)
df.head()

In [None]:
# The first token of 'SSD' is parsed as an int and stored as 'SSD_value'.
ssd_text = df['SSD'].map(lambda ssd_description: ssd_description.split()[0])
df['SSD_value'] = ssd_text.astype(int)

In [None]:
# Sizes of 1, 2 or 4 are treated as terabytes and multiplied by 1024 so the
# whole column shares one unit.
mask = df['SSD_value'].isin([1, 2, 4])
df.loc[mask, 'SSD_value'] *= 1024

In [None]:
#df['Core'].value_counts()

# Parsing the processor into 2 columns with the number of cores and threads.
def parse_cpu(text):
  """Extract the number of CPU cores and threads from a free-text description.

  Parameters
  ----------
  text : str or NaN
      Description such as '10 Cores (2P + 8E), 12 Threads' or
      'Quad Core, 8 Threads'.

  Returns
  -------
  pd.Series
      Two values, [cores, threads]; a value is NaN when it cannot be found.
  """
  if pd.isna(text):
    return pd.Series([np.nan, np.nan])
  # Preferred form: an explicit number in front of "Core"/"Cores".
  cores_match = re.search(r'(\d+)\s*Cores?', text)
  if cores_match:
    cores = int(cores_match.group(1))
  else:
    # Fallback: a word such as "Quad Core". The search is case-insensitive,
    # so the matched word is lower-cased before the lookup; otherwise
    # "quad core" or "OCTA CORE" would match but silently map to NaN.
    word_to_cores = {'dual': 2, 'quad': 4, 'hexa': 6, 'octa': 8}
    word_match = re.search(r'(Dual|Quad|Hexa|Octa)\s*Core', text, re.I)
    cores = word_to_cores[word_match.group(1).lower()] if word_match else np.nan
  # Look for the number in front of "Thread"/"Threads".
  threads_match = re.search(r'(\d+)\s*Threads?', text)
  threads = int(threads_match.group(1)) if threads_match else np.nan
  return pd.Series([cores, threads])

In [None]:
# Apply parse_cpu to every 'Core' value and store the two results as new columns.
df[['CPU_cores', 'CPU_threads']] = df['Core'].apply(parse_cpu)
# Number of rows for which no thread count could be parsed.
df['CPU_threads'].isnull().sum()

## EDA (Exploratory Data Analysis)

In [None]:
# Summary statistics of the cleaned data.
df.describe()

In [None]:
# Histogram of laptop prices with a KDE curve on top.
plt.figure(figsize=(8, 6))
sns.set_palette("Paired")
ax = sns.histplot(df['Price'], kde=True, color='green')
ax.set_title('Распределение цен на ноутбуки')
ax.set_xlabel('Цена ($)')
ax.set_ylabel('Частота')
plt.show()

In [None]:
# Print the mode (first one if several), mean and median of the price.
price = df['Price']
for statistic in (price.mode()[0], price.mean(), price.median()):
    print(statistic)

In [None]:
# The ten brands with the most listings, with their listing counts.
top_brands = df['Brand'].value_counts().head(10)
top_brands

In [None]:
# Listing count and mean price for the ten most frequent brands.
brands_counts = df['Brand'].value_counts().reset_index()

avg_prices = df.groupby('Brand')['Price'].mean().round(1).reset_index()

# The merge keeps the order of brands_counts (most listings first), so the
# first ten rows are the ten most frequent brands.
top_brands_avg_price = brands_counts.merge(avg_prices, on='Brand').head(10)
top_brands_avg_price

In [None]:
# Bar chart: mean price of each of the ten most frequent brands.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=top_brands_avg_price, x="Brand", y="Price", palette="tab10")
ax.set_title("Average price for top 10 brands", fontsize=14)
ax.set_xlabel("Brand")
ax.set_ylabel("Mean Price ($)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Listings that belong to the ten most frequent brands. A separate name is
# used so the `top_brands` counts computed earlier are not overwritten with
# an object of a different kind.
top_brands_rows = df[df['Brand'].isin(top_brands_avg_price['Brand'])]

plt.figure(figsize=(12, 6))
sns.boxplot(data=top_brands_rows, x="Brand", y="Price", palette="tab10")
plt.title("Price distribution of the top 10 brands", fontsize=14)
plt.xlabel("Brand")
# A box plot shows the whole price distribution, not an average.
plt.ylabel("Price ($)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Mean rating per brand, highest first; show the top ten.
rating = (
    df.groupby('Brand')['Rating']
    .mean()
    .reset_index()
    .sort_values(by='Rating', ascending=False)
)
rating.head(10)

In [None]:
# Listing count and mean rating for the ten most frequent brands.
avg_rating = df.groupby('Brand')['Rating'].mean().round(1).reset_index()

top_brands_avg_rating = brands_counts.merge(avg_rating, on='Brand').head(10)
top_brands_avg_rating

In [None]:
# Horizontal bar chart of the mean rating per brand, each bar labelled with
# its value.
plt.figure(figsize=(8, 6))
plt.title('Average rating by brand')
ax = sns.barplot(data=rating, x='Rating', y='Brand', color='violet')
for bar_position, mean_rating in enumerate(rating['Rating'].round(2)):
    # Place the label slightly to the right of the end of the bar.
    ax.text(mean_rating + 0.1, bar_position, str(mean_rating), va='center')
ax.set_xlabel('Rating')
ax.set_ylabel('Brand')
plt.show()

In [None]:
# Box plot of prices for the five most frequent operating systems.
top_os = df['OS'].value_counts().head(5).index
filtered_df = df[df['OS'].isin(top_os)]
plt.figure(figsize=(8, 6))
plt.title('Price distribution by top operating systems')
ax = sns.boxplot(data=filtered_df, x='OS', y='Price', color='violet')
ax.set_xlabel('OS')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Number of listings for the six most frequent operating systems.
# NOTE(review): the name `os` shadows the standard-library module of the same
# name if that module is ever imported in this notebook.
os = (
    df['OS']
    .value_counts()
    .reset_index()
    .sort_values(by='count', ascending=False)
    .head(6)
)
plt.figure(figsize=(8, 6))
plt.title('Distribution of models by OS')
ax = sns.barplot(data=os, x='OS', y='count', color='violet')
for bar_position, listing_count in enumerate(os['count']):
    # Print the count just above each bar.
    ax.text(bar_position, listing_count + 10, str(listing_count), ha='center')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Ten 'Generation' values (CPU descriptions) with the highest mean rating.
top_cpu = (
    df.groupby('Generation')['Rating']
    .mean()
    .round(1)
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
plt.figure(figsize=(8, 6))
plt.title('Top 10 CPUs by average laptop rating')
ax = sns.barplot(data=top_cpu, x='Generation', y='Rating', color='violet')
ax.set_xlabel('CPU')
ax.set_ylabel('Rating')
plt.xticks(rotation=45)
for bar_position, mean_rating in enumerate(top_cpu['Rating']):
    # Print the mean rating just above each bar.
    ax.text(bar_position, mean_rating + 0.5, str(mean_rating), ha='center')
plt.show()

In [None]:
# Mean price for every distinct RAM description, most expensive first.
avg_ram = (
    df.groupby('Ram')['Price']
    .mean()
    .round(0)
    .sort_values(ascending=False)
    .reset_index()
)
plt.figure(figsize=(18, 6))
plt.title('Dependence of price on RAM')
ax = sns.barplot(data=avg_ram, x='Ram', y='Price', color='violet')
ax.set_ylabel('Price ($)')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Price histogram with one colour (and KDE curve) per brand.
plt.figure(figsize=(10, 8))
sns.set_palette("bright")
ax = sns.histplot(data=df, x='Price', kde=True, hue='Brand')
ax.set_title('Distribution of price by models')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

## Outliers 

In [None]:
# Histogram of prices using 20 bins.
sns.displot(df['Price'], bins=20)

In [None]:
# Box plot of prices; points beyond the whiskers are outlier candidates.
sns.boxplot(x=df['Price'])

In [None]:
# Outlier fences for the price: 1.5 * IQR below the first quartile and above
# the third quartile.
prices = df['Price']
q25, q75 = np.percentile(prices, [25, 75])
iqr = q75 - q25
lower_limit, upper_limit = q25 - 1.5 * iqr, q75 + 1.5 * iqr

In [None]:
# Target - Price
# Correlation of every numeric column with the target.
df.corr(numeric_only=True)['Price']

### Visualization

In [None]:
# Price against the amount of RAM.
ax = sns.scatterplot(data=df, x='Ram_numeric', y='Price')
ax.set_title('The price depends on the amount of Ram')
plt.show()

In [None]:
# Potential outlier: 32 GB of RAM with a price above 5000.
df[(df['Ram_numeric'] == 32) & (df['Price'] > 5000)]

In [None]:
# Price against the number of CPU threads.
ax = sns.scatterplot(data=df, x='CPU_threads', y='Price')
ax.set_title('The price depends on the threads of CPU')
plt.show()

In [None]:
# Potential outlier: 5 threads with a price above 1000.
df[(df['CPU_threads'] == 5) & (df['Price'] > 1000)]

In [None]:
# 32 threads with a price below 1500.
df[(df['CPU_threads'] == 32) & (df['Price'] <1500 )]

In [None]:
# The price of a laptop with a 1 TB SSD and 16 GB of RAM looks clearly too low, while its rating is high.
# Next check: 20 threads with a price above 5000.
df[(df['CPU_threads'] == 20) & (df['Price'] > 5000)]

In [None]:
# Price against the number of CPU cores.
ax = sns.scatterplot(data=df, x='CPU_cores', y='Price')
ax.set_title('The price depends on the cores CPU')
plt.show()

In [None]:
# 14 cores with a price above 5000.
df[(df['CPU_cores'] == 14) & (df['Price'] > 5000)]

In [None]:
# Price against the rating.
ax = sns.scatterplot(data=df, x='Rating', y='Price')
ax.set_title('The price depends on the rating')
plt.show()

In [None]:
# Rating below 65 with a price above 3000.
df[(df['Rating'] < 65) & (df['Price'] > 3000)]

In [None]:
# Price against the SSD size.
ax = sns.scatterplot(data=df, x='SSD_value', y='Price')
ax.set_title('The price depends on the value SSD')
plt.show()

In [None]:
# SSD size below 2500 with a price above 5000.
df[(df['SSD_value'] < 2500) & (df['Price'] > 5000)]

In [None]:
# SSD size above 2000 with a price below 1000 — unclear why the price is so low for such a large SSD.
df[(df['SSD_value'] > 2000) & (df['Price'] < 1000)]

As a result, we can single out the laptop with index 682: it appears in the suspicious selections above more often than any other laptop, so we treat it as an outlier and remove it.

In [None]:
# Remove the row labelled 682 and renumber the index from 0.
# NOTE: after the reset, re-running this cell would drop a different row.
df = df.drop(682).reset_index(drop=True)

## Missing data

In [None]:
# missing data in percent
def percent_missing(my_df):
  """Return the percentage of missing values per column.

  Only columns that have at least one missing value are included, and the
  result is sorted in ascending order of that percentage.
  """
  null_counts = my_df.isnull().sum()
  percentages = 100 * null_counts / len(my_df)
  has_missing = percentages > 0
  return percentages[has_missing].sort_values()

In [None]:
# Percentage of missing values for every column that has any.
percent_nan = percent_missing(df)

In [None]:
# Bar chart of the share of missing values per column.
ax = sns.barplot(x=percent_nan.index, y=percent_nan)
ax.set_xlabel('Columns')
ax.set_ylabel('Percent (%)')

In [None]:
# Rows for which the core count could not be parsed.
df[df['CPU_cores'].isnull()]

In [None]:
# Drop the rows that have no core count.
df = df.dropna(subset=['CPU_cores'])

In [None]:
# Re-check the missing percentages after dropping rows without a core count.
percent_missing(df)

In [None]:
# Number of rows that still lack a thread count.
len(df[df['CPU_threads'].isnull()])

For the 2023-2024 MacBook models on Apple Silicon chips, the number of threads is always equal to the number of physical cores, because the Apple architecture has no counterpart to Intel Hyper-Threading.
That is, 1 core = 1 thread. Therefore, we can set the thread count of these laptops equal to their number of cores.

In [None]:
# For Apple laptops set the thread count equal to the core count.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = ...), which may write to a temporary copy and leave df
# unchanged — and the warning about it is silenced in this notebook.
apple_rows = df['Brand'] == 'Apple'
df.loc[apple_rows, 'CPU_threads'] = df.loc[apple_rows, 'CPU_cores']
# Number of rows that still lack a thread count.
len(df[df['CPU_threads'].isnull()])

In [None]:

#df[df['CPU_threads'].isnull()]['Model'].to_list()

The 13th Gen Core i7 processors typically have between 16 and 24 threads, so let's use the midpoint of 20 threads for the missing values.

In [None]:
# Rows whose 'Generation' contains the literal text '13th Gen Intel Core i7'
# (case-insensitive; missing values count as no match).
mask = df['Generation'].str.contains('13th Gen Intel Core i7', case=False, na=False, regex=False)

In [None]:
# Fill only the missing thread counts of the selected rows with 20
# (the assignment target was previously written twice by mistake).
df.loc[mask, 'CPU_threads'] = df.loc[mask, 'CPU_threads'].fillna(20)

# checking: CPU descriptions that still have no thread count
df[df['CPU_threads'].isnull()]['Generation'].to_list()

The MediaTek Kompanio 520 is a mobile ARM chip.
For ARM processors like this one, the number of threads is equal to the number of cores.

In [None]:
# MediaTek Kompanio rows: threads = cores, so missing thread counts are
# filled from the core count of the same row (not with the constant 20 used
# for the Core i7 rows). na=False keeps the mask purely boolean when
# 'Generation' is missing; regex=False matches the text literally.
mask = df['Generation'].str.contains('MediaTek Kompanio', na=False, regex=False)
df.loc[mask, 'CPU_threads'] = df.loc[mask, 'CPU_threads'].fillna(df.loc[mask, 'CPU_cores'])

Looking up the remaining processors, we also find that:

Atom Z3735F → 4 threads

Celeron N4500 → 2 threads

Core i9-13950HX → 32 threads

In [None]:
# Thread counts looked up manually for the remaining CPUs.
# .loc is used instead of chained indexing (df[col][mask] = ...), which may
# write to a temporary copy and leave df unchanged.
manual_threads = {
    'Intel Atom Quad Core Z3735F': 4,
    'Intel Celeron  N4500': 2,
    '13th Gen Intel Core i9 13950HX': 32,
}
for generation_name, thread_count in manual_threads.items():
    df.loc[df['Generation'] == generation_name, 'CPU_threads'] = thread_count

In [None]:
# Re-check the missing percentages after filling the thread counts.
percent_missing(df)

### Rating.

In [None]:
# Rating distribution per brand, to judge whether a per-brand average is a
# reasonable replacement for missing ratings.
plt.figure(figsize=(16, 12), dpi=100)
sns.boxplot(data=df, x='Rating', y='Brand', orient='horizontal')
plt.show()

In [None]:
# Mean rating per brand.
df.groupby('Brand')['Rating'].mean()

In [None]:
# Replace each missing rating with the mean rating of the same brand.
ratings_by_brand = df.groupby('Brand')['Rating']
df['Rating'] = ratings_by_brand.transform(
    lambda brand_ratings: brand_ratings.fillna(brand_ratings.mean())
)

# Ratings still missing after the per-brand fill.
df['Rating'].isnull().sum()

In [None]:
# Replace the ratings that are still missing with the overall mean rating.
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())

In [None]:
# Re-check the missing percentages after filling the ratings.
percent_missing(df)

## Categorical data

In [None]:
# Frequency of each warranty value before encoding it.
df['Warranty_years'].value_counts()

In [None]:
# Preview the data before the categorical encoding.
df.head()

In [None]:
# One-hot encode the warranty value; drop_first=True leaves out the first
# category so the indicator columns are not redundant.
dummies = pd.get_dummies(df['Warranty_years'], prefix='Warranty', drop_first=True)
df = pd.concat([df, dummies], axis=1)
df = df.drop(columns='Warranty_years')

df.head(5)

## Prediction Models

In [None]:
# preparing data: target and feature matrix
feature_columns = ['Warranty_2', 'Warranty_3', 'Ram_numeric', 'Diagonal',
                   'SSD_value', 'CPU_cores', 'CPU_threads']
y = df['Price']
X = df[feature_columns]


from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# SCALE: fit the scaler on the training part only and apply it to both parts,
# so no information from the test part is used for fitting.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### ElasticNet model

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

base_elastic_net_model = ElasticNet()

# Parameters: 30 log-spaced regularisation strengths and several L1/L2 mixes.
alphas = np.logspace(-2, 3, 30)
param_grid = {'alpha':alphas,
              'l1_ratio':[.1,.5,.7, .95, .99, 1]}

# 5-fold cross-validated grid search, selecting by mean squared error.
grid_model = GridSearchCV(base_elastic_net_model,
                          param_grid=param_grid,
                          scoring = 'neg_mean_squared_error',
                          cv = 5,)

grid_model.fit(X_train,y_train)

y_pred = grid_model.predict(X_test)

# The square root of the MSE is the root mean squared error, so the value is
# labelled RMSE (it was previously printed under the label "MSE").
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")
print(f"r2 score: {r2_score(y_test, y_pred)}")

In [None]:
# Predicted against actual test prices; the dashed red diagonal marks a
# perfect prediction.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='blue', alpha=0.6)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, 'r--', lw=2)
plt.title("Comparison predicted and real values of Price")
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.grid(True)
plt.show()

In [None]:
# Residuals (actual minus predicted) of the ElasticNet model on the test set.
error = y_test - y_pred
plt.figure(figsize=(8,6))
plt.scatter(y_test, error, color='blue', alpha=0.6)

plt.xlabel("Actual Price")
plt.ylabel("error")
# Only actual prices up to 3000 are shown.
plt.xlim(0, 3000)
#plt.ylim(-1500, 1500)
# This figure shows the error, not predicted vs actual values, so it gets its
# own title instead of the one copied from the previous figure.
plt.title("ElasticNet: prediction error vs actual Price")
plt.grid(True)
plt.show()

### RandomForest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
# A fixed seed makes the search and the reported metrics reproducible;
# n_jobs=-1 (parallel tree building) is a fixed setting, so it is set on the
# estimator instead of being listed as a one-value grid entry.
rfr = RandomForestRegressor(random_state=22, n_jobs=-1)

param_grid = {
    'n_estimators': [100, 300],  # few values to keep the search fast
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'max_features': [0.3, 'sqrt'],  # few options
    'max_samples': [0.7, 0.8, 0.9],  # size of the sample drawn for each tree
}
# Select by mean squared error, the same criterion as the ElasticNet search,
# so both models are tuned for the metric they are compared on.
grid_rfr = GridSearchCV(rfr, param_grid=param_grid,
                        scoring='neg_mean_squared_error', cv=5)

grid_rfr.fit(X_train, y_train)
preds_rfr = grid_rfr.predict(X_test)
# The square root of the MSE is the root mean squared error, so the value is
# labelled RMSE (it was previously printed under the label "MSE").
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, preds_rfr))}")
print(f"r2 score: {r2_score(y_test, preds_rfr)}")

In [None]:
# Residuals (actual minus predicted) of the Random Forest model on the test set.
error = y_test - preds_rfr

In [None]:
# Scatter of `error` against the actual test prices.
plt.figure(figsize=(8,6))
plt.scatter(y_test, error, color='blue', alpha=0.6)
plt.xlabel("Actual Price")
plt.ylabel("error")
# Zoom in on actual prices up to 2000 and errors within ±1000.
plt.xlim(0, 2000)
plt.ylim(-1000, 1000)
# This figure shows the error, not predicted vs actual values, so it gets its
# own title instead of the one copied from the ElasticNet comparison figure.
plt.title("Random Forest: prediction error vs actual Price")
plt.grid(True)
plt.show()

## Conclusion

Two machine learning algorithms were considered for the laptop price prediction task using grid search and cross-validation. The evaluation results showed that both approaches achieved comparable error metrics.

An analysis of the error distribution revealed differences in model behavior. The Random Forest model exhibits a more uniform increase in absolute error as laptop prices rise: the lower the price, the smaller the prediction error. In contrast, the linear model shows significant deviations in some cases, even within the low-price segment.

Despite less stable cross-validation results, the Random Forest model provides more predictable error behavior across price segments, making it more suitable for practical use within this dataset.

The modeling results largely depend on the quality and size of the original dataset. In particular, high-priced laptops are underrepresented, which leads to increased prediction errors in the upper price segment.

The analysis also showed that the relationship between price and laptop characteristics is predominantly monotonic; however, certain features exhibit nonlinear effects that are better captured by ensemble models.