Data cleaning and preprocessing.

Steps
- Load Dataset / Dataset Overview.

- Check Missing Values.
   - Drop rows, or fill with the mean/median/mode.

- Detect Outliers in Numeric Columns.

- Encode Categorical Features.
    - Use Label Encoding for ordinal categories.
    - Use One-Hot Encoding for nominal features.

- Visual Inspections
  - Use heatmaps and bar charts/histograms to visualize missing-value patterns.


Load Dataset / Dataset Overview

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import scipy as sp

# Load the laptop price dataset from a path relative to the working directory.
file_path = "Datasets/laptop_price.csv"
# NOTE(review): latin1 was presumably chosen because the file is not valid UTF-8 — confirm.
data = pd.read_csv(file_path, encoding="latin1")
# Bare expression: a notebook renders the last expression of a cell as its output.
data

In [None]:
# Display the first 10 rows of the dataset.
data.head(10)

In [None]:
# Display the dataset's columns, data types, non-null counts and memory usage.
data.info()

In [None]:
# Display 10 randomly sampled rows (no random_state is set, so the rows differ between runs).
data.sample(10)

In [None]:
# Display the column names as loaded from the CSV.
data.columns

In [None]:
# Standardise column names: lowercase them and remove any hyphens.
# NOTE: no underscores are inserted here; the renames in the following cells
# rely on the exact lowercase names this step produces (e.g. "typename", "opsys").
data.columns = data.columns.str.lower().str.replace("-", "")
data.columns

In [None]:
# Replace abbreviated column names with descriptive ones (renamed in place).
data.rename(
    columns={
        "cpu": "device_processor (CPU)",
        "ram": "device_memory (RAM)",
        "opsys": "operating_system",
        "typename": "laptop_type",
        "memory": "memory_capacity",
    },
    inplace=True,
)
data.columns

In [None]:
# Give the screen-resolution column a snake_case name and rename the
# company column to "manufacturers_info" (renamed in place).
data.rename(
    columns={
        "screenresolution": "screen_resolution",
        "company": "manufacturers_info",
    },
    inplace=True,
)
data.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Summary of the text (object-dtype) columns: count, number of unique values,
# most frequent value and its frequency.
data.describe(include="object")

CHECK FOR MISSING VALUES.

In [None]:
# Count the missing (null) values in each column.
data.isnull().sum()

In [None]:
# Count missing values per text (object-dtype) column only.
text_columns = data.select_dtypes(include="object")
missing_Values = text_columns.isna().sum()
missing_Values

In [None]:
# Build a boolean mask of missing cells and keep the rows that have at least one.
data_mask = data.isnull()
rows_with_missing_var = data[data_mask.any(axis=1)]
print(rows_with_missing_var)
# Report what the data actually shows rather than asserting a fixed conclusion.
if rows_with_missing_var.empty:
    print("Report!\nThere are no rows with missing values.")
else:
    print("Report!\nThere are {} rows with missing values.".format(len(rows_with_missing_var)))



In [None]:
# Show column dtypes and non-null counts for the rows that contain missing data.
rows_with_missing_var.info()

In [None]:
# Print the distinct values of every text (object-dtype) column, e.g. to spot
# by eye placeholder strings that stand in for missing data.
for column in data.select_dtypes(include=["object"]).columns:
    # Compute the unique values once and reuse them in the message.
    detail = data[column].unique()
    print('\nUnique values in {} are :{}'.format(column, detail))
  

In [None]:
# Visual inspection of missing data.
# Heatmap of the null mask for a subset of columns over the first 100 rows.
heatmap_columns = [
    "manufacturers_info",
    "product",
    "laptop_type",
    "device_processor (CPU)",
    "device_memory (RAM)",
    "operating_system",
]
first_rows_missing = data[heatmap_columns].head(100).isnull()

plt.figure(figsize=(12, 8))
sns.heatmap(first_rows_missing, cmap="viridis", cbar=True)
plt.title("Heatmap of Missing Values")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the number of missing values in each column.
missing_values = data.isnull().sum()
ax = missing_values.plot.bar(figsize=(14, 6))
ax.set_title("Missing Values per Column")
ax.set_xlabel("Columns")
ax.set_ylabel("Count of Missing Values")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Plot missingno's dendrogram, which groups columns by how similar their
# missing-value patterns are.
msno.dendrogram(data)
plt.title("Dendrogram of Missingness")
plt.show()

DETECT OUTLIERS IN NUMERIC COLUMNS

In [None]:
# IQR method: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile. Results are stored per column.
numeric_col = data.select_dtypes(include=["int64", "float64"]).columns
outliers_iqr = {}
for col in numeric_col:
    series = data[col]
    Q1, Q3 = series.quantile(0.25), series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    is_outlier = (series < lower_bound) | (series > upper_bound)
    outliers_iqr[col] = series[is_outlier]
    print("Outliers found in {} using IQR are:".format(col))
    # Only the first five outliers are printed; the full set stays in outliers_iqr.
    print(outliers_iqr[col].head())

In [None]:

# Make this cell runnable on its own: rebuild the numeric column list if the
# IQR cell has not been executed in this session.
if 'numeric_col' not in locals():
    numeric_col = data.select_dtypes(include=["int64", "float64"]).columns

# Using Z-Score Method: flag values more than 3 standard deviations from the mean.
# Import the submodule explicitly; older SciPy releases do not expose
# `scipy.stats` after a bare `import scipy`.
from scipy import stats as statistics

outliers_score = {}
for col in numeric_col:
    # Score only the non-missing values and filter that same Series, so the
    # boolean mask always has the same length as the data it indexes (the
    # full-length column would mismatch as soon as it contains a NaN).
    observed = data[col].dropna()
    z_scores = statistics.zscore(observed)
    abs_z_scores = np.abs(np.asarray(z_scores))
    outliers = observed[abs_z_scores > 3]
    outliers_score[col] = outliers
    print("Outliers found in {} using z-score are:".format(col))
    # Only the first five outliers are printed; the full set stays in outliers_score.
    print(outliers.head())




In [None]:
# One horizontal box plot per numeric column, each in its own figure.
for col in numeric_col:
    _, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=data[col], ax=ax)
    ax.set_title("Box plot of {}".format(col))
    plt.show()


In [None]:
# Draw one box plot per numeric column as subplots of a single figure on a 2x4 grid.
# NOTE(review): the fixed 2x4 layout presumably only fits up to 8 numeric columns — confirm if more are added.
data[numeric_col].plot(kind="box", subplots=True, layout=(2, 4), figsize=(15, 10), title="Box plots of Numeric Columns")


In [None]:
# Scatter each numeric column against the row index so extreme values stand out.
for col in numeric_col:
    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(data.index, data[col])
    ax.set_title("Scatter plot of {}".format(col))
    ax.set_xlabel("Index")
    ax.set_ylabel(col)
    plt.show()

In [None]:
# Compare the outliers flagged by the IQR and z-score methods: for each numeric
# column, plot all values faintly and overlay the points each method flagged.
for col in numeric_col:
    plt.figure(figsize=(12, 6))
    # Very low alpha keeps the bulk of the data in the background.
    plt.scatter(data.index, data[col], label="Data Points", color="purple", alpha=0.05)
    plt.scatter(outliers_iqr[col].index, outliers_iqr[col], label="IQR Outliers", color="red", alpha=0.6)
    plt.scatter(outliers_score[col].index, outliers_score[col], label="z_scores Outliers", color="green", alpha=0.6)
    plt.title("Outliers in {} using IQR and Z-Score methods".format(col))
    plt.xlabel("Index")
    plt.ylabel(col)
    plt.legend()
    plt.show()


 - Encode Categorical Features
    - Use Label Encoding for ordinal categories.
    - Use One-Hot Encoding for nominal features.

In [None]:
# Label-encode the text columns: each distinct value is replaced by an integer code.
# NOTE(review): this encodes EVERY object-dtype column, not only ordinal ones,
# so no text columns remain for a later one-hot step — confirm this is intended.
from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per column so the integer codes can be mapped back
# with inverse_transform; refitting a single shared encoder would discard
# every mapping except the last column's.
label_encoders = {}
for col in data.select_dtypes(include=["object"]).columns:
    label_encoder = LabelEncoder()
    # astype(str) gives the encoder uniform string input; missing values
    # become the literal category "nan".
    data[col] = label_encoder.fit_transform(data[col].astype(str))
    label_encoders[col] = label_encoder

In [None]:
# Re-inspect column dtypes after label encoding.
data.info()

In [None]:
# One-hot encode the remaining categorical (object/category) columns, dropping
# the first level of each to avoid a redundant indicator column.
# NOTE(review): the label-encoding cell above converts every object column to
# integers, so this call presumably finds no columns left to expand — confirm intent.
data = pd.get_dummies(data, drop_first=True)
data.info()

In [None]:
# Save the final cleaned dataset as UTF-8 CSV, without writing the row index as a column.
data.to_csv("Datasets/cleaned_laptop_price_data.csv", index=False, encoding="utf-8")