In [4]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [5]:
# Step 2: Load the dataset
#
# Columns expected in data.csv:
#   Sales Person  -> name of the person who made the sale
#   Country       -> country where the product was sold
#   Product       -> name/type of product
#   Date          -> date on which the sale was made
#   Amount        -> sale amount in dollars, stored as text such as "$5,320"
#   Boxes Shipped -> number of boxes shipped in the transaction
df = pd.read_csv(filepath_or_buffer="data.csv")

# Preview the first five rows as the cell's rich output
df.head(n=5)


Unnamed: 0,Sales Person,Country,Product,Date,Amount,Boxes Shipped
0,Jehu Rudeforth,UK,Mint Chip Choco,04-Jan-22,"$5,320",180
1,Van Tuxwell,India,85% Dark Bars,01-Aug-22,"$7,896",94
2,Gigi Bohling,India,Peanut Butter Cubes,07-Jul-22,"$4,501",91
3,Jan Morforth,Australia,Peanut Butter Cubes,27-Apr-22,"$12,726",342
4,Jehu Rudeforth,UK,Peanut Butter Cubes,24-Feb-22,"$13,685",184


In [6]:
# Step 3: Basic Information
# Each report is printed as "<blank line>heading" followed by the object on
# the next line; sep="\n" gives the same text as two consecutive print calls.
print("Shape of dataset:", df.shape)
print("\nData Types:", df.dtypes, sep="\n")
print("\nMissing Values:", df.isna().sum(), sep="\n")
print("\nSummary Statistics:", df.describe(include="all"), sep="\n")


Shape of dataset: (1094, 6)

Data Types:
Sales Person     object
Country          object
Product          object
Date             object
Amount           object
Boxes Shipped     int64
dtype: object

Missing Values:
Sales Person     0
Country          0
Product          0
Date             0
Amount           0
Boxes Shipped    0
dtype: int64

Summary Statistics:
         Sales Person    Country         Product       Date   Amount  \
count            1094       1094            1094       1094     1094   
unique             25          6              22        168      827   
top     Kelci Walkden  Australia  50% Dark Bites  13-Jan-22  $2,317    
freq               54        205              60         16        5   
mean              NaN        NaN             NaN        NaN      NaN   
std               NaN        NaN             NaN        NaN      NaN   
min               NaN        NaN             NaN        NaN      NaN   
25%               NaN        NaN             NaN        NaN 

In [11]:
# Step 4: Data Preprocessing

# 1. Convert 'Date' column to datetime format
# The source strings look like "04-Jan-22", hence the explicit '%d-%b-%y'
# format. Parsing gives real datetime values for time-based operations.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')

# 2. Convert 'Amount' to float by removing dollar signs and commas
# The pattern is a raw string so the backslash reaches the regex engine
# untouched. A plain '[\$,]' literal contains the invalid escape "\$", which
# makes Python emit an invalid-escape-sequence warning.
# The character class already strips both '$' and ',', so one pass is enough.
# Series.replace (not .str.replace) is kept so the cell can be re-run after
# 'Amount' is already numeric: non-string values pass through unchanged.
df['Amount'] = df['Amount'].replace(r'[\$,]', '', regex=True).astype(float)

# 3. Confirm updated data types
# 'Date' should now be datetime64 and 'Amount' float64.
print(df.dtypes)


Sales Person                    object
Country                         object
Product                         object
Date                    datetime64[ns]
Amount                         float64
Boxes Shipped                    int64
Sales Person Encoded             int32
Country Encoded                  int32
Product Encoded                  int32
dtype: object


  df['Amount'] = df['Amount'].replace('[\$,]', '', regex=True).replace(',', '', regex=True).astype(float)


In [8]:
# Step 6: Convert Categorical Variables to Quantitative

# Use LabelEncoder from sklearn.
# Each column gets its OWN encoder. Re-fitting one shared instance would
# overwrite the fitted classes of the earlier columns, so their integer codes
# could no longer be mapped back to the original text with inverse_transform.
categorical_columns = ['Sales Person', 'Country', 'Product']
label_encoders = {column: LabelEncoder() for column in categorical_columns}

# Encoding categorical features into '<column> Encoded'
for column, encoder in label_encoders.items():
    df[f'{column} Encoded'] = encoder.fit_transform(df[column])

# Backward compatibility: the original cell left `label_encoder` fitted on
# 'Product' (the last column it encoded), so keep that name bound the same way.
label_encoder = label_encoders['Product']

# NOTE(review): the integer codes carry no meaningful order; confirm that any
# downstream model treats these columns as nominal rather than ordinal.

# Show encoded columns
df[['Sales Person', 'Sales Person Encoded',
    'Country', 'Country Encoded',
    'Product', 'Product Encoded']].head()


Unnamed: 0,Sales Person,Sales Person Encoded,Country,Country Encoded,Product,Product Encoded
0,Jehu Rudeforth,13,UK,4,Mint Chip Choco,14
1,Van Tuxwell,23,India,2,85% Dark Bars,2
2,Gigi Bohling,9,India,2,Peanut Butter Cubes,17
3,Jan Morforth,12,Australia,0,Peanut Butter Cubes,17
4,Jehu Rudeforth,13,UK,4,Peanut Butter Cubes,17
