In [1]:
# This notebook demonstrates splitting features in a sales dataset.
import pandas as pd

# Read the raw sales records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer=r"sales.csv")
# Preview the top five records to see which columns and values are present.
df.head(n=5)

Unnamed: 0,Invoice ID,Branch,City(nature),Customer type,Gender,Product line,Unit price,Quantity,Profit,Total,Time,Payment,cogs,Total.1,gross income,Rating,date_visited
0,750-67-8428,A,Yangon(Local),Member,Female,Health and beauty,74.69,7.0,26.1415,548.9715,13:08,Ewallet,522.83,4.761905,26.1415,9.1,02-10-2016
1,226-31-3081,C,Naypyitaw(Local),Normal,Female,Electronic accessories,15.28,5.0,3.82,80.22,10:29,Cash,76.4,4.761905,3.82,9.6,26-08-2018
2,&&&,A,Yangon(Local),Normal,Male,Home and lifestyle,46.33,6.0,16.2155,340.5255,13:23,Credit card,324.31,4.761905,16.2155,7.4,14-09-2020
3,123-19-1176,A,Yangon(Local),Member,Male,Health and beauty,58.22,8.0,23.288,489.048,20:33,Ewallet,465.76,4.761905,23.288,8.4,10-10-2022
4,373-73-7910,A,Yangon(Local),Normal,,Sports and travel,&&&,7.0,30.2085,634.3785,10:37,Ewallet,604.17,4.761905,30.2085,5.3,28-09-2019


In [3]:
# Split the combined 'City(nature)' column (e.g. "Yangon(Local)") into two new
# columns: 'City' (the text before the parentheses) and 'City_Type' (the text
# inside them).
# The pattern is anchored at the start of the value and accepts any characters
# other than parentheses, so a name containing '.', '-', digits or accented
# letters is captured whole instead of being cut down to its last run of ASCII
# letters. Whitespace around either part is kept out of the captured groups.
# Values that do not have a "name(type)" shape yield NaN in both new columns.
df[['City', 'City_Type']] = df['City(nature)'].str.extract(
    r'^\s*([^()]+?)\s*\(\s*([^()]+?)\s*\)'
)

In [4]:
# Parse 'date_visited' (day-month-year text such as "02-10-2016") into datetime
# values. errors='coerce' turns any value that does not fit the format into NaT
# instead of raising.
df['date_visited'] = pd.to_datetime(df['date_visited'], format='%d-%m-%Y', errors='coerce')
# Extract the day, month, and year into separate columns.
# The parts are cast to the nullable 'Int64' dtype: if any date was coerced to
# NaT, plain integer columns would be upcast to float (2.0, 10.0, 2016.0) both in
# the frame and in any exported file; 'Int64' keeps whole numbers and marks the
# missing entries as <NA>.
df['visit_day'] = df['date_visited'].dt.day.astype('Int64')
df['visit_month'] = df['date_visited'].dt.month.astype('Int64')
df['visit_year'] = df['date_visited'].dt.year.astype('Int64')

In [5]:
# Show only the freshly derived columns (first five rows) to confirm that the
# city and date splits produced the expected values.
df.loc[:, ['City', 'City_Type', 'visit_day', 'visit_month', 'visit_year']].head(n=5)

Unnamed: 0,City,City_Type,visit_day,visit_month,visit_year
0,Yangon,Local,2,10,2016
1,Naypyitaw,Local,26,8,2018
2,Yangon,Local,14,9,2020
3,Yangon,Local,10,10,2022
4,Yangon,Local,28,9,2019


In [6]:
# Write the DataFrame, including the derived columns, to a new CSV file.
# index=False leaves the row index out of the written file.
df.to_csv(path_or_buf="sales_feature_split.csv", index=False)