### Data Manipulation and Analysis with Pandas

Data manipulation and analysis are key tasks in any data science or data analysis project.
Pandas provides a wide range of functions for data manipulation and analysis, making it easier to cean, transform, and extract insights from data.

In [10]:
import pandas as pd

In [11]:
df = pd.read_csv('data.csv')
## Fetch the first 5 rows
df.head(5)

Unnamed: 0,Date,Category,Product,Sales_Value,Region
0,2025-04-23,Clothing,,30676.31,
1,2025-10-04,Sports,Football,17909.51,South
2,2025-06-01,Electronics,,6136.06,South
3,2025-04-04,Sports,,35204.68,West
4,2025-07-29,Furniture,Bed,44949.44,Central


In [12]:
df.tail(5)

Unnamed: 0,Date,Category,Product,Sales_Value,Region
45,2025-08-21,Beauty,Shampoo,19067.24,East
46,,,Toaster,48666.25,West
47,2025-02-02,Home Appliances,Blender,21059.2,East
48,2025-08-09,Clothing,Jeans,32251.62,West
49,2025-09-15,Beauty,Lotion,27292.32,


In [13]:
df.describe()

Unnamed: 0,Sales_Value
count,45.0
mean,24409.550222
std,14346.37285
min,646.27
25%,13304.03
50%,24422.47
75%,35204.68
max,49804.36


In [14]:
df.dtypes

Date            object
Category        object
Product         object
Sales_Value    float64
Region          object
dtype: object

In [15]:
# Handling Missing Values
df.isnull().any()

Date           True
Category       True
Product        True
Sales_Value    True
Region         True
dtype: bool

In [16]:
df.isnull().sum()

Date           5
Category       5
Product        5
Sales_Value    5
Region         5
dtype: int64

In [17]:
df_filled = df.fillna(0) # It will replace to zero in missing values

In [18]:
# Filling missing values with the mean of the column

df['Sales_fillNA'] = df['Sales_Value'].fillna(df['Sales_Value'].mean())
df

Unnamed: 0,Date,Category,Product,Sales_Value,Region,Sales_fillNA
0,2025-04-23,Clothing,,30676.31,,30676.31
1,2025-10-04,Sports,Football,17909.51,South,17909.51
2,2025-06-01,Electronics,,6136.06,South,6136.06
3,2025-04-04,Sports,,35204.68,West,35204.68
4,2025-07-29,Furniture,Bed,44949.44,Central,44949.44
5,,Sports,Gym Gloves,23237.92,,23237.92
6,2025-08-30,Clothing,Dress,16813.32,North,16813.32
7,2025-01-16,,Cream,29586.63,Central,29586.63
8,2025-10-05,Clothing,T-shirt,,,24409.550222
9,2025-08-24,Beauty,Perfume,7895.92,,7895.92


In [19]:
df.dtypes

Date             object
Category         object
Product          object
Sales_Value     float64
Region           object
Sales_fillNA    float64
dtype: object

In [20]:
# Renaming Columns

df = df.rename(columns={'Date' : 'Sale_Date'})
df.head()

Unnamed: 0,Sale_Date,Category,Product,Sales_Value,Region,Sales_fillNA
0,2025-04-23,Clothing,,30676.31,,30676.31
1,2025-10-04,Sports,Football,17909.51,South,17909.51
2,2025-06-01,Electronics,,6136.06,South,6136.06
3,2025-04-04,Sports,,35204.68,West,35204.68
4,2025-07-29,Furniture,Bed,44949.44,Central,44949.44


In [21]:
# Change Datatypes

# Replace missing values in 'Sales_Value' with the column's mean,
# convert all values to integers, and store in a new column 'Value_new'
df['Value_new'] = df['Sales_Value'].fillna(df['Sales_Value'].mean()).astype(int)

# Show first 5 rows to verify changes
df.head()

Unnamed: 0,Sale_Date,Category,Product,Sales_Value,Region,Sales_fillNA,Value_new
0,2025-04-23,Clothing,,30676.31,,30676.31,30676
1,2025-10-04,Sports,Football,17909.51,South,17909.51,17909
2,2025-06-01,Electronics,,6136.06,South,6136.06,6136
3,2025-04-04,Sports,,35204.68,West,35204.68,35204
4,2025-07-29,Furniture,Bed,44949.44,Central,44949.44,44949


In [22]:
# Create a new column 'New_Product' by applying a function to each value in 'Product'
# The lambda function duplicates each product name (e.g., "Laptop" → "LaptopLaptop")
df['New_Product'] = df['Product'].apply(lambda x: x * 2)

# Display first 5 rows to see the result
df.head()

Unnamed: 0,Sale_Date,Category,Product,Sales_Value,Region,Sales_fillNA,Value_new,New_Product
0,2025-04-23,Clothing,,30676.31,,30676.31,30676,
1,2025-10-04,Sports,Football,17909.51,South,17909.51,17909,FootballFootball
2,2025-06-01,Electronics,,6136.06,South,6136.06,6136,
3,2025-04-04,Sports,,35204.68,West,35204.68,35204,
4,2025-07-29,Furniture,Bed,44949.44,Central,44949.44,44949,BedBed


#### Data Aggregating and Grouping

In [23]:
df.head()

Unnamed: 0,Sale_Date,Category,Product,Sales_Value,Region,Sales_fillNA,Value_new,New_Product
0,2025-04-23,Clothing,,30676.31,,30676.31,30676,
1,2025-10-04,Sports,Football,17909.51,South,17909.51,17909,FootballFootball
2,2025-06-01,Electronics,,6136.06,South,6136.06,6136,
3,2025-04-04,Sports,,35204.68,West,35204.68,35204,
4,2025-07-29,Furniture,Bed,44949.44,Central,44949.44,44949,BedBed


In [24]:
# Group the DataFrame rows based on unique 'Product' names and calculate the average (mean) of 'Sales_Value' for each product group
grouped_mean = df.groupby('Product')['Sales_Value'].mean()

print(grouped_mean)

Product
Bed           24607.743333
Blender       21059.200000
Camera        38405.247500
Chair          4247.245000
Cream         35659.076667
Cupboard      13304.030000
Dress         16813.320000
Football      17909.510000
Gym Gloves    10779.680000
Iron          17205.580000
Jacket        26298.100000
Jeans         37578.550000
Laptop        45098.690000
Lipstick      41711.990000
Lotion        22219.625000
Perfume        7895.920000
Shampoo       20975.603333
Sofa          14000.800000
T-shirt                NaN
Table          6030.020000
Tablet        23409.597500
Toaster       43011.790000
Name: Sales_Value, dtype: float64


In [25]:
grouped_sum = df.groupby(['Product', 'Region'])['Sales_Value'].sum()
print(grouped_sum)

Product     Region 
Bed         Central    44949.44
            East       18993.70
            West        9880.09
Blender     East       21059.20
Camera      Central    47858.71
            South      36798.54
            West       68963.74
Chair       North       7848.22
            South        646.27
            West           0.00
Cream       Central    58270.81
            South      48706.42
Cupboard    North      13304.03
Dress       North      16813.32
Football    South      17909.51
Gym Gloves  East        2749.21
            South       6351.91
Iron        Central    17205.58
Jacket      Central        0.00
            East       26298.10
Jeans       Central    42905.48
            West       32251.62
Laptop      Central    45098.69
Lipstick    Central    41711.99
Lotion      West       17146.93
Shampoo     East       19067.24
            South      25948.33
            West       17911.24
Sofa        North      14000.80
Table       Central     6030.02
            South   

In [27]:
df.groupby(['Product', 'Region'])['Sales_Value'].mean()

Product     Region 
Bed         Central    44949.440
            East       18993.700
            West        9880.090
Blender     East       21059.200
Camera      Central    47858.710
            South      36798.540
            West       34481.870
Chair       North       7848.220
            South        646.270
            West             NaN
Cream       Central    29135.405
            South      48706.420
Cupboard    North      13304.030
Dress       North      16813.320
Football    South      17909.510
Gym Gloves  East        2749.210
            South       6351.910
Iron        Central    17205.580
Jacket      Central          NaN
            East       26298.100
Jeans       Central    42905.480
            West       32251.620
Laptop      Central    45098.690
Lipstick    Central    41711.990
Lotion      West       17146.930
Shampoo     East       19067.240
            South      25948.330
            West       17911.240
Sofa        North      14000.800
Table       Central    

In [29]:
# Aggregate multiple functions

grouped_agg = df.groupby('Region')['Sales_Value'].agg(['mean', 'sum', 'count'])
grouped_agg

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Central,34290.123,342901.23,10
East,18799.28375,150394.27,8
North,20354.146,101770.73,5
South,18423.651,184236.51,10
West,28753.06875,230024.55,8


In [30]:
# Merging and joining Dataframes
# Create sample Dataframes

df1 = pd.DataFrame({'Key' : ['A', 'B', 'C'], 'Value1' : [1, 2, 3]})
df2 = pd.DataFrame({'Key' : ['A', 'B', 'D'], 'Value1' : [4, 5, 6]})

In [31]:
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [32]:
df2

Unnamed: 0,Key,Value1
0,A,4
1,B,5
2,D,6


In [35]:
# Merge DataFrame on the 'Key Columns'
pd.merge(df1, df2, on = "Key", how = "inner")

# pd.merge() → function used to combine two DataFrames.
# df1, df2 → the two DataFrames being merged.
# on="Key" → specifies the common column ('Key') used for matching rows.
# how="inner" → performs an inner join, meaning only rows with matching 'Key' values in both DataFrames will be included in the result.

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1,4
1,B,2,5


In [36]:
pd.merge(df1, df2, on = "Key", how = "outer")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [37]:
pd.merge(df1, df2, on = "Key", how = "left")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [38]:
pd.merge(df1, df2, on = "Key", how = "right")

Unnamed: 0,Key,Value1_x,Value1_y
0,A,1.0,4
1,B,2.0,5
2,D,,6
