In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show the NumPy and pandas versions loaded in this kernel.
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)

NumPy Version: 1.26.4
Pandas Version: 2.2.2


# Data Manipulation with Pandas

## Topics Covered:
- Reading a CSV file
- Handling Missing Values
    - Checking for Missing Values
    - Filling Missing Values
- Data Aggregating and Grouping
- Creating, Merging and Joining Dataframes

## Reading a CSV File:

- **variable = pd.read_csv('FileName.csv')**
- **variable.head()**
    - Gives the first 5 rows of the DataFrame by default
- **variable.tail()**
    - Gives the last 5 rows of the DataFrame by default
- **variable.dtypes**
    - Gives the datatypes of the values that the columns contain (object = String)
- **variable.describe()**
    - Gives the count, mean, standard deviation, min, max and the 25th/50th/75th percentiles of the numeric columns

In [3]:
# Load the dataset into a DataFrame. 'data.csv' is a relative path, so it is
# looked up in the directory the notebook kernel is running from.
df = pd.read_csv('data.csv')

In [55]:
# Preview the first 5 rows (head() defaults to 5).
# NOTE(review): the saved output below already contains columns that are only
# created by later cells (e.g. 'Sales_fillNA'), so this cell was last run out of
# order; presumably a fresh top-to-bottom run shows only the raw CSV columns.
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0,52.0


In [54]:
# Preview the last 5 rows (tail() defaults to 5).
df.tail()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
45,2023-02-15,B,99.0,Product2,599.0,West,599.0,99,198.0,198.0
46,2023-02-16,B,6.0,Product1,938.0,South,938.0,6,12.0,12.0
47,2023-02-17,B,69.0,Product3,143.0,West,143.0,69,138.0,138.0
48,2023-02-18,C,65.0,Product3,182.0,North,182.0,65,130.0,130.0
49,2023-02-19,C,11.0,Product3,708.0,North,708.0,11,22.0,22.0


In [6]:
# Summary statistics for the numeric columns; 'count' only counts non-missing
# entries, which is why it can be lower than the number of rows.
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [53]:
# Data type of every column.
df.dtypes

# 'object' is the dtype pandas reports for the text (string) columns here

Sales Date             object
Category               object
Value                 float64
Product                object
Sales                 float64
Region                 object
Sales_fillNA          float64
Value_new               int32
New_Value             float64
Multiply_New_Value    float64
dtype: object

## Handling Missing Values

### Checking for Missing Values

- **variable.isnull()**
    - Returns a DataFrame of the same shape, with True/False in place of every value
    - True: Missing Value
    - False: No Missing Value
- **variable.isnull().any(axis = 1)**
    - Returns one True/False per row, indexed by row number
    - True: Missing Value
    - False: No Missing Value
- **variable.isnull().any()**
    - Returns one True/False per column, indexed by column name
    - True: Missing Value
    - False: No Missing Value
- **variable.isnull().sum()**
    - Lists all Column Names one below another
    - Shows the total number of missing values each Column contains
- **variable.isnull().any(axis = 1).sum()**
    - Shows total number of Rows which have Missing Values

In [10]:
# Element-wise missing-value mask with the same shape as df.
df.isnull()

# False - No Missing Value
# True - Has Missing Value

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [11]:
# One boolean per row: True if that row has at least one missing value.
df.isnull().any(axis = 1)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12    False
13    False
14    False
15     True
16    False
17     True
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28     True
29    False
30    False
31    False
32    False
33     True
34    False
35     True
36    False
37     True
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
dtype: bool

In [12]:
# One boolean per column: True if that column has at least one missing value.
df.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [13]:
# Number of missing values in each column (each True counts as 1 when summed).
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [59]:
# Number of rows that contain at least one missing value.
df.isnull().any(axis = 1).sum()

7

## Handling Missing Values

### Filling Missing Values
- **variable.fillna(0)**
    - Fills all Missing Values with 0
- **variable['Sales'].fillna(variable['Sales'].mean())**
    - Fills all Missing Values of 'Sales' column with the mean of the rest of the values
- **variable = variable.rename(columns = {'Date':'Sales Date'})**
    - Renames a column
- **variable['Value'].fillna(variable['Value'].mean()).astype(int)**
    - Changes the datatype of the 'Value' column to int (missing values are filled first because NaN cannot be converted to int)
- **df['Value'].apply(lambda x:x*2)**
    - Changing the values of 'Value' column using lambda function

In [60]:
df.fillna(0)

# Missing values are replaced with 0 in the returned DataFrame only;
# the result is not assigned, so df itself still contains its NaNs.

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0,52.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0,54,108.0,108.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0,16,32.0,32.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0,89,178.0,178.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0,37,74.0,74.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0,22,44.0,44.0


In [61]:
# Keep the zero-filled result under its own name; fillna() returns a new
# DataFrame, so df still holds the original missing values.
df_filled = df.fillna(0)
df_filled

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0,52.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0,54,108.0,108.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0,16,32.0,32.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0,89,178.0,178.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0,37,74.0,74.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0,22,44.0,44.0


In [19]:
# Impute missing 'Sales' entries with the column mean.
# The result goes into a new 'Sales_fillNA' column, so the original 'Sales'
# column (with its NaNs) stays available for comparison.
sales_mean = df['Sales'].mean()
df['Sales_fillNA'] = df['Sales'].fillna(sales_mean)
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0


In [21]:
# Data type of every column, now including the new 'Sales_fillNA' column.
df.dtypes

# 'object' is the dtype pandas reports for the text (string) columns here

Date             object
Category         object
Value           float64
Product          object
Sales           float64
Region           object
Sales_fillNA    float64
dtype: object

In [23]:
# Renaming Columns
# rename() returns a new DataFrame, so the result is assigned back to df.
# The mapping is {old name: new name}.

df = df.rename(columns = {'Date':'Sales Date'})
df.head(3)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0


In [27]:
# Change datatypes: build an integer version of 'Value' in a new column.
# NaN cannot be converted to int, so the missing entries are replaced with the
# column mean first; astype(int) then drops the fractional part.

value_filled = df['Value'].fillna(df['Value'].mean())
df['Value_new'] = value_filled.astype(int)
df.head(3)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32


In [30]:
# Double every entry of 'Value' with a lambda applied element by element and
# store the result in a new column; missing entries stay missing.
# NOTE(review): the saved output also shows a 'New_Value' column that no cell in
# this notebook creates - presumably left over from a deleted cell; confirm.
df['Multiply_New_Value'] = df['Value'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0,52.0


## Data Grouping

- **grouped_mean_product_value = df.groupby('Product')['Value'].mean()**
    - Gives the mean of values of 'Value' of every 'Product'
- **grouped_mean_product_sales = df.groupby('Product')['Sales'].mean()**
    - Gives the mean of values of 'Sales' of every 'Product'
- **grouped_mean_region_value = df.groupby('Region')['Value'].mean()**
    - Gives the mean of values of 'Value' of every 'Region'
- **grouped_mean_region_sales = df.groupby('Region')['Sales'].mean()**
    - Gives the mean of values of 'Sales' of every 'Region'
- **grouped_sum_region_sales = df.groupby('Region')['Sales'].sum()**
    - Gives the sum of values of 'Sales' of every 'Region'
 
- **grouped_region_value_sum = df.groupby(['Product','Region'])['Value'].sum()**
    - Sum of 'Value' grouped by 2 parameters
    - Gives the sum of values of 'Value' of every 'Product' for every 'Region'
- **grouped_region_mean = df.groupby(['Product','Region'])['Value'].mean()**
    - Mean of 'Value' by 2 parameters
    - Gives the mean of values of 'Value' of every 'Product' for every 'Region'

## Data Aggregating

### Aggregating Multiple Functions

- **grouped_agg1 = df.groupby(['Product','Region'])['Value'].agg(['mean','sum','count'])**
    - For every product for every region, find the Mean, Total Sum and Total Count or Frequency

In [66]:
# Data Aggregating and Grouping
# Quick look at the columns available for grouping ('Product', 'Region', ...).

df.head()

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_fillNA,Value_new,New_Value,Multiply_New_Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56.0,56.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78.0,78.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64.0,64.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16.0,16.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52.0,52.0


In [74]:
# Mean / sum of one column grouped by ONE parameter (Product or Region).
# Every result is a Series indexed by the group key; missing values are skipped.

# Mean of 'Value' per Product.
# Print the Series assigned just above, not a leftover name from an earlier run.
grouped_mean_product_value = df.groupby('Product')['Value'].mean()
print(grouped_mean_product_value)

# Mean of 'Sales' per Product.
grouped_mean_product_sales = df.groupby('Product')['Sales'].mean()
print(grouped_mean_product_sales)

# Mean of 'Value' per Region.
grouped_mean_region_value = df.groupby('Region')['Value'].mean()
print(grouped_mean_region_value)

# Mean of 'Sales' per Region.
grouped_mean_region_sales = df.groupby('Region')['Sales'].mean()
print(grouped_mean_region_sales)

# Total 'Sales' per Region.
grouped_sum_region_sales = df.groupby('Region')['Sales'].sum()
print(grouped_sum_region_sales)

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64
Product
Product1    574.866667
Product2    567.230769
Product3    535.055556
Name: Sales, dtype: float64
Region
East     42.307692
North    37.666667
South    62.000000
West     61.588235
Name: Value, dtype: float64
Region
East     584.7500
North    600.8000
South    644.7500
West     465.3125
Name: Sales, dtype: float64
Region
East     7017.0
North    6008.0
South    5158.0
West     7445.0
Name: Sales, dtype: float64


In [72]:
# Sums grouped by 2 parameters: one total per (Product, Region) pair.
# The grouping is built once and reused for both the 'Value' and 'Sales' columns;
# each result is a Series with a two-level (Product, Region) index.

by_product_region = df.groupby(['Product', 'Region'])

grouped_region_value_sum = by_product_region['Value'].sum()
print(grouped_region_value_sum)

grouped_region_sales_sum = by_product_region['Sales'].sum()
print(grouped_region_sales_sum)

Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64
Product   Region
Product1  East      4205.0
          North     1737.0
          South     1346.0
          West      1335.0
Product2  East       856.0
          North      843.0
          South     2240.0
          West      3435.0
Product3  East      1956.0
          North     3428.0
          South     1572.0
          West      2675.0
Name: Sales, dtype: float64


In [75]:
# Means grouped by 2 parameters: one average per (Product, Region) pair.
# The grouping is built once and reused for both the 'Value' and 'Sales' columns.

by_product_region = df.groupby(['Product', 'Region'])

grouped_region_mean = by_product_region['Value'].mean()
print(grouped_region_mean)

grouped_region_sales_mean = by_product_region['Sales'].mean()
print(grouped_region_sales_mean)

Product   Region
Product1  East      41.714286
          North      4.500000
          South     50.000000
          West      82.000000
Product2  East      28.000000
          North     63.500000
          South     60.333333
          West      53.500000
Product3  East      50.500000
          North     40.600000
          South     71.666667
          West      62.166667
Name: Value, dtype: float64
Product   Region
Product1  East      600.714286
          North     868.500000
          South     673.000000
          West      333.750000
Product2  East      428.000000
          North     421.500000
          South     746.666667
          West      572.500000
Product3  East      652.000000
          North     571.333333
          South     524.000000
          West      445.833333
Name: Sales, dtype: float64


In [43]:
# Aggregate multiple functions
# agg() with a list applies every listed function to each (Product, Region)
# group and returns one column per function; 'count' counts the non-missing
# 'Value' entries in the group.

grouped_agg1 = df.groupby(['Product','Region'])['Value'].agg(['mean','sum','count'])
grouped_agg1

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum,count
Product,Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Product1,East,41.714286,292.0,7
Product1,North,4.5,9.0,2
Product1,South,50.0,100.0,2
Product1,West,82.0,246.0,3
Product2,East,28.0,56.0,2
Product2,North,63.5,127.0,2
Product2,South,60.333333,181.0,3
Product2,West,53.5,428.0,8
Product3,East,50.5,202.0,4
Product3,North,40.6,203.0,5


In [42]:
# The same three aggregations of 'Value', grouped by Region only.
grouped_agg2 = df.groupby(['Region'])['Value'].agg(['mean','sum','count'])
grouped_agg2

Unnamed: 0_level_0,mean,sum,count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,42.307692,550.0,13
North,37.666667,339.0,9
South,62.0,496.0,8
West,61.588235,1047.0,17


## Creating DataFrames

- **df1 = pd.DataFrame({'Key':['A','B','C'], 'Value1': [1, 2, 3]})**
    - A DataFrame can be built from a dictionary: each key becomes a column name and its list becomes that column's values

## Merging and Joining DataFrames

- **pd.merge(df1, df2, on = "Key", how = "inner")**
    - Inner Join
    - Merging by the column of keys
    - Only rows of keys common to both DataFrames get merged
- **pd.merge(df1, df2, on = "Key", how = "outer")**
    - Outer Join
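    - All rows from both DataFrames are kept; where a key exists in only one of them, the other DataFrame's columns are filled with NaN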
- **pd.merge(df1, df2, on = "Key", how = "left")**
    - Left Outer Join / Left Join
    - Priority to df1 which is on the left
    - All common rows + all rows of df1
- **pd.merge(df1, df2, on = "Key", how = "right")**
    - Right Outer Join / Right Join
    - Priority to df2 which is on the right
    - All common rows + all rows of df2

In [44]:
# Merging and joining DataFrames: build two small sample frames.
# Both have a 'Key' column; keys 'A' and 'B' occur in both frames,
# 'C' only in df1 and 'D' only in df2.

df1 = pd.DataFrame(
    data={
        'Key': ['A', 'B', 'C'],
        'Value1': [1, 2, 3],
    }
)
df2 = pd.DataFrame(
    data={
        'Key': ['A', 'B', 'D'],
        'Value2': [4, 5, 6],
    }
)

In [45]:
# Display the left-hand sample frame (keys A, B, C).
df1

Unnamed: 0,Key,Value1
0,A,1
1,B,2
2,C,3


In [46]:
# Display the right-hand sample frame (keys A, B, D).
df2

Unnamed: 0,Key,Value2
0,A,4
1,B,5
2,D,6


In [47]:
# Merge DataFrames on the 'Key' column
# Inner join: only keys present in both df1 and df2 are kept.

pd.merge(df1, df2, on = "Key", how = "inner")

Unnamed: 0,Key,Value1,Value2
0,A,1,4
1,B,2,5


In [48]:
# Outer join: every key from either frame is kept; a value that is missing on
# one side becomes NaN (which is why the integer columns show up as floats).
pd.merge(df1, df2, on = "Key", how = "outer")

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [50]:
# Left Outer Join - every row of df1 (the left frame) is kept;
# keys that df2 does not have get NaN in df2's columns.

pd.merge(df1, df2, on = "Key", how = "left")

Unnamed: 0,Key,Value1,Value2
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [52]:
# Right Outer Join - every row of df2 (the right frame) is kept;
# keys that df1 does not have get NaN in df1's columns.

pd.merge(df1, df2, on = "Key", how = "right")

Unnamed: 0,Key,Value1,Value2
0,A,1.0,4
1,B,2.0,5
2,D,,6
