In [3]:
import numpy as np
import pandas as pd

## Pandas Aggregate Function

In [1]:
# Syntax (reference only; this cell executes nothing)
# df.aggregate(func, axis=0, *args, **kwargs)
# The cells below pass `func` as a single function name ('sum'), as a list of
# names, and as a dict mapping column names to one or more function names.

#### Apply Single Aggregate Function

In [3]:
# Sample records: six rows, each with a category label and a numeric value.
data = {
    "Category": ["A", "A", "B", "B", "A", "B"],
    "Value": [10, 15, 20, 25, 30, 35],
}

In [5]:
# Build a DataFrame from the dict of equal-length column lists defined above.
df = pd.DataFrame(data)

In [6]:
# Calculate the total of the 'Value' column: aggregate() with a single
# function name reduces the Series to one scalar.
total_sum = df['Value'].aggregate('sum')
total_sum

135

In [7]:
# Calculate the mean of the 'Value' column (a single scalar result).
average_value = df['Value'].aggregate('mean')
average_value

22.5

In [8]:
# Calculate the maximum of the 'Value' column (a single scalar result).
max_value = df['Value'].aggregate('max')
max_value

35

#### Apply Multiple Aggregate Functions in Pandas

In [9]:
# Same sample records as before: a category label and a numeric value per row.
data = {
    "Category": ["A", "A", "B", "B", "A", "B"],
    "Value": [10, 15, 20, 25, 30, 35],
}

In [11]:
# Build the DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Category,Value
0,A,10
1,A,15
2,B,20
3,B,25
4,A,30


In [13]:
# Apply several aggregation functions to one column, per category.
# The result has one row per Category and one column per function.
(
    df.groupby("Category")["Value"]
    .aggregate(["sum", "mean", "max", "min"])
)

Unnamed: 0_level_0,sum,mean,max,min
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,55,18.333333,30,10
B,80,26.666667,35,20


In [14]:
# Apply different aggregation functions to different columns:
# sample records with one category label and two numeric columns.
data = {
    "Category": ["A", "A", "B", "B", "A", "B"],
    "Value1": [10, 15, 20, 25, 30, 35],
    "Value2": [5, 8, 12, 15, 18, 21],
}

In [15]:
# Build the DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Category,Value1,Value2
0,A,10,5
1,A,15,8
2,B,20,12
3,B,25,15
4,A,30,18


In [16]:
# Per-column aggregation spec: a single function for Value1 and a list of
# two functions for Value2.
agg_funcs = {
    "Value1": "sum",
    "Value2": ["mean", "max"],
}

In [17]:
# Apply the per-column spec to each Category group. The result's columns are
# labelled on two levels: the source column and the function applied to it.
df.groupby('Category').aggregate(agg_funcs)

Unnamed: 0_level_0,Value1,Value2,Value2
Unnamed: 0_level_1,sum,mean,max
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,55,10.333333,18
B,80,16.0,21


## Python Pandas Filtering

We can filter the data in Pandas in two main ways:

* By column names (labels)
* By the actual data inside (values)

#### Filter Data By Labels

In [18]:
# Sample employee records: name, department and salary.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Marketing", "Marketing", "IT"],
    "Salary": [50000, 60000, 55000, 70000],
}

In [19]:
# Build the DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


In [20]:
# Use filter(items=...) to keep only the columns whose labels are listed.
# This selects by label; it does not look at the values inside the columns.
filtered_df = df.filter(items = ['Name', 'Salary'])
filtered_df

Unnamed: 0,Name,Salary
0,Alice,50000
1,Bob,60000
2,Charlie,55000
3,David,70000


In [21]:
# Alternative: select the same columns by passing a list of labels in brackets.
fil_df = df[['Name', 'Salary']]

In [22]:
# Display the bracket-based selection (Name and Salary columns only).
fil_df

Unnamed: 0,Name,Salary
0,Alice,50000
1,Bob,60000
2,Charlie,55000
3,David,70000


#### Filter Data By Values - Comparison Operators

In [23]:
# Sample employee records: name, department and salary.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Marketing", "Marketing", "IT"],
    "Salary": [50000, 60000, 55000, 70000],
}

In [24]:
# Build the DataFrame and preview its first rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


In [25]:
# Filter rows with a comparison operator: `df['Salary'] > 50000` yields one
# True/False per row, and indexing df with it keeps only the True rows.
filtered_df = df[df['Salary'] > 50000]
filtered_df

Unnamed: 0,Name,Department,Salary
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


#### The isin() Method

In [27]:
# Sample employee records: name, department and salary.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Marketing", "Marketing", "IT"],
    "Salary": [50000, 60000, 55000, 70000],
}

In [28]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


In [29]:
# Keep only the rows whose Department is one of the listed values.
filtered_df = df[df['Department'].isin(['HR','IT'])]
filtered_df

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
3,David,IT,70000


#### The str Accessor

In [30]:
# Sample employee records: name, department and salary.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Marketing", "Marketing", "IT"],
    "Salary": [50000, 60000, 55000, 70000],
}

In [31]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


In [32]:
# Keep the rows whose Department contains the literal substring 'Market'.
# regex=False makes this a plain substring test; by default str.contains()
# interprets the pattern as a regular expression, so characters such as '.'
# or '(' in a search term would otherwise be treated as regex syntax.
filtered_df = df[df['Department'].str.contains('Market', regex=False)]
filtered_df

Unnamed: 0,Name,Department,Salary
1,Bob,Marketing,60000
2,Charlie,Marketing,55000


#### The query() Method

In [33]:
# Sample employee records: name, department and salary.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Department": ["HR", "Marketing", "Marketing", "IT"],
    "Salary": [50000, 60000, 55000, 70000],
}

In [34]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000
1,Bob,Marketing,60000
2,Charlie,Marketing,55000
3,David,IT,70000


In [35]:
# query() takes the filter as a string expression over column names.
# Both conditions are joined with `and`, so a row must satisfy each of them.
filtered_df = df.query('Salary > 55000 and Department == "Marketing"')
filtered_df

Unnamed: 0,Name,Department,Salary
1,Bob,Marketing,60000


## Pandas Sort

#### Sort DataFrame in Pandas - we can use the sort_values() function to sort a DataFrame

In [36]:
# Sample records: three people and their ages (deliberately unsorted).
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [28, 22, 25],
}

In [37]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,Alice,28
1,Bob,22
2,Charlie,25


In [38]:
# Sort the DataFrame by 'Age' in ascending order.
# Rows keep their original index labels, so the index appears reordered.
sorted_df = df.sort_values(by='Age')
sorted_df

Unnamed: 0,Name,Age
1,Bob,22
2,Charlie,25
0,Alice,28


In [39]:
# Sort the DataFrame by 'Age' in descending order by passing ascending=False.
sorted_df_desc = df.sort_values(by='Age', ascending=False)
sorted_df_desc

Unnamed: 0,Name,Age
0,Alice,28
2,Charlie,25
1,Bob,22


#### Sort Pandas DataFrame by Multiple Columns

In [40]:
# Sample records with a tie on Age (Bob and David are both 22), so the
# second sort key decides their relative order.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "Age": [25, 22, 30, 22],
    "Score": [85, 90, 75, 80],
}

In [41]:
# Build the DataFrame used by the multi-column sorting examples below.
df = pd.DataFrame(data)

In [42]:
# 1. Sort the DataFrame by 'Age' and then by 'Score' (both in ascending order).
#    'Score' only decides the order among rows that share the same 'Age'.
df1 = df.sort_values(by = ['Age', 'Score'])
df1

Unnamed: 0,Name,Age,Score
3,David,22,80
1,Bob,22,90
0,Alice,25,85
2,Charlie,30,75


In [43]:
# 2. Sort the DataFrame by 'Age' in ascending order, and then by 'Score' in
#    descending order: the `ascending` list lines up with the `by` list.
df2 = df.sort_values(by = ['Age', 'Score'], ascending=[True, False])
df2

Unnamed: 0,Name,Age,Score
1,Bob,22,90
3,David,22,80
0,Alice,25,85
2,Charlie,30,75


#### Sort Pandas Series - we can use the sort_values() function to sort a Series

In [44]:
# A named Series of three unsorted ages, with the default 0..2 index.
ages = pd.Series(
    [28, 22, 25],
    name="Age",
)
ages

0    28
1    22
2    25
Name: Age, dtype: int64

In [45]:
# Sort the Series values in ascending order; each value keeps its index label.
sort_ages = ages.sort_values()
sort_ages

1    22
2    25
0    28
Name: Age, dtype: int64

In [46]:
# Sort the Series values in descending order.
sort_ages = ages.sort_values(ascending = False)
sort_ages

0    28
2    25
1    22
Name: Age, dtype: int64

#### Sort Pandas DataFrame by Index Using sort_index()



In [47]:
# Sample records: three people and their ages.
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [28, 22, 25],
}

In [48]:
# Create a DataFrame with a non-sequential index: the labels 2, 0, 1 are
# assigned to the rows in the order given, so the index starts out unsorted.
df = pd.DataFrame(data, index = [2, 0, 1])
df

Unnamed: 0,Name,Age
2,Alice,28
0,Bob,22
1,Charlie,25


In [49]:
# Sort the rows by their index labels (ascending), not by any column's values.
sorted_df = df.sort_index()
sorted_df

Unnamed: 0,Name,Age
0,Bob,22
1,Charlie,25
2,Alice,28


## Pandas Correlation

Correlation is a statistical concept that quantifies the degree to which two variables are related to each other. <br>

Correlation can be calculated in Pandas using the corr() function.

In [1]:
# Five paired observations of temperature and ice cream sales.
data = {
    "Temperature": [22, 25, 32, 28, 30],
    "Ice_Cream_Sales": [105, 120, 135, 130, 125],
}

In [4]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Temperature,Ice_Cream_Sales
0,22,105
1,25,120
2,32,135
3,28,130
4,30,125


In [5]:
# Calculate the correlation matrix.
# The DataFrame is left as the cell's last expression so the notebook renders
# it as a table, rather than print() emitting its plain-text repr.
df.corr()

                 Temperature  Ice_Cream_Sales
Temperature         1.000000         0.923401
Ice_Cream_Sales     0.923401         1.000000


The output is a correlation matrix that displays the correlation coefficients between all pairs of columns in the dataframe. In this case, there are only two columns, so the matrix is 2x2. <br>

Here, the correlation coefficient between Temperature and Ice_Cream_Sales is 0.923401, which is positive and close to 1. This indicates a strong positive linear relationship: ice cream sales tend to increase as the temperature increases. <br>

The coefficient value of 1.000000 along the diagonal represents the correlation of each column with itself.

#### Correlation Between Two Columns

In [6]:
# Five paired observations of temperature and ice cream sales.
data = {
    "Temperature": [22, 25, 32, 28, 30],
    "Ice_Cream_Sales": [105, 120, 135, 130, 125],
}

In [7]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Temperature,Ice_Cream_Sales
0,22,105
1,25,120
2,32,135
3,28,130
4,30,125


In [8]:
# Series.corr() against another Series returns a single coefficient for that
# pair of columns, instead of the full matrix produced by DataFrame.corr().
correlation = df['Temperature'].corr(df['Ice_Cream_Sales'])

In [9]:
# Display the coefficient computed in the previous cell.
correlation

0.9234007664064656

#### Correlation when Missing Values are present

In [10]:
# Five temperature readings paired with coffee sales; two of the sales
# figures are missing (NaN).
data = {
    "Temperature": [22, 25, 32, 28, 30],
    "Coffee_Sales": [158, 145, np.nan, np.nan, 140],
}

In [11]:
# Build the DataFrame and display it; the missing sales show up as NaN.
df = pd.DataFrame(data)
df

Unnamed: 0,Temperature,Coffee_Sales
0,22,158.0
1,25,145.0
2,32,
3,28,
4,30,140.0


In [12]:
# With NaN values still present in Coffee_Sales.
# The result equals the one computed after dropna() further below, i.e. the
# rows with a missing value do not contribute to the coefficient.
corr1 = df['Temperature'].corr(df['Coffee_Sales'])
corr1

-0.923177938058926

In [13]:
# Remove rows with missing values.
# dropna() returns a new frame, so the result is rebound to `df` instead of
# mutating the existing object with inplace=True. The name `df` is kept
# because the following cell reads `df`.
df = df.dropna()
df

Unnamed: 0,Temperature,Coffee_Sales
0,22,158.0
1,25,145.0
4,30,140.0


In [14]:
# Without NaN values: the same calculation on the frame after dropna().
corr2 = df['Temperature'].corr(df['Coffee_Sales'])
corr2

-0.923177938058926

#### Correlation Methods in Pandas

We can calculate correlation using three different methods in Pandas:

* Pearson Method (Default): evaluates the linear relationship between two continuous variables
* Kendall Method: measures the ordinal association between two measured quantities
* Spearman Method: evaluates the monotonic relationship between two continuous or ordinal variables

By default, corr() computes the Pearson correlation coefficient, which measures the linear relationship between two variables. Pass `method='kendall'` or `method='spearman'` to use one of the other methods.

In [15]:
# Five paired observations of temperature and ice cream sales.
data = {
    "Temperature": [22, 25, 32, 28, 30],
    "Ice_Cream_Sales": [105, 120, 135, 130, 125],
}

In [16]:
# Build the DataFrame and display it.
df = pd.DataFrame(data)
df

Unnamed: 0,Temperature,Ice_Cream_Sales
0,22,105
1,25,120
2,32,135
3,28,130
4,30,125


In [18]:
# Calculate the coefficient for the same pair of columns with each method.
# 'pearson' is corr()'s default, so naming it explicitly gives the same value
# as omitting the argument.
pearson, kendall, spearman = (
    df['Temperature'].corr(df['Ice_Cream_Sales'], method=method_name)
    for method_name in ('pearson', 'kendall', 'spearman')
)

In [19]:
# Print the three coefficients one per line, in the order
# Pearson, Kendall, Spearman.
print(pearson, kendall, spearman, sep="\n")

0.9234007664064656
0.7999999999999999
0.8999999999999998
