# Pandas Operations

- In this file you will find all the basic and common opearation performed with the help pf Pandas.

In [1]:
import pandas as pd
import numpy as np

### Create a Series

- There are multiple ways by which we can create the Series
1. using list
2. using numpy array
3. using dictionary
4. from scalar value
5. from a range

In [3]:
# Using list
data_list = [1,2,3,4,5]
series_from_list = pd.Series(data_list)
print(series_from_list)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [4]:
# Using Numpy array
numpy_array = np.array([1,3,2,4,5,5])
series_from_array = pd.Series(numpy_array)
print(series_from_array)

0    1
1    3
2    2
3    4
4    5
5    5
dtype: int32


In [5]:
# From Dictionary
data_dict = {'a':1,'b':2,'c':3}
series_from_dict = pd.Series(data_dict)
print(series_from_dict)

a    1
b    2
c    3
dtype: int64


In [6]:
# From scalar value
scalar_value = 5
index_label = ['a','b','c','d','e']
series_from_scalar = pd.Series(scalar_value, index=index_label)
print(series_from_scalar)

a    5
b    5
c    5
d    5
e    5
dtype: int64


In [7]:
# From a Range
start = 0
stop = 50
step = 10
series_range = pd.Series(pd.RangeIndex(start,stop,step))
print(series_range)

0     0
1    10
2    20
3    30
4    40
dtype: int64


### Create the DataFrame

- There are multiple ways to create the `DataFrame`
1. From Dictionary of list or arrays
2. From lists of dictionary
3. From list of lists or arrays with column and Index Labels


In [8]:
# From dictionary of lists or arrays
data = {'Name': ['John', 'Emma', 'James', 'Olivia', 'William'],
        'Age': [25, 30, 35, 40, 45],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male']}
df_from_dict = pd.DataFrame(data)
print(df_from_dict)

      Name  Age  Gender
0     John   25    Male
1     Emma   30  Female
2    James   35    Male
3   Olivia   40  Female
4  William   45    Male


In [9]:
# From list of dictionary

data = [{'Name': 'John', 'Age': 25, 'Gender': 'Male'},
        {'Name': 'Emma', 'Age': 30, 'Gender': 'Female'},
        {'Name': 'James', 'Age': 35, 'Gender': 'Male'},
        {'Name': 'Olivia', 'Age': 40, 'Gender': 'Female'},
        {'Name': 'William', 'Age': 45, 'Gender': 'Male'}]
df_from_list_of_dicts = pd.DataFrame(data)
print(df_from_list_of_dicts)

      Name  Age  Gender
0     John   25    Male
1     Emma   30  Female
2    James   35    Male
3   Olivia   40  Female
4  William   45    Male


In [10]:
# From list of lists or arrays with column and Index labels

data = [['John', 25, 'Male'],
        ['Emma', 30, 'Female'],
        ['James', 35, 'Male'],
        ['Olivia', 40, 'Female'],
        ['William', 45, 'Male']]
columns = ['Name', 'Age', 'Gender']
index = ['A', 'B', 'C', 'D', 'E']
df_from_list_of_lists = pd.DataFrame(data, columns=columns, index=index)
print(df_from_list_of_lists)


      Name  Age  Gender
A     John   25    Male
B     Emma   30  Female
C    James   35    Male
D   Olivia   40  Female
E  William   45    Male


### Add rows in the existing Dataframe
- There are multiple ways to create the Dataframe
1. using appending rows to dataframe
    - As this append function is going to be removed from the latest version of the pandas library, it is suggested that no to use this function
2. Using loc method
2. using concat

- **append**

In [27]:
# We can add multiple rows by using this append function

# Existing DataFrame
data = {'Name': ['John', 'Emma', 'James'],
        'Age': [25, 30, 35],
        'Gender': ['Male', 'Female', 'Male']}
df = pd.DataFrame(data)

# Dictionary representing the new row
new_row = {'Name': 'Olivia', 'Age': 40, 'Gender': 'Female'}

# Appending the new row
df = df.append(new_row, ignore_index=True)
print(df)

     Name  Age  Gender
0    John   25    Male
1    Emma   30  Female
2   James   35    Male
3  Olivia   40  Female


  df = df.append(new_row, ignore_index=True)


- **iloc**

In [28]:
import pandas as pd

# Existing DataFrame
data = {'Name': ['John', 'Emma', 'James'],
        'Age': [25, 30, 35],
        'Gender': ['Male', 'Female', 'Male']}
df = pd.DataFrame(data)

# Adding a row using loc
df.loc[3] = ['Olivia', 40, 'Female']
print(df)


     Name  Age  Gender
0    John   25    Male
1    Emma   30  Female
2   James   35    Male
3  Olivia   40  Female


- **using concat**

In [29]:
# Using the concat function

# Existing DataFrame
data = {'Name': ['John', 'Emma', 'James'],
        'Age': [25, 30, 35],
        'Gender': ['Male', 'Female', 'Male']}
df = pd.DataFrame(data)

# Dictionary representing the new row
new_row = {'Name': ['Olivia'], 'Age': [40], 'Gender': ['Female']}
temp = pd.DataFrame(new_row)

# Concating the dataframe
df = pd.concat([df, temp])
print(df)

     Name  Age  Gender
0    John   25    Male
1    Emma   30  Female
2   James   35    Male
0  Olivia   40  Female


### Data Loading and Saving

- Load the SQL data

In [None]:
# Laoding the SQL data into pandas dataframe
import pandas as pd
import sqlite3

# Establishing a connection to the SQL database
conn = sqlite3.connect('your_database.db')  # Replace 'your_database.db' with the path to your SQLite database
# If you are using a different SQL database like MySQL, PostgreSQL, etc.,
# you would need to use appropriate connection parameters and libraries for that specific database.

# Writing your SQL query
query = "SELECT * FROM your_table;"  # Replace 'your_table' with the name of the table you want to query

# Loading data into a DataFrame
df = pd.read_sql(query, conn)

# Closing the database connection
conn.close()

# Displaying the DataFrame
print(df)


- Load the html file

In [None]:
import pandas as pd

# Reading HTML file into a list of DataFrames
dfs = pd.read_html('your_file.html')  # Replace 'your_file.html' with the path to your HTML file

# Accessing the DataFrame(s) from the list
for i, df in enumerate(dfs):
    print(f"DataFrame {i + 1}:")
    print(df)
    print()


- Load the json file

In [None]:
import pandas as pd

# Reading JSON file into a DataFrame
df = pd.read_json('your_file.json')  # Replace 'your_file.json' with the path to your JSON file

# Displaying the DataFrame
print(df)

### Reshape the data

- Pivot table

**pivot(): Reshape data based on column values.**

In [30]:
# Creating a DataFrame
data = {'Date': ['2022-01-01', '2022-01-02', '2022-01-03'],
        'City': ['New York', 'Los Angeles', 'Chicago'],
        'Temperature': [32, 75, 50]}
df = pd.DataFrame(data)

# Reshaping using pivot
pivot_df = df.pivot(index='Date', columns='City', values='Temperature')
print(pivot_df)

City        Chicago  Los Angeles  New York
Date                                      
2022-01-01      NaN          NaN      32.0
2022-01-02      NaN         75.0       NaN
2022-01-03     50.0          NaN       NaN


**pivot_table(): Reshape data while handling duplicate entries by aggregating values.**

In [32]:
# Using pivot_table
pivot_table_df = df.pivot_table(index='Date', columns='City', values='Temperature', aggfunc='mean')
print(pivot_table_df)

City        Chicago  Los Angeles  New York
Date                                      
2022-01-01      NaN          NaN      32.0
2022-01-02      NaN         75.0       NaN
2022-01-03     50.0          NaN       NaN


**stack(): Reshape data by pivoting the innermost level of column labels.**

In [33]:
# Using stack
stacked_df = pivot_df.stack()
print(stacked_df)

Date        City       
2022-01-01  New York       32.0
2022-01-02  Los Angeles    75.0
2022-01-03  Chicago        50.0
dtype: float64


**unstack(): Reverse the operation of stack, pivoting the innermost level of row labels**

In [35]:
# Using unstack
unstacked_df = stacked_df.unstack()
print(unstacked_df)

City        Chicago  Los Angeles  New York
Date                                      
2022-01-01      NaN          NaN      32.0
2022-01-02      NaN         75.0       NaN
2022-01-03     50.0          NaN       NaN


### Concat

In [37]:
# Creating two DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3],
                    'B': [4, 5, 6]})
df2 = pd.DataFrame({'A': [7, 8, 9],
                    'B': [10, 11, 12]})

# Concatenating along rows
result = pd.concat([df1, df2], axis=0)
print(result)

   A   B
0  1   4
1  2   5
2  3   6
0  7  10
1  8  11
2  9  12


### Merge

- Merge function is based on the column's values. It allows you to merge the dataframe based on common columns or columns with different names.

- Inner Merge : An inner merge returns only the rows that have matching values in both DataFrames.

In [39]:
# Creating two DataFrames
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'],
                    'value': [1, 2, 3, 4]})
df2 = pd.DataFrame({'key': ['B', 'D', 'E', 'F'],
                    'value': [5, 6, 7, 8]})

# Performing inner merge
inner_merge = pd.merge(df1, df2, on='key', how='inner')
print(inner_merge)


  key  value_x  value_y
0   B        2        5
1   D        4        6


- Left merge : A left merge returns all the rows from the left DataFrame and the matched rows from the right DataFrame. If there is no match, NaN values are filled.

In [40]:
# Performing left merge
left_merge = pd.merge(df1, df2, on='key', how='left')
print(left_merge)

  key  value_x  value_y
0   A        1      NaN
1   B        2      5.0
2   C        3      NaN
3   D        4      6.0


- Right Merge : A right merge returns all the rows from the right DataFrame and the matched rows from the left DataFrame. If there is no match, NaN values are filled.

In [41]:
# Performing right merge
right_merge = pd.merge(df1, df2, on='key', how='right')
print(right_merge)


  key  value_x  value_y
0   B      2.0        5
1   D      4.0        6
2   E      NaN        7
3   F      NaN        8


- Outer Merge : An outer merge returns all the rows from both DataFrames and fills in NaN values for missing matches.

In [43]:
# Performing outer merge
outer_merge = pd.merge(df1, df2, on='key', how='outer')
print(outer_merge)


  key  value_x  value_y
0   A      1.0      NaN
1   B      2.0      5.0
2   C      3.0      NaN
3   D      4.0      6.0
4   E      NaN      7.0
5   F      NaN      8.0


### Join

- Join is based on the indexes of the dataframe. It allows you to join the Dataframes based on theie Indexes

- **Left Join**: Returns all the rows from the left DataFrame and the matched rows from the right DataFrame. If there is no match, NaN values are filled.

In [3]:
import pandas as pd

# Creating two DataFrames
df1 = pd.DataFrame({'A': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'B': [4, 5, 6]}, index=['b', 'c', 'd'])

# Left join
left_join = df1.join(df2, how='left')
print("Left join:")
print(left_join)

Left join:
   A    B
a  1  NaN
b  2  4.0
c  3  5.0


- **Right Join**: Returns all the rows from the right DataFrame and the matched rows from the left DataFrame. If there is no match, NaN values are filled.

In [4]:
# Right join
right_join = df1.join(df2, how='right')
print("\nRight join:")
print(right_join)


Right join:
     A  B
b  2.0  4
c  3.0  5
d  NaN  6


- **Inner Join**: Returns only the rows that have matching index values in both DataFrames.

In [5]:
# Inner join
inner_join = df1.join(df2, how='inner')
print("\nInner join:")
print(inner_join)


Inner join:
   A  B
b  2  4
c  3  5


- **Outer Join** : Returns all the rows from both DataFrames and fills in NaN values for missing matches.

In [6]:
# Outer join
outer_join = df1.join(df2, how='outer')
print("\nOuter join:")
print(outer_join)


Outer join:
     A    B
a  1.0  NaN
b  2.0  4.0
c  3.0  5.0
d  NaN  6.0


### Selection of dataframe using iloc and loc

- **iloc**:
    - iloc is used for integer-based indexing. 
    - It is primarily used when you want to access DataFrame elements by their integer position.
    - You specify the row and column indices using integers, starting from 0.
    - The syntax for iloc is df.iloc[row_index, column_index].

In [11]:
import pandas as pd

# Creating a sample DataFrame
data = {'A': [1, 2, 3],
        'B': [4, 5, 6],
        'C': [7, 8, 9]}
df = pd.DataFrame(data, index=['X', 'Y', 'Z'])

# Using iloc to access data by integer position
print("Using iloc:")
print(df.iloc[0, 1])  # Accessing element at row 0, column 1 (value: 4)
print(df.iloc[1])     # Accessing entire row at index 1 (row 'Y')
print(df.iloc[:, 2])  # Accessing entire column at index 2 (column 'C')

Using iloc:
4
A    2
B    5
C    8
Name: Y, dtype: int64
X    7
Y    8
Z    9
Name: C, dtype: int64


- **loc**:

    - loc is used for label-based indexing. It is primarily used when you want to access DataFrame elements by their row and column labels.
    - You specify the row and column labels directly.
    - The syntax for loc is df.loc[row_label, column_label].

In [12]:
# Using loc to access data by label
print("\nUsing loc:")
print(df.loc['X', 'B'])  # Accessing element at row 'X', column 'B' (value: 4)
print(df.loc['Y'])     # Accessing entire row with label 'Y'
print(df.loc[:, 'C'])  # Accessing entire column with label 'C'


Using loc:
4
A    2
B    5
C    8
Name: Y, dtype: int64
X    7
Y    8
Z    9
Name: C, dtype: int64


### Filtering the DataFrame

- **Boolean Indexing** : Use boolean expressions to filter rows based on conditions.

In [15]:
import pandas as pd

# Creating a sample DataFrame
data = {'A': [1, 2, 3, 4, 5],
        'B': ['a', 'b', 'c', 'd', 'e']}
df = pd.DataFrame(data)

# Filtering rows where values in column 'A' are greater than 2
filtered_df = df[df['A'] > 2]
print(filtered_df)


   A  B
2  3  c
3  4  d
4  5  e


- **Query Method** : Use the query() method to filter rows based on a query string.

In [16]:
# Filtering rows where values in column 'A' are greater than 2 using query method
filtered_df = df.query('A > 2')
print(filtered_df)

   A  B
2  3  c
3  4  d
4  5  e


- **loc Method** : Use the loc[] method to filter rows based on labels or boolean arrays.

In [18]:
# Filtering rows where values in column 'A' are greater than 2 using loc method
filtered_df = df.loc[df['A'] > 2]
print(filtered_df)

   A  B
2  3  c
3  4  d
4  5  e


- **isin Method**: Use the isin() method to filter rows based on whether values are in a list.

In [20]:
# Filtering rows where values in column 'B' are in a list
filtered_df = df[df['B'].isin(['a', 'c', 'e'])]
print(filtered_df)

   A  B
0  1  a
2  3  c
4  5  e


- **Filtering Columns** : Use column indexing or the filter() method to filter columns.

In [24]:
# Filtering columns based on column names
filtered_columns = df[['A']]
print( filtered_columns)
print()
# Filtering columns using the filter method
filtered_columns = df.filter(items=['B'])
print(filtered_columns)

   A
0  1
1  2
2  3
3  4
4  5

   B
0  a
1  b
2  c
3  d
4  e


### Sorting Methods

- **sort_values()**: Use the sort_values() method to sort the DataFrame based on the values in one or more columns.

In [25]:
# Creating a sample DataFrame
data = {'A': [3, 2, 1, 4, 5],
        'B': ['c', 'b', 'a', 'd', 'e']}
df = pd.DataFrame(data)

# Sorting the DataFrame by values in column 'A' (ascending order)
sorted_df = df.sort_values(by='A')
print("Sorted by column 'A' ascending:")
print(sorted_df)
print()

# Sorting the DataFrame by values in column 'A' (descending order)
sorted_df_desc = df.sort_values(by='A', ascending=False)
print("\nSorted by column 'A' descending:")
print(sorted_df_desc)

Sorted by column 'A' ascending:
   A  B
2  1  a
1  2  b
0  3  c
3  4  d
4  5  e


Sorted by column 'A' descending:
   A  B
4  5  e
3  4  d
0  3  c
1  2  b
2  1  a


- Sort the DataFrame considering multiple columns

In [27]:
import pandas as pd

# Creating a sample DataFrame
data = {'A': [3, 2, 1, 2, 5],
        'B': ['c', 'b', 'a', 'd', 'e'],
        'C': [10, 20, 15, 25, 30]}
df = pd.DataFrame(data)

# Sorting the DataFrame by values in column 'A' (ascending) and then by values in column 'B' (descending)
sorted_df = df.sort_values(by=['A', 'B'], ascending=[True, False])
print("Sorted by columns 'A' ascending and 'B' descending:")
print(sorted_df)


Sorted by columns 'A' ascending and 'B' descending:
   A  B   C
2  1  a  15
3  2  d  25
1  2  b  20
0  3  c  10
4  5  e  30


- **sort_index()** : Use the sort_index() method to sort the DataFrame based on its index.

In [26]:
# Sorting the DataFrame by index (ascending order)
sorted_index_df = df.sort_index()
print("\nSorted by index ascending:")
print(sorted_index_df)

# Sorting the DataFrame by index (descending order)
sorted_index_df_desc = df.sort_index(ascending=False)
print("\nSorted by index descending:")
print(sorted_index_df_desc)


Sorted by index ascending:
   A  B
0  3  c
1  2  b
2  1  a
3  4  d
4  5  e

Sorted by index descending:
   A  B
4  5  e
3  4  d
2  1  a
1  2  b
0  3  c


### Detect NA values

- **isna() / isnull()** : These methods return a DataFrame of boolean values indicating whether each element is a missing value or not. You can then use methods like any() or sum() to identify columns with missing values.

In [28]:
import pandas as pd

# Creating a sample DataFrame with missing values
data = {'A': [1, 2, None, 4],
        'B': [5, None, 7, 8]}
df = pd.DataFrame(data)

# Check for missing values in each column
na_columns = df.isna().any()
print(na_columns)


A    True
B    True
dtype: bool


- **info()**: The info() method provides a concise summary of the DataFrame, including the count of non-null values in each column. Columns with missing values will have a lower count than the total number of rows.

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       3 non-null      float64
 1   B       3 non-null      float64
dtypes: float64(2)
memory usage: 192.0 bytes


- **describe()**: The describe() method provides summary statistics for numerical columns, including the count of non-null values. Columns with missing values will have a count lower than the total number of rows.

In [30]:
df.describe()

Unnamed: 0,A,B
count,3.0,3.0
mean,2.333333,6.666667
std,1.527525,1.527525
min,1.0,5.0
25%,1.5,6.0
50%,2.0,7.0
75%,3.0,7.5
max,4.0,8.0


-  **notna() / notnull()**: These methods are the complement of isna() / isnull(), returning the opposite boolean values. You can use them to identify columns without missing values.

In [31]:
# Check for non-missing values in each column
non_na_columns = df.notna().all()
print(non_na_columns)

A    False
B    False
dtype: bool


### Dropping NA values

- **dropna()**: The dropna() method is used to remove rows or columns with missing values from the DataFrame.

In [2]:
import pandas as pd

# Creating a sample DataFrame with missing values
data = {'A': [1, None, 3],
        'B': [4, 5, None]}
df = pd.DataFrame(data)

# Remove rows with any NA values
cleaned_df = df.dropna()
print(cleaned_df)


     A    B
0  1.0  4.0


### Fill Na Values

- **fillna()**: The fillna() method is used to fill missing values with specified values.

    - Filling missing values with a constant:

In [6]:
import pandas as pd

# Creating a sample DataFrame with missing values
data = {'A': [1, None, 3],
        'B': [4, 5, None]}
df = pd.DataFrame(data)

# Fill missing values with a constant (e.g., 0)
filled_df = df.fillna(0)
print(filled_df)


     A    B
0  1.0  4.0
1  0.0  5.0
2  3.0  0.0


    - Filling missing values with the mean of the column:

In [7]:
# Fill missing values with the mean of the column
filled_df = df.fillna(df.mean())
print(filled_df)


     A    B
0  1.0  4.0
1  2.0  5.0
2  3.0  4.5


- **ffill() / bfill()**: The ffill() method fills missing values using the previous valid value (forward fill), while the bfill() method fills missing values using the next valid value (backward fill).

In [8]:
# Forward fill missing values
filled_df_ffill = df.ffill()
print(filled_df_ffill)

# Backward fill missing values
filled_df_bfill = df.bfill()
print(filled_df_bfill)


     A    B
0  1.0  4.0
1  1.0  5.0
2  3.0  5.0
     A    B
0  1.0  4.0
1  3.0  5.0
2  3.0  NaN


- **interpolate()**: The interpolate() method fills missing values by interpolating between existing values.

In [9]:
# Interpolate missing values
interpolated_df = df.interpolate()
print(interpolated_df)

     A    B
0  1.0  4.0
1  2.0  5.0
2  3.0  5.0


### Groupby

In [12]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Alice', 'Bob', 'Charlie', 'David'],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math', 'Math', 'Science', 'Science', 'Science', 'Science'],
    'Grade': [85, 90, 75, 80, 95, 70, 88, 92, 78, 83]
}

df = pd.DataFrame(data)

# Group the DataFrame by 'Subject'
grouped_df = df.groupby('Subject').mean()

print(grouped_df)

         Grade
Subject       
Math     82.50
Science  85.25


- **agg() Method**: Used to compute aggregations (summary statistics) for each group.

In [13]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Alice', 'Bob', 'Charlie', 'David'],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math', 'Math', 'Science', 'Science', 'Science', 'Science'],
    'Grade': [85, 90, 75, 80, 95, 70, 88, 92, 78, 83]
}

df = pd.DataFrame(data)

# Group the DataFrame by 'Subject' and calculate multiple aggregations
aggregated_data = df.groupby('Subject').agg({
    'Grade': ['mean', 'max', 'min']
})

print(aggregated_data)


         Grade        
          mean max min
Subject               
Math     82.50  95  70
Science  85.25  92  78


In [2]:
import pandas as pd
# Create a sample DataFrame
data = {'Category': ['A', 'B', 'A', 'B', 'A'],
        'Value': [10, 20, 15, 25, 30]}
df = pd.DataFrame(data)

# Define a custom aggregation function
def my_agg_func(x):
    return {'Mean': x.mean(), 'Sum': x.sum()}

# Group by 'Category' column and apply custom aggregation function
grouped = df.groupby('Category').agg(my_agg_func)
print(grouped)


                                            Value
Category                                         
A         {'Mean': 18.333333333333332, 'Sum': 55}
B                       {'Mean': 22.5, 'Sum': 45}


In [14]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Alice', 'Bob', 'Charlie', 'David'],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math', 'Math', 'Science', 'Science', 'Science', 'Science'],
    'Gender': ['F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F'],
    'Grade': [85, 90, 75, 80, 95, 70, 88, 92, 78, 83]
}

df = pd.DataFrame(data)

# Group the DataFrame by 'Subject' and 'Gender' and calculate multiple aggregations
aggregated_data = df.groupby(['Subject', 'Gender']).agg({
    'Grade': ['mean', 'max', 'min']
})

print(aggregated_data)


                    Grade        
                     mean max min
Subject Gender                   
Math    F       86.666667  95  80
        M       78.333333  90  70
Science F       85.500000  88  83
        M       85.000000  92  78


- **transform() Method**: Used to perform transformations on each group independently.

In [17]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Alice', 'Bob', 'Charlie', 'David'],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math', 'Math', 'Science', 'Science', 'Science', 'Science'],
    'Gender': ['F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F'],
    'Grade': [85, 90, 75, 80, 95, 70, 88, 92, 78, 83]
}

df = pd.DataFrame(data)

# Define a custom function for standardization
def standardize(x):
    return (x - x.mean()) / x.std()

# Group the DataFrame by 'Subject' and apply the standardization transformation
standardized_grades = df.groupby('Subject')['Grade'].transform(standardize)

# Add the standardized grades as a new column to the original DataFrame
df['Standardized Grade'] = standardized_grades

print(df)


   Student  Subject Gender  Grade  Standardized Grade
0    Alice     Math      F     85            0.267261
1      Bob     Math      M     90            0.801784
2  Charlie     Math      M     75           -0.801784
3    David     Math      F     80           -0.267261
4     Emma     Math      F     95            1.336306
5    Frank     Math      M     70           -1.336306
6    Alice  Science      F     88            0.452607
7      Bob  Science      M     92            1.110945
8  Charlie  Science      M     78           -1.193237
9    David  Science      F     83           -0.370315


- **filter() Method**: Used to filter out groups based on some condition.

In [18]:
import pandas as pd

# Create a sample DataFrame
data = {
    'Student': ['Alice', 'Bob', 'Charlie', 'David', 'Emma', 'Frank', 'Alice', 'Bob', 'Charlie', 'David'],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math', 'Math', 'Science', 'Science', 'Science', 'Science'],
    'Gender': ['F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F'],
    'Grade': [85, 90, 75, 80, 95, 70, 88, 92, 78, 83]
}

df = pd.DataFrame(data)

# Define a custom function to filter groups
def filter_low_average(group):
    return group['Grade'].mean() >= 85

# Group the DataFrame by 'Subject' and filter groups based on average grade
filtered_groups = df.groupby('Subject').filter(filter_low_average)

print(filtered_groups)


   Student  Subject Gender  Grade
6    Alice  Science      F     88
7      Bob  Science      M     92
8  Charlie  Science      M     78
9    David  Science      F     83


### Date Generation

In Pandas, you can generate date sequences or ranges using the pd.date_range() function. This function allows you to create a range of dates based on various parameters, such as start date, end date, frequency, and number of periods.

- Generating a Date Range with Start and End Dates:

In [19]:
import pandas as pd

# Generate a date range from January 1, 2022 to January 10, 2022
date_range = pd.date_range(start='2022-01-01', end='2022-01-10')
print(date_range)


DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10'],
              dtype='datetime64[ns]', freq='D')


- Generating a Date Range with a Specified Frequency (e.g., daily, monthly):

In [20]:
# Generate a date range with daily frequency from January 1, 2022 to January 10, 2022
daily_date_range = pd.date_range(start='2022-01-01', end='2022-01-10', freq='D')
print(daily_date_range)

# Generate a date range with monthly frequency for the year 2022
monthly_date_range = pd.date_range(start='2022-01-01', end='2022-12-31', freq='M')
print(monthly_date_range)


DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31'],
              dtype='datetime64[ns]', freq='M')


- Generating a Date Range with a Specified Number of Periods:

In [21]:
# Generate a date range with 5 periods starting from January 1, 2022
period_date_range = pd.date_range(start='2022-01-01', periods=5)
print(period_date_range)


DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05'],
              dtype='datetime64[ns]', freq='D')


### Date Shifting

Date shifting in Pandas refers to the process of shifting (moving) the dates in a time series forward or backward by a specified number of time periods. This can be useful for various time series analysis tasks, such as comparing data points from different time periods, creating lag or lead variables, or aligning data for further analysis.

Pandas provides the shift() method to perform date shifting on a Series or DataFrame containing datetime indices.

Here's how you can use the shift() method for date shifting:

- **Shifting Dates Forward**:

In [22]:
import pandas as pd

# Create a sample DataFrame with datetime index
dates = pd.date_range(start='2022-01-01', periods=5, freq='D')
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data, index=dates)

# Shift the dates forward by 2 days
shifted_df = df.shift(periods=2)
print(shifted_df)


            Values
2022-01-01     NaN
2022-01-02     NaN
2022-01-03    10.0
2022-01-04    20.0
2022-01-05    30.0


- **Shifting Dates Backward**:

In [24]:
# Shift the dates backward by 1 day
shifted_df = df.shift(periods=-1)
print(shifted_df)


            Values
2022-01-01    20.0
2022-01-02    30.0
2022-01-03    40.0
2022-01-04    50.0
2022-01-05     NaN


### Frequency Conversion Or Resampling :


Frequency conversion in Pandas refers to the process of changing the frequency of a time series to a different frequency. This can involve upsampling (increasing the frequency) or downsampling (decreasing the frequency) of the data. Frequency conversion is commonly used in time series analysis to resample data to a frequency that is more suitable for analysis or visualization.

Pandas provides the resample() method for frequency conversion. This method allows you to specify the new frequency and apply an aggregation function to the data if necessary.

Here's how you can use the resample() method for frequency conversion:

- **Downsampling: Decreasing the Frequency**

In [30]:
import pandas as pd

# Create a sample DataFrame with datetime index
dates = pd.date_range(start='2022-01-01', periods=10, freq='D')
data = {'Values': range(10)}
df = pd.DataFrame(data, index=dates)

# Downsample the data from daily to weekly frequency, taking the mean value for each week
downsampled_df = df.resample('W').mean()
print(downsampled_df)


            Values
2022-01-02     0.5
2022-01-09     5.0
2022-01-16     9.0


- **Upsampling: Increasing the Frequency**

In [32]:
# Upsample the data from daily to hourly frequency, filling missing values with interpolation
upsampled_df = df.resample('H').interpolate()
print(upsampled_df.head(10))

                       Values
2022-01-01 00:00:00  0.000000
2022-01-01 01:00:00  0.041667
2022-01-01 02:00:00  0.083333
2022-01-01 03:00:00  0.125000
2022-01-01 04:00:00  0.166667
2022-01-01 05:00:00  0.208333
2022-01-01 06:00:00  0.250000
2022-01-01 07:00:00  0.291667
2022-01-01 08:00:00  0.333333
2022-01-01 09:00:00  0.375000


### Converting Categorical data to Numerical Data


Pandas provides several methods for converting categorical data to numerical data, which is essential for many machine learning algorithms that only accept numerical inputs. 

Here are some common methods:

- **Label Encoding**: Label encoding assigns a unique integer to each category in a categorical variable. It is suitable for ordinal categorical data where there is an inherent order among categories. Pandas provides the cat.codes attribute to perform label encoding.

In [33]:
import pandas as pd

# Create a DataFrame with a categorical variable
data = {'Category': ['A', 'B', 'C', 'A', 'B', 'C']}
df = pd.DataFrame(data)

# Perform label encoding
df['Category_LabelEncoded'] = df['Category'].astype('category').cat.codes
print(df)


  Category  Category_LabelEncoded
0        A                      0
1        B                      1
2        C                      2
3        A                      0
4        B                      1
5        C                      2


- **One-Hot Encoding**: One-hot encoding creates binary columns for each category in the variable. Each column indicates the presence or absence of a category in an observation. It is suitable for nominal categorical data where there is no inherent order among categories. Pandas provides the get_dummies() function to perform one-hot encoding.

In [34]:
# Perform one-hot encoding
one_hot_encoded_df = pd.get_dummies(df['Category'], prefix='Category_OneHot')
print(one_hot_encoded_df)


   Category_OneHot_A  Category_OneHot_B  Category_OneHot_C
0                  1                  0                  0
1                  0                  1                  0
2                  0                  0                  1
3                  1                  0                  0
4                  0                  1                  0
5                  0                  0                  1


- **Ordinal Encoding**: Ordinal encoding assigns integers to categories based on their order in a predefined list. It is suitable for ordinal categorical data where the categories have a specific order. You can use mapping with a dictionary or custom function to perform ordinal encoding.

In [35]:
# Define a mapping dictionary for ordinal encoding
mapping = {'A': 1, 'B': 2, 'C': 3}

# Perform ordinal encoding
df['Category_OrdinalEncoded'] = df['Category'].map(mapping)
print(df)


  Category  Category_LabelEncoded  Category_OrdinalEncoded
0        A                      0                        1
1        B                      1                        2
2        C                      2                        3
3        A                      0                        1
4        B                      1                        2
5        C                      2                        3


### dtype()

### astype()

In Pandas, the astype() method is used to cast a pandas object (e.g., DataFrame, Series) to a specified data type. 

It allows you to convert the data type of the elements within the object to a different data type.

Here's the syntax of the astype() method:

In [None]:
DataFrame.astype(dtype, copy=True, errors='raise')

Series.astype(dtype, copy=True, errors='raise')


    - dtype: Specifies the data type to which the elements will be cast.
    - copy: (Optional) Indicates whether to return a copy of the object with the new data type. Default is True.
    - errors: (Optional) Specifies how errors should be handled. Possible values are 'raise', 'ignore', and 'coerce'. Default is 'raise'.

- **Changing Data Types**: You can use astype() to change the data type of a DataFrame or Series to another data type. For example, converting numerical data to string or vice versa.

In [36]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Convert integer values in column 'A' to float
df['A'] = df['A'].astype(float)

print(df.dtypes)


A    float64
B      int64
dtype: object
