In [20]:
import pandas as pd
import numpy as np



### Advanced Indexing

In [21]:

# Creating a sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Emma'],
    'Age': [25, 30, 35, 28, 40],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'Salary': [50000, 60000, 70000, 55000, 75000],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'Marketing'],
    'Experience': [3, 5, 7, 4, 6]
}
df = pd.DataFrame(data)


df

# Boolean indexing
subset = df[df['Age'] > 18]

# Selecting rows and columns by label
subset = df.loc[df['Age'] > 18, ['Name', 'Age']]

# Selecting rows and columns by integer position
subset = df.iloc[2:5, 0:2]

# Fast scalar value access
value = df.at[0, 'Age']


### Handling Time Series Data

In [22]:
# Creating a sample DataFrame
data = {
    'Date': pd.date_range('2024-01-01', periods=10),
    'Value': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
df = pd.DataFrame(data)

# Shifting data
df['Shifted_Value'] = df['Value'].shift(1)

# Rolling window calculations
df['Rolling_Mean'] = df['Value'].rolling(window=3).mean()

# Printing the DataFrame
print("DataFrame with Shifted and Rolling Mean columns:")
print(df)

DataFrame with Shifted and Rolling Mean columns:
        Date  Value  Shifted_Value  Rolling_Mean
0 2024-01-01      1            NaN           NaN
1 2024-01-02      2            1.0           NaN
2 2024-01-03      3            2.0           2.0
3 2024-01-04      4            3.0           3.0
4 2024-01-05      5            4.0           4.0
5 2024-01-06      6            5.0           5.0
6 2024-01-07      7            6.0           6.0
7 2024-01-08      8            7.0           7.0
8 2024-01-09      9            8.0           8.0
9 2024-01-10     10            9.0           9.0


### Reshaping Dat

In [23]:
import pandas as pd
import numpy as np

# Creating a sample DataFrame
data = {
    'Index_Column': ['A', 'A', 'B', 'B'],
    'Column_to_Pivot': ['X', 'Y', 'X', 'Y'],
    'Value_Column': [1, 2, 3, 4]
}
df = pd.DataFrame(data)

# Pivot table
pivot_table = df.pivot_table(index='Index_Column', columns='Column_to_Pivot', values='Value_Column', aggfunc='mean')

# Melting
melted_df = pd.melt(df, id_vars=['Index_Column'], value_vars=['Column_to_Pivot', 'Value_Column'], var_name='Variable', value_name='Value')

# Stacking and unstacking
stacked = df.set_index(['Index_Column', 'Column_to_Pivot']).stack()
unstacked = df.set_index(['Index_Column', 'Column_to_Pivot']).unstack()

# Printing the results
print("Pivot Table:")
print(pivot_table)

print("\nMelted DataFrame:")
print(melted_df)

print("\nStacked DataFrame:")
print(stacked)

print("\nUnstacked DataFrame:")
print(unstacked)


Pivot Table:
Column_to_Pivot    X    Y
Index_Column             
A                1.0  2.0
B                3.0  4.0

Melted DataFrame:
  Index_Column         Variable Value
0            A  Column_to_Pivot     X
1            A  Column_to_Pivot     Y
2            B  Column_to_Pivot     X
3            B  Column_to_Pivot     Y
4            A     Value_Column     1
5            A     Value_Column     2
6            B     Value_Column     3
7            B     Value_Column     4

Stacked DataFrame:
Index_Column  Column_to_Pivot              
A             X                Value_Column    1
              Y                Value_Column    2
B             X                Value_Column    3
              Y                Value_Column    4
dtype: int64

Unstacked DataFrame:
                Value_Column   
Column_to_Pivot            X  Y
Index_Column                   
A                          1  2
B                          3  4


### Custom Functions

In [24]:
# Define a custom function
def custom_function(x):
    return x ** 2

# Creating a sample DataFrame
data = {
    'Column': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)

# Apply function to column
df['New_Column'] = df['Column'].apply(lambda x: custom_function(x))

# Apply function element-wise
df = df.applymap(lambda x: custom_function(x))

# Printing the DataFrame
print("DataFrame with New_Column and element-wise custom function applied:")
print(df)

DataFrame with New_Column and element-wise custom function applied:
   Column  New_Column
0       1           1
1       4          16
2       9          81
3      16         256
4      25         625


  df = df.applymap(lambda x: custom_function(x))


### Categorical Data Handling

In [25]:
# Creating a sample DataFrame
data = {
    'Categorical_Column': ['A', 'B', 'A', 'C', 'B', 'A', 'A', 'C']
}
df = pd.DataFrame(data)

# Converting column to categorical type
df['Categorical_Column'] = df['Categorical_Column'].astype('category')

# Using categorical data for better memory usage and speed
value_counts = df['Categorical_Column'].value_counts()

# Printing the value counts
print("Value counts of Categorical_Column:")
print(value_counts)

Value counts of Categorical_Column:
Categorical_Column
A    4
B    2
C    2
Name: count, dtype: int64


### Working with Large Data

In [26]:
# Define a function to process each chunk
def process(chunk):
    # Add your processing logic here
    print("Processing chunk with shape:", chunk.shape)

# Reading data in chunks
chunk_iterator = pd.read_csv('data.csv', chunksize=1000)
for chunk in chunk_iterator:
    process(chunk)

Processing chunk with shape: (5, 4)


### Method Chaining

In [27]:
# Creating a sample DataFrame
data = {
    'Column1': [1, 2, 3, -1, -2, -3, 4, 5],
    'Column2': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Value1': [10, 20, 30, 40, 50, 60, 70, 80],
    'Value2': [100, 200, 300, 400, 500, 600, 700, 800]
}
df = pd.DataFrame(data)

# Perform the query, group by 'Column2', and calculate the mean
result = df.query('Column1 > 0').groupby('Column2').mean()

# Print the result
print("Mean values for each group in 'Column2' after filtering 'Column1' values > 0:")
print(result)

Mean values for each group in 'Column2' after filtering 'Column1' values > 0:
          Column1     Value1      Value2
Column2                                 
A        2.666667  36.666667  366.666667
B        3.500000  50.000000  500.000000
