# Advanced Data Operations
### 1. Merging, Joining, and Concatenating DataFrames
Merging, joining, and concatenating DataFrames are essential operations for advanced data analysis.

In [1]:
import pandas as pd

In [2]:
# Creating sample DataFrames
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'key': ['K0', 'K1', 'K2', 'K3']
})

df2 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
    'key': ['K0', 'K1', 'K2', 'K3']
})

# Merging DataFrames on a common key
merged_df = pd.merge(df1, df2, on='key')
merged_df

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K2,C2,D2
3,A3,B3,K3,C3,D3


### 2. Joining DataFrames with Different Keys
Join operations allow combining DataFrames on different columns or indexes.

In [3]:
df3 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'left_key': ['K0', 'K1', 'K2', 'K3']
})

df4 = pd.DataFrame({
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
    'right_key': ['K0', 'K1', 'K2', 'K4']
})

# Performing a join on different keys
joined_df = df3.merge(df4, left_on='left_key', right_on='right_key', how='outer')
joined_df

Unnamed: 0,A,B,left_key,C,D,right_key
0,A0,B0,K0,C0,D0,K0
1,A1,B1,K1,C1,D1,K1
2,A2,B2,K2,C2,D2,K2
3,A3,B3,K3,,,
4,,,,C3,D3,K4


### 3. Working with Time Series Data (Date/Time Indexing)
Pandas allows easy manipulation and indexing of datetime objects.

In [4]:
import numpy as np

# Creating a date range
dates = pd.date_range('20230101', periods=10)

# Creating a DataFrame with the date index
df_time = pd.DataFrame(np.random.randn(10, 4), index=dates, columns=list('ABCD'))
df_time

Unnamed: 0,A,B,C,D
2023-01-01,-1.01568,1.236142,2.461412,-2.05237
2023-01-02,0.116609,0.11686,-0.020305,0.445906
2023-01-03,1.773577,-0.448391,-1.054172,-0.970925
2023-01-04,0.326539,1.552083,-0.107974,-0.628141
2023-01-05,0.965653,0.926,-0.294991,-0.457058
2023-01-06,-0.249184,-0.432646,1.845401,0.395997
2023-01-07,0.561234,-1.276479,-0.857479,-0.193743
2023-01-08,1.419959,-0.808924,0.610362,0.891876
2023-01-09,-1.100796,-1.533393,-0.607381,0.121537
2023-01-10,-0.810395,0.957642,-0.106758,2.154395


### 4. Resampling Time Series Data
Resampling is used to aggregate data over a new time period.

In [5]:
# Resample by week and calculate the sum
weekly_resampled = df_time.resample('W').sum()
weekly_resampled

Unnamed: 0,A,B,C,D
2023-01-01,-1.01568,1.236142,2.461412,-2.05237
2023-01-08,4.914387,-0.371496,0.120842,-0.516087
2023-01-15,-1.911191,-0.575752,-0.714139,2.275932


### 5. Dealing with Outliers
Outliers can be identified and dealt with using statistical methods.

In [6]:
# Detecting outliers using z-score
from scipy import stats

# Generating some random data
data = np.random.normal(0, 1, 100)
data[::10] = np.random.normal(10, 1, 10)  # Injecting outliers

# Using z-score to filter outliers
outliers = np.abs(stats.zscore(data)) > 3
cleaned_data = data[~outliers]

cleaned_data

array([ 8.35188449e+00,  1.05669864e+00, -3.95157737e-01,  3.08291703e-01,
       -2.35716303e+00, -1.49814837e+00, -1.60127695e-01, -1.00717389e+00,
        2.03729664e+00, -5.08229456e-01,  9.88117022e+00,  1.03451542e+00,
       -4.02641825e-01,  8.76049745e-01, -7.59925531e-01,  9.65658633e-01,
       -1.32401792e+00, -8.91567930e-01,  2.83365447e-02,  9.18457702e-01,
        9.14283003e+00, -7.03607172e-01, -2.00889850e+00,  1.69997087e+00,
       -6.02429436e-01, -1.39346843e-01, -7.58627195e-01,  5.51044960e-01,
        9.87653813e-01,  5.27083220e-01, -1.43083883e+00,  9.35588068e-01,
        7.20210566e-01,  7.41447644e-01,  8.09603404e-01,  4.73474593e-01,
       -2.62223310e-01,  9.97555630e-02, -4.10770513e-01,  8.53888359e+00,
        3.02377553e-01, -2.63074420e-01, -9.55861407e-02,  2.22583907e-01,
       -7.24531195e-01,  1.29441730e-01, -1.51036473e+00, -7.86731933e-01,
        5.10337325e-01,  9.95188571e+00,  7.75977212e-01,  5.04984946e-01,
       -7.22171913e-01, -

### 6. MultiIndex DataFrame Manipulations
MultiIndex allows hierarchical indexing in Pandas, useful for complex data.

In [7]:
# Creating a MultiIndex DataFrame
arrays = [
    ['A', 'A', 'B', 'B'],
    [1, 2, 1, 2]
]

index = pd.MultiIndex.from_arrays(arrays, names=('Group', 'Index'))
multi_index_df = pd.DataFrame({'Values': [10, 20, 30, 40]}, index=index)

# Accessing data in MultiIndex
multi_index_df.loc['A']

Unnamed: 0_level_0,Values
Index,Unnamed: 1_level_1
1,10
2,20


### 7. Working with Categorical Data
Pandas has a powerful categorical data type that saves memory and improves performance.

In [8]:
# Creating a categorical column
df_cat = pd.DataFrame({
    'fruits': ['apple', 'banana', 'apple', 'orange', 'banana'],
    'count': [5, 7, 8, 6, 9]
})

# Converting to categorical type
df_cat['fruits'] = df_cat['fruits'].astype('category')

# Displaying the categories
df_cat['fruits'].cat.categories

Index(['apple', 'banana', 'orange'], dtype='object')

### 8. Advanced String Manipulation
Pandas provides rich functionality for string operations.

In [9]:
# Creating a DataFrame with text data
df_text = pd.DataFrame({
    'Name': ['John Doe', 'Jane Smith', 'Mike Johnson', 'Emily Davis']
})

# Splitting names into first and last names
df_text[['First', 'Last']] = df_text['Name'].str.split(' ', expand=True)
df_text

Unnamed: 0,Name,First,Last
0,John Doe,John,Doe
1,Jane Smith,Jane,Smith
2,Mike Johnson,Mike,Johnson
3,Emily Davis,Emily,Davis


### 9. Working with Sparse Data
Sparse data structures in Pandas are useful for efficiently storing data with a large number of missing or default values.

In [10]:
# Creating a sparse DataFrame
df_sparse = pd.DataFrame({
    'A': [1, 0, 0, 3, 0],
    'B': [0, 0, 0, 0, 0]
}).astype(pd.SparseDtype("int", fill_value=0))

# Displaying the sparse DataFrame
df_sparse

Unnamed: 0,A,B
0,1,0
1,0,0
2,0,0
3,3,0
4,0,0
