In [5]:
# Question: Winsorization of Data
# Description: Apply Winsorization to a given dataset to handle outliers.



In [6]:
import pandas as pd
import numpy as np
from scipy.stats.mstats import winsorize

def winsorize_to_frame(values, limits, column_name):
    """Winsorize a Series and wrap the result in a one-column DataFrame.

    Parameters
    ----------
    values : pd.Series
        Numeric data to winsorize. It is not modified (winsorize copies
        its input unless ``inplace=True`` is requested).
    limits : tuple of float
        ``(lower, upper)`` fractions of the observations to clip at each end.
        With the default ``inclusive=(True, True)`` the number of clipped
        observations is ``int(fraction * n)`` -- i.e. truncated, not rounded --
        so a small fraction on a short sample may clip nothing at all.
    column_name : str
        Name of the single column in the returned DataFrame.

    Returns
    -------
    tuple
        ``(clipped, frame)`` where ``clipped`` is the masked array returned by
        ``winsorize`` and ``frame`` holds the same values under ``column_name``,
        indexed like ``values``.
    """
    clipped = winsorize(values, limits=limits)
    # Reuse the source index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would misalign rows if the source index were not 0..n-1.
    frame = pd.DataFrame({column_name: clipped}, index=values.index)
    return clipped, frame


# Small sample with two obvious outliers (100 and -5).
data_winsorize = {'Value': [10, 15, 12, 18, 20, 8, 25, 30, 5, 100, -5, 22]}
df_winsorize = pd.DataFrame(data_winsorize)

print("Original DataFrame with outliers:\n", df_winsorize)

# 1. Apply Winsorization
# Symmetric 10% limits. With n = 12 observations, int(0.1 * 12) = 1 value is
# clipped at each end: the smallest value is replaced by the second smallest
# and the largest by the second largest. No percentile interpolation is
# involved -- replacements are always values already present in the data.
winsorized_value, df_winsorized = winsorize_to_frame(
    df_winsorize['Value'], (0.1, 0.1), 'Winsorized_Value'
)

print("\nDataFrame after Winsorization (10% limits):\n", df_winsorized)

# Side-by-side view of the original and winsorized values, sorted by the original.
df_comparison = pd.concat([df_winsorize, df_winsorized], axis=1)
print("\nComparison of Original and Winsorized Values:\n", df_comparison.sort_values(by='Value'))

# 2. Asymmetric limits: 20% lower, 5% upper.
# NOTE: with n = 12, int(0.2 * 12) = 2 values are clipped at the low end, but
# int(0.05 * 12) = 0 values are clipped at the high end, so the outlier 100 is
# left untouched here. A sample needs at least 20 observations before a 5%
# limit clips anything.
winsorized_value_diff_limits, df_winsorized_diff_limits = winsorize_to_frame(
    df_winsorize['Value'], (0.2, 0.05), 'Winsorized_Value_Diff_Limits'
)
df_comparison_diff_limits = pd.concat([df_winsorize, df_winsorized_diff_limits], axis=1)
print("\nComparison with different Winsorization limits (20% lower, 5% upper):\n", df_comparison_diff_limits.sort_values(by='Value'))

Original DataFrame with outliers:
     Value
0      10
1      15
2      12
3      18
4      20
5       8
6      25
7      30
8       5
9     100
10     -5
11     22

DataFrame after Winsorization (10% limits):
     Winsorized_Value
0                 10
1                 15
2                 12
3                 18
4                 20
5                  8
6                 25
7                 30
8                  5
9                 30
10                 5
11                22

Comparison of Original and Winsorized Values:
     Value  Winsorized_Value
10     -5                 5
8       5                 5
5       8                 8
0      10                10
2      12                12
1      15                15
3      18                18
4      20                20
11     22                22
6      25                25
7      30                30
9     100                30

Comparison with different Winsorization limits (20% lower, 5% upper):
     Value  Winsorized_Value_Diff