In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a DataFrame with duplicate and outlier values
data = {
    'Category': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
    'Value': [10, 20, 15, 25, 30, 5, 1000, 5000]
}
df = pd.DataFrame(data)
print("DataFrame with duplicate and outlier values:")
print(df)

DataFrame with duplicate and outlier values:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [3]:
# Example 1: Removing duplicate rows from the DataFrame
df_no_duplicates = df.drop_duplicates()
print("\nExample 1:")
print("DataFrame after removing duplicate rows:")
print(df_no_duplicates)


Example 1:
DataFrame after removing duplicate rows:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [4]:
# Example 2: Removing duplicates based on a specific column
df_no_duplicates_category = df.drop_duplicates(subset='Category')
print("\nExample 2:")
print("DataFrame after removing duplicate rows based on 'Category' column:")
print(df_no_duplicates_category)


Example 2:
DataFrame after removing duplicate rows based on 'Category' column:
  Category  Value
0        A     10
2        B     15
4        C     30
6        D   1000


In [5]:
# Example 3: Handling outliers using z-score method
z_scores = np.abs((df['Value'] - df['Value'].mean()) / df['Value'].std())
df_no_outliers_zscore = df[z_scores < 3]
print("\nExample 3:")
print("DataFrame after removing outliers using z-score method:")
print(df_no_outliers_zscore)


Example 3:
DataFrame after removing outliers using z-score method:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [6]:
# Example 4: Handling outliers using IQR (Interquartile Range) method
Q1 = df['Value'].quantile(0.25)
Q3 = df['Value'].quantile(0.75)
IQR = Q3 - Q1
df_no_outliers_iqr = df[(df['Value'] >= Q1 - 1.5 * IQR) & (df['Value'] <= Q3 + 1.5 * IQR)]
print("\nExample 4:")
print("DataFrame after removing outliers using IQR method:")
print(df_no_outliers_iqr)


Example 4:
DataFrame after removing outliers using IQR method:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5


In [8]:
# Example 5: Replacing outlier values with median
median_value = df['Value'].median()
df_replace_outliers_median = df.copy()
df_replace_outliers_median.loc[z_scores >= 3, 'Value'] = median_value
print("\nExample 5:")
print("DataFrame after replacing outlier values with median:")
print(df_replace_outliers_median)


Example 5:
DataFrame after replacing outlier values with median:
  Category   Value
0        A    10.0
1        A    20.0
2        B    15.0
3        B    25.0
4        C    30.0
5        C     5.0
6        D  1000.0
7        D  5000.0


In [9]:
# Example 6: Replacing outlier values with mean
mean_value = df['Value'].mean()
df_replace_outliers_mean = df.copy()
df_replace_outliers_mean.loc[z_scores >= 3, 'Value'] = mean_value
print("\nExample 6:")
print("DataFrame after replacing outlier values with mean:")
print(df_replace_outliers_mean)


Example 6:
DataFrame after replacing outlier values with mean:
  Category   Value
0        A    10.0
1        A    20.0
2        B    15.0
3        B    25.0
4        C    30.0
5        C     5.0
6        D  1000.0
7        D  5000.0


In [10]:
# Example 7: Filling missing values with a specified value
df_with_missing = pd.DataFrame({'A': [1, 2, np.nan, 4]})
df_filled_with_value = df_with_missing.fillna(0)
print("\nExample 7:")
print("DataFrame after filling missing values with 0:")
print(df_filled_with_value)


Example 7:
DataFrame after filling missing values with 0:
     A
0  1.0
1  2.0
2  0.0
3  4.0


In [11]:
# Example 8: Filling missing values with the mean of the column
mean_A = df_with_missing['A'].mean()
df_filled_with_mean = df_with_missing.fillna(mean_A)
print("\nExample 8:")
print("DataFrame after filling missing values with the mean of 'A' column:")
print(df_filled_with_mean)


Example 8:
DataFrame after filling missing values with the mean of 'A' column:
          A
0  1.000000
1  2.000000
2  2.333333
3  4.000000


In [12]:
# Example 9: Filling missing values with the median of the column
median_A = df_with_missing['A'].median()
df_filled_with_median = df_with_missing.fillna(median_A)
print("\nExample 9:")
print("DataFrame after filling missing values with the median of 'A' column:")
print(df_filled_with_median)


Example 9:
DataFrame after filling missing values with the median of 'A' column:
     A
0  1.0
1  2.0
2  2.0
3  4.0


In [13]:
# Example 10: Dropping rows with missing values
df_dropped_missing = df_with_missing.dropna()
print("\nExample 10:")
print("DataFrame after dropping rows with missing values:")
print(df_dropped_missing)


Example 10:
DataFrame after dropping rows with missing values:
     A
0  1.0
1  2.0
3  4.0
