In [1]:
import pandas as pd
import numpy as np

In [2]:
# Creating a DataFrame with duplicate and outlier values
data = {
    'Category': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'],
    'Value': [10, 20, 15, 25, 30, 5, 1000, 5000]
}
df = pd.DataFrame(data)
print("DataFrame with duplicate and outlier values:")
print(df)

DataFrame with duplicate and outlier values:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [3]:
# Example 1: Removing duplicate rows from the DataFrame
df_no_duplicates = df.drop_duplicates()
print("\nExample 1:")
print("DataFrame after removing duplicate rows:")
print(df_no_duplicates)


Example 1:
DataFrame after removing duplicate rows:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [4]:
# Example 2: Removing duplicates based on a specific column
df_no_duplicates_category = df.drop_duplicates(subset='Category')
print("\nExample 2:")
print("DataFrame after removing duplicate rows based on 'Category' column:")
print(df_no_duplicates_category)


Example 2:
DataFrame after removing duplicate rows based on 'Category' column:
  Category  Value
0        A     10
2        B     15
4        C     30
6        D   1000


In [5]:
# Example 3: Handling outliers using z-score method
z_scores = np.abs((df['Value'] - df['Value'].mean()) / df['Value'].std())
df_no_outliers_zscore = df[z_scores < 3]
print("\nExample 3:")
print("DataFrame after removing outliers using z-score method:")
print(df_no_outliers_zscore)


Example 3:
DataFrame after removing outliers using z-score method:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5
6        D   1000
7        D   5000


In [6]:
# Example 4: Handling outliers using IQR (Interquartile Range) method
Q1 = df['Value'].quantile(0.25)
Q3 = df['Value'].quantile(0.75)
IQR = Q3 - Q1
df_no_outliers_iqr = df[(df['Value'] >= Q1 - 1.5 * IQR) & (df['Value'] <= Q3 + 1.5 * IQR)]
print("\nExample 4:")
print("DataFrame after removing outliers using IQR method:")
print(df_no_outliers_iqr)


Example 4:
DataFrame after removing outliers using IQR method:
  Category  Value
0        A     10
1        A     20
2        B     15
3        B     25
4        C     30
5        C      5


In [8]:
# Example 5: Replacing outlier values with median
median_value = df['Value'].median()
df_replace_outliers_median = df.copy()
df_replace_outliers_median.loc[z_scores >= 3, 'Value'] = median_value
print("\nExample 5:")
print("DataFrame after replacing outlier values with median:")
print(df_replace_outliers_median)


Example 5:
DataFrame after replacing outlier values with median:
  Category   Value
0        A    10.0
1        A    20.0
2        B    15.0
3        B    25.0
4        C    30.0
5        C     5.0
6        D  1000.0
7        D  5000.0


In [9]:
# Example 6: Replacing outlier values with mean
mean_value = df['Value'].mean()
df_replace_outliers_mean = df.copy()
df_replace_outliers_mean.loc[z_scores >= 3, 'Value'] = mean_value
print("\nExample 6:")
print("DataFrame after replacing outlier values with mean:")
print(df_replace_outliers_mean)


Example 6:
DataFrame after replacing outlier values with mean:
  Category   Value
0        A    10.0
1        A    20.0
2        B    15.0
3        B    25.0
4        C    30.0
5        C     5.0
6        D  1000.0
7        D  5000.0


In [10]:
# Example 7: Filling missing values with a specified value
df_with_missing = pd.DataFrame({'A': [1, 2, np.nan, 4]})
df_filled_with_value = df_with_missing.fillna(0)
print("\nExample 7:")
print("DataFrame after filling missing values with 0:")
print(df_filled_with_value)


Example 7:
DataFrame after filling missing values with 0:
     A
0  1.0
1  2.0
2  0.0
3  4.0


In [11]:
# Example 8: Filling missing values with the mean of the column
mean_A = df_with_missing['A'].mean()
df_filled_with_mean = df_with_missing.fillna(mean_A)
print("\nExample 8:")
print("DataFrame after filling missing values with the mean of 'A' column:")
print(df_filled_with_mean)


Example 8:
DataFrame after filling missing values with the mean of 'A' column:
          A
0  1.000000
1  2.000000
2  2.333333
3  4.000000


In [12]:
# Example 9: Filling missing values with the median of the column
median_A = df_with_missing['A'].median()
df_filled_with_median = df_with_missing.fillna(median_A)
print("\nExample 9:")
print("DataFrame after filling missing values with the median of 'A' column:")
print(df_filled_with_median)


Example 9:
DataFrame after filling missing values with the median of 'A' column:
     A
0  1.0
1  2.0
2  2.0
3  4.0


In [13]:
# Example 10: Dropping rows with missing values
df_dropped_missing = df_with_missing.dropna()
print("\nExample 10:")
print("DataFrame after dropping rows with missing values:")
print(df_dropped_missing)


Example 10:
DataFrame after dropping rows with missing values:
     A
0  1.0
1  2.0
3  4.0


In [14]:
# Example 11: Forward filling missing values
df_forward_filled = df_with_missing.ffill()
print("\nExample 11:")
print("DataFrame after forward filling missing values:")
print(df_forward_filled)


Example 11:
DataFrame after forward filling missing values:
     A
0  1.0
1  2.0
2  2.0
3  4.0


In [15]:
# Example 12: Backward filling missing values
df_backward_filled = df_with_missing.bfill()
print("\nExample 12:")
print("DataFrame after backward filling missing values:")
print(df_backward_filled)


Example 12:
DataFrame after backward filling missing values:
     A
0  1.0
1  2.0
2  4.0
3  4.0


In [16]:
# Example 13: Interpolating missing values
df_interpolated = df_with_missing.interpolate()
print("\nExample 13:")
print("DataFrame after interpolating missing values:")
print(df_interpolated)


Example 13:
DataFrame after interpolating missing values:
     A
0  1.0
1  2.0
2  3.0
3  4.0


In [17]:
# Example 14: Dropping rows with NaN values in specific columns
df_with_nan = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, 7, 8]})
df_dropped_nan_column = df_with_nan.dropna(subset=['B'])
print("\nExample 14:")
print("DataFrame after dropping rows with NaN values in 'B' column:")
print(df_dropped_nan_column)


Example 14:
DataFrame after dropping rows with NaN values in 'B' column:
     A    B
0  1.0  5.0
2  NaN  7.0
3  4.0  8.0


In [18]:
# Example 15: Dropping columns with NaN values
df_dropped_nan_column = df_with_nan.dropna(axis=1)
print("\nExample 15:")
print("DataFrame after dropping columns with NaN values:")
print(df_dropped_nan_column)


Example 15:
DataFrame after dropping columns with NaN values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


In [19]:
# Example 16: Replacing missing values with ffill and bfill method
df_filled_with_ffill_bfill = df_with_missing.ffill().bfill()
print("\nExample 16:")
print("DataFrame after filling missing values with ffill and bfill method:")
print(df_filled_with_ffill_bfill)


Example 16:
DataFrame after filling missing values with ffill and bfill method:
     A
0  1.0
1  2.0
2  2.0
3  4.0


In [20]:
# Example 17: Replacing missing values with interpolate method
df_filled_with_interpolate = df_with_missing.interpolate()
print("\nExample 17:")
print("DataFrame after filling missing values with interpolate method:")
print(df_filled_with_interpolate)


Example 17:
DataFrame after filling missing values with interpolate method:
     A
0  1.0
1  2.0
2  3.0
3  4.0


In [21]:
# Example 18: Removing duplicate rows based on multiple columns
df_with_duplicates = pd.DataFrame({'A': [1, 2, 2, 4], 'B': [5, 5, 7, 8]})
df_no_duplicates_multi_cols = df_with_duplicates.drop_duplicates(subset=['A', 'B'])
print("\nExample 18:")
print("DataFrame after removing duplicate rows based on multiple columns:")
print(df_no_duplicates_multi_cols)


Example 18:
DataFrame after removing duplicate rows based on multiple columns:
   A  B
0  1  5
1  2  5
2  2  7
3  4  8


In [22]:
# Example 19: Removing duplicate rows and keeping the last occurrence
df_no_duplicates_last = df_with_duplicates.drop_duplicates(keep='last')
print("\nExample 19:")
print("DataFrame after removing duplicate rows and keeping the last occurrence:")
print(df_no_duplicates_last)


Example 19:
DataFrame after removing duplicate rows and keeping the last occurrence:
   A  B
0  1  5
1  2  5
2  2  7
3  4  8


In [23]:
# Example 20: Dropping duplicate rows based on a specific column
df_no_duplicates_A = df_with_duplicates.drop_duplicates(subset='A')
print("\nExample 20:")
print("DataFrame after removing duplicate rows based on 'A' column:")
print(df_no_duplicates_A)


Example 20:
DataFrame after removing duplicate rows based on 'A' column:
   A  B
0  1  5
1  2  5
3  4  8


In [24]:
# Example 21: Handling outliers using z-score method for multiple columns
df_outliers_multiple_cols = pd.DataFrame({'A': [1, 2, 15, 4], 'B': [1000, 2000, 25, 40]})
z_scores_multi_cols = np.abs((df_outliers_multiple_cols - df_outliers_multiple_cols.mean()) / df_outliers_multiple_cols.std())
df_no_outliers_multi_cols = df_outliers_multiple_cols[(z_scores_multi_cols < 3).all(axis=1)]
print("\nExample 21:")
print("DataFrame after removing outliers using z-score method for multiple columns:")
print(df_no_outliers_multi_cols)


Example 21:
DataFrame after removing outliers using z-score method for multiple columns:
    A     B
0   1  1000
1   2  2000
2  15    25
3   4    40


In [25]:
# Example 22: Handling outliers using IQR (Interquartile Range) method for multiple columns
Q1_multi_cols = df_outliers_multiple_cols.quantile(0.25)
Q3_multi_cols = df_outliers_multiple_cols.quantile(0.75)
IQR_multi_cols = Q3_multi_cols - Q1_multi_cols
df_no_outliers_multi_cols_iqr = df_outliers_multiple_cols[((df_outliers_multiple_cols >= Q1_multi_cols - 1.5 * IQR_multi_cols) &
                                                           (df_outliers_multiple_cols <= Q3_multi_cols + 1.5 * IQR_multi_cols)).all(axis=1)]
print("\nExample 22:")
print("DataFrame after removing outliers using IQR method for multiple columns:")
print(df_no_outliers_multi_cols_iqr)


Example 22:
DataFrame after removing outliers using IQR method for multiple columns:
   A     B
0  1  1000
1  2  2000
3  4    40


In [26]:
# Example 23: Replacing outlier values with median for multiple columns
median_A_multi = df_outliers_multiple_cols['A'].median()
median_B_multi = df_outliers_multiple_cols['B'].median()
df_replace_outliers_median_multi = df_outliers_multiple_cols.copy()
df_replace_outliers_median_multi.loc[z_scores_multi_cols['A'] >= 3, 'A'] = median_A_multi
df_replace_outliers_median_multi.loc[z_scores_multi_cols['B'] >= 3, 'B'] = median_B_multi
print("\nExample 23:")
print("DataFrame after replacing outlier values with median for multiple columns:")
print(df_replace_outliers_median_multi)


Example 23:
DataFrame after replacing outlier values with median for multiple columns:
    A     B
0   1  1000
1   2  2000
2  15    25
3   4    40


In [27]:
# Example 24: Replacing outlier values with mean for multiple columns
mean_A_multi = df_outliers_multiple_cols['A'].mean()
mean_B_multi = df_outliers_multiple_cols['B'].mean()
df_replace_outliers_mean_multi = df_outliers_multiple_cols.copy()
df_replace_outliers_mean_multi.loc[z_scores_multi_cols['A'] >= 3, 'A'] = mean_A_multi
df_replace_outliers_mean_multi.loc[z_scores_multi_cols['B'] >= 3, 'B'] = mean_B_multi
print("\nExample 24:")
print("DataFrame after replacing outlier values with mean for multiple columns:")
print(df_replace_outliers_mean_multi)


Example 24:
DataFrame after replacing outlier values with mean for multiple columns:
      A       B
0   1.0  1000.0
1   2.0  2000.0
2  15.0    25.0
3   4.0    40.0


In [28]:
# Example 25: Filling missing values with 0 for multiple columns
df_with_missing_multi = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, 7, np.nan]})
df_filled_with_value_multi = df_with_missing_multi.fillna(0)
print("\nExample 25:")
print("DataFrame after filling missing values with 0 for multiple columns:")
print(df_filled_with_value_multi)


Example 25:
DataFrame after filling missing values with 0 for multiple columns:
     A    B
0  1.0  5.0
1  2.0  0.0
2  0.0  7.0
3  4.0  0.0
