gfhfh

In [None]:
# Activity 2: Dealing with Duplicates & Redundancy

# Task A: Identifying Duplicate Records

# 7. Identify Complete Duplicates:
# - Load a dataset and identify duplicated rows.
# - Use Pandas to detect duplicates.






# 8. Identify Duplicates based on Specific Columns:
# - Check for duplicates in specified columns.







# 9. Count Duplicate Rows:
# - Calculate and print the number of duplicate rows.







In [None]:
# Task B: Deduplication Techniques

# 10. Remove Complete Duplicates:
# - Drop duplicate rows and keep only the first occurrence.






# 11. Subset Deduplication:
# - Remove duplicates based on a subset of columns.






# 12. Keep Last Occurrence:
# - Drop duplicates but keep the last occurrence in the dataset.







In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
import numpy as np

def _impute_with_mean(df, numeric_cols):
    """Return a copy of ``df`` with each listed column mean-imputed via SimpleImputer."""
    df_imputed = df.copy()
    print("\n--- 4. ML-based Imputation with SimpleImputer ---")
    for col in numeric_cols:
        missing_mask = df_imputed[col].isnull()
        if not missing_mask.any():
            print(f"No missing values in '{col}', SimpleImputer (mean) not applied.")
            continue
        if missing_mask.all():
            # SimpleImputer discards columns with no observed values, so the
            # transformed array would have zero columns and the assignment
            # below would fail with a shape mismatch.
            print(f"All values in '{col}' are missing, SimpleImputer (mean) not applied.")
            continue
        imputer_mean = SimpleImputer(strategy='mean')
        df_imputed[[col]] = imputer_mean.fit_transform(df_imputed[[col]])
        print(f"Missing values in '{col}' imputed using mean.")
    return df_imputed


def _impute_with_regression(df, numeric_cols):
    """Return a copy of ``df`` where each listed column is filled by a linear model
    fitted on the other listed columns."""
    df_imputed = df.copy()
    print("\n--- 5. Imputation using a Regression Model ---")
    for col_to_impute in numeric_cols:
        missing_mask = df_imputed[col_to_impute].isnull()
        if not missing_mask.any():
            print(f"No missing values in '{col_to_impute}', Regression Imputation not applied.")
            continue

        features = [col for col in numeric_cols if col != col_to_impute]
        if not features:
            print(f"Cannot perform regression imputation for '{col_to_impute}' as no other numeric columns available as features.")
            continue

        # Rows where the target is observed train the model; the rest get predictions.
        train_data = df_imputed[~missing_mask]
        test_data = df_imputed[missing_mask]
        if train_data.empty or test_data.empty:
            print(f"Not enough complete or missing data in '{col_to_impute}' for regression imputation.")
            continue

        # A feature with no observed value in the training rows would be dropped
        # by SimpleImputer, breaking the column alignment below, so exclude it.
        features = [col for col in features if train_data[col].notnull().any()]
        if not features:
            print(f"Cannot perform regression imputation for '{col_to_impute}' as no feature column has observed values in the training rows.")
            continue

        # LinearRegression cannot handle NaN, so gaps in the features are filled
        # with the training-row means (learned on train, reused on test).
        feature_imputer = SimpleImputer(strategy='mean')
        X_train = pd.DataFrame(
            feature_imputer.fit_transform(train_data[features]),
            columns=features,
            index=train_data.index,
        )
        y_train = train_data[col_to_impute]
        X_test = pd.DataFrame(
            feature_imputer.transform(test_data[features]),
            columns=features,
            index=test_data.index,
        )

        model = LinearRegression()
        model.fit(X_train, y_train)
        # Columns are processed in order, so values written here are visible as
        # features when later columns are imputed.
        df_imputed.loc[missing_mask, col_to_impute] = model.predict(X_test)
        print(f"Missing values in '{col_to_impute}' imputed using Linear Regression.")
    return df_imputed


def _impute_with_knn(df, numeric_cols, n_neighbors):
    """Return a copy of ``df`` with the listed columns imputed by one KNNImputer
    fitted on all of them jointly."""
    df_imputed = df.copy()
    print("\n--- 6. K-Nearest Neighbors Imputation ---")

    # Columns with no observed values are discarded by KNNImputer, so they can be
    # neither features nor targets.
    knn_features = [col for col in numeric_cols if df_imputed[col].notnull().any()]
    imputable = [col for col in knn_features if df_imputed[col].isnull().any()]

    imputed_by_col = {}
    if imputable:
        # Fit on all usable numeric columns together: neighbours are found from
        # the other columns. Fitting one column at a time leaves no feature to
        # measure distance on, and the imputer falls back to the column mean.
        knn_imputer = KNNImputer(n_neighbors=n_neighbors)
        knn_array = knn_imputer.fit_transform(df_imputed[knn_features])
        imputed_by_col = dict(zip(knn_features, knn_array.T))

    for col in numeric_cols:
        if not df_imputed[col].isnull().any():
            print(f"No missing values in '{col}', KNNImputer not applied.")
        elif col not in imputed_by_col:
            print(f"All values in '{col}' are missing, KNNImputer not applied.")
        else:
            # Only columns that had gaps are written back, so complete columns
            # keep their original dtype.
            df_imputed[col] = imputed_by_col[col]
            print(f"Missing values in '{col}' imputed using KNN (n_neighbors={n_neighbors}).")
    return df_imputed


def perform_predictive_imputation(df, numeric_cols, n_neighbors=5):
    """
    Imputes missing values in the given numeric columns with three separate methods.

    Each method works on its own copy of ``df``; the input frame is not modified.
    Progress messages are printed for every column.

    Args:
        df (pd.DataFrame): The input DataFrame.
        numeric_cols (list): A list of numeric column names to impute.
        n_neighbors (int): Number of neighbours used by the KNN imputer.
            Defaults to 5.

    Returns:
        tuple: ``(df_imputed_simple, df_imputed_regression, df_imputed_knn)``:
            - mean imputation, one column at a time (SimpleImputer);
            - linear-regression imputation, using the other numeric columns
              as features;
            - KNN imputation, fitted jointly on all numeric columns.
        Columns without missing values, and columns with no observed values
        at all, are left unchanged.
    """
    df_imputed_simple = _impute_with_mean(df, numeric_cols)
    df_imputed_regression = _impute_with_regression(df, numeric_cols)
    df_imputed_knn = _impute_with_knn(df, numeric_cols, n_neighbors)
    return df_imputed_simple, df_imputed_regression, df_imputed_knn

# Example usage: a small frame with gaps in every numeric column.
data = {
    'col1': [1, 2, np.nan, 4, np.nan],
    'col2': [np.nan, 7, np.nan, 9, np.nan],
    'col3': ['a', 'b', 'c', 'd', 'e'],
    'col4': [10.5, np.nan, 12.3, 14.7, 15.0],
}
df = pd.DataFrame(data)

# Restrict imputation to the numeric columns; 'col3' holds strings.
numeric_columns = list(df.select_dtypes(include=np.number).columns)

imputation_results = perform_predictive_imputation(df.copy(), numeric_columns)
df_simple_imputed, df_regression_imputed, df_knn_imputed = imputation_results

# Print each imputed frame under its own heading.
print("\n--- Imputed DataFrames ---")
for heading, imputed_frame in (
    ("\nDataFrame after SimpleImputer:", df_simple_imputed),
    ("\nDataFrame after Regression Imputation:", df_regression_imputed),
    ("\nDataFrame after KNN Imputation:", df_knn_imputed),
):
    print(heading)
    print(imputed_frame)


--- 4. ML-based Imputation with SimpleImputer ---
Missing values in 'col1' imputed using mean.
Missing values in 'col2' imputed using mean.
Missing values in 'col4' imputed using mean.

--- 5. Imputation using a Regression Model ---
Missing values in 'col1' imputed using Linear Regression.
Missing values in 'col2' imputed using Linear Regression.
Missing values in 'col4' imputed using Linear Regression.

--- 6. K-Nearest Neighbors Imputation ---
Missing values in 'col1' imputed using KNN (n_neighbors=5).
Missing values in 'col2' imputed using KNN (n_neighbors=5).
Missing values in 'col4' imputed using KNN (n_neighbors=5).

--- Imputed DataFrames ---

DataFrame after SimpleImputer:
       col1  col2 col3    col4
0  1.000000   8.0    a  10.500
1  2.000000   7.0    b  13.125
2  2.333333   8.0    c  12.300
3  4.000000   9.0    d  14.700
4  2.333333   8.0    e  15.000

DataFrame after Regression Imputation:
       col1      col2 col3       col4
0  1.000000  6.000000    a  10.500000
1  2.00