## AI & Machine Learning for Data Quality
**Description**: AI and machine learning can automate and enhance data quality checks by learning patterns and identifying anomalies more effectively than static rules.

**Task 1**: Training a model to predict and flag unusual trend patterns in sales data that
deviate from historical norms.

In [None]:
# write your code from here

**Task 2**: Using clustering algorithms to detect duplicate records where entries are not
exactly identical.

In [None]:
# write your code from here

**Task 3**: Implementing classification models to validate data based on learned
characteristics from labeled datasets.

In [None]:
# write your code from here


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings

warnings.filterwarnings('ignore')


# --- MODULES / FUNCTIONS --- #

def detect_sales_anomalies(df, column='sales', contamination=0.05):
    try:
        model = IsolationForest(contamination=contamination, random_state=42)
        df['anomaly'] = model.fit_predict(df[[column]])
        df['is_anomaly'] = df['anomaly'] == -1
        return df
    except Exception as e:
        print("Error during anomaly detection:", e)
        return df


def plot_anomalies(df):
    try:
        plt.figure(figsize=(12, 5))
        plt.plot(df['date'], df['sales'], label='Sales')
        plt.scatter(df[df['is_anomaly']]['date'], df[df['is_anomaly']]['sales'], color='red', label='Anomaly')
        plt.title("Sales Data with Anomaly Detection")
        plt.xlabel("Date")
        plt.ylabel("Sales")
        plt.legend()
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print("Error during plotting:", e)


def cluster_similar_names(data):
    try:
        tfidf = TfidfVectorizer()
        name_vectors = tfidf.fit_transform(data['name'])
        kmeans = KMeans(n_clusters=3, random_state=42)
        clusters = kmeans.fit_predict(name_vectors)
        data['cluster'] = clusters
        return data
    except Exception as e:
        print("Error during clustering:", e)
        return data


def classify_data_quality(df):
    try:
        X = df.drop('label', axis=1)
        y = df['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
        print("\nClassification Report:\n", classification_report(y_test, y_pred))
        sns.barplot(x=clf.feature_importances_, y=X.columns)
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.show()
        return clf
    except Exception as e:
        print("Error during model training or evaluation:", e)
        return None


# --- MAIN EXECUTION --- #

def main():
    # 1. Anomaly Detection in Sales Data
    np.random.seed(42)
    dates = pd.date_range(start='2022-01-01', periods=180)
    sales = np.random.normal(loc=200, scale=20, size=len(dates))
    sales[::30] += np.random.normal(100, 30, 6)
    df_sales = pd.DataFrame({'date': dates, 'sales': sales})
    df_sales = detect_sales_anomalies(df_sales)
    plot_anomalies(df_sales)

    # 2. Duplicate Detection Using Clustering
    data = pd.DataFrame({
        'name': ['Jon Doe', 'John Doe', 'J. Doe', 'Jane Smith', 'Janet Smith', 'J. Smith', 'Alice Brown', 'Alicia Brown'],
        'email': ['jon@example.com', 'john@example.com', 'jd@example.com',
                  'jane@example.com', 'janet@example.com', 'js@example.com',
                  'aliceb@example.com', 'alicia@example.com']
    })
    clustered_data = cluster_similar_names(data)
    print("\nPotential Duplicate Groups:")
    for c in clustered_data['cluster'].unique():
        print(f"\nCluster {c}:\n", clustered_data[clustered_data['cluster'] == c])

    # 3. Data Quality Classification
    df_quality = pd.DataFrame({
        'null_ratio': np.random.uniform(0, 1, 500),
        'outlier_count': np.random.randint(0, 5, 500),
        'type_mismatch': np.random.randint(0, 2, 500),
        'duplicate_flag': np.random.randint(0, 2, 500),
        'label': np.random.choice([0, 1], 500, p=[0.7, 0.3])  # 0: valid, 1: invalid
    })
    _ = classify_data_quality(df_quality)


# --- UNIT TESTS --- #

def run_tests():
    # Test 1: Anomaly Detection Output
    test_df = pd.DataFrame({'sales': [100, 150, 200, 1000, 130, 120]})
    result = detect_sales_anomalies(test_df.copy())
    assert 'is_anomaly' in result.columns, "Missing 'is_anomaly' column"

    # Test 2: Clustering Output
    test_data = pd.DataFrame({'name': ['A', 'B'], 'email': ['a@a.com', 'b@b.com']})
    clustered = cluster_similar_names(test_data.copy())
    assert 'cluster' in clustered.columns, "Missing 'cluster' column"

    # Test 3: Classifier doesn't return None
    mock_df = pd.DataFrame({
        'null_ratio': [0.1, 0.2],
        'outlier_count': [1, 2],
        'type_mismatch': [0, 1],
        'duplicate_flag': [1, 0],
        'label': [0, 1]
    })
    model = classify_data_quality(mock_df)
    assert model is not None, "Classifier returned None"

    print("✅ All tests passed!")


if __name__ == "__main__":
    run_tests()
    main()


Error during clustering: empty vocabulary; perhaps the documents only contain stop words


AssertionError: Missing 'cluster' column