In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the retail price dataset into the module-level frame `df`.
# NOTE(review): the path looks like a Google Colab Drive mount — confirm the
# runtime environment before running elsewhere.
df = pd.read_csv("/content/drive/MyDrive/retail_price.csv")
# Print the summary statistics produced by DataFrame.describe().
print(df.describe())

class DataQualityChecks:
    """Basic data-quality summaries for a pandas DataFrame.

    The frame is stored by reference (it is not copied), and none of the
    checks modify it.
    """

    def __init__(self, df: pd.DataFrame):
        """Store the frame that the checks will inspect.

        Args:
            df: DataFrame to inspect.
        """
        self.df = df

    def check_missing_values(self) -> pd.Series:
        """Count the null entries in each column.

        Returns:
            Series indexed by column name whose values are the per-column
            sums of ``isnull()``.
        """
        return self.df.isnull().sum()

    def check_duplicates(self) -> int:
        """Count the rows flagged by ``DataFrame.duplicated()``.

        ``duplicated()`` is called with its default arguments, so all
        columns are compared and (per the pandas default ``keep='first'``)
        the first occurrence of a repeated row is not counted.

        Returns:
            Number of duplicate rows as a built-in ``int``.
        """
        # Summing a boolean Series yields a NumPy integer, which is not an
        # instance of ``int``; convert so the declared return type is honoured.
        return int(self.df.duplicated().sum())

    def check_data_types(self) -> pd.Series:
        """Return the dtype of each column.

        Returns:
            The frame's ``dtypes`` Series, indexed by column name.
        """
        return self.df.dtypes

# Run the three data-quality checks against the loaded frame and print each
# result (per-column null counts, duplicate-row count, per-column dtypes).
quality_checker = DataQualityChecks(df)
print("Missing values:\n", quality_checker.check_missing_values())
print("Duplicate rows:", quality_checker.check_duplicates())
print("Data types:\n", quality_checker.check_data_types())

class CorrelationAnalysis:
    """Compute and plot the correlation matrix of a frame's numeric columns."""

    def __init__(self, df: pd.DataFrame):
        """Keep the numeric columns of ``df``; no matrix is computed yet.

        Args:
            df: Source frame. Only columns selected by
                ``select_dtypes(include=[np.number])`` are retained.
        """
        numeric_only = df.select_dtypes(include=[np.number])
        self.df = numeric_only
        # Filled in by calculate_correlation(); None means "not computed".
        self.correlation_matrix = None

    def calculate_correlation(self, method='pearson') -> pd.DataFrame:
        """Compute the pairwise correlation matrix and cache it on the instance.

        Args:
            method: Passed through unchanged to ``DataFrame.corr``.

        Returns:
            The correlation matrix, which is also stored in
            ``self.correlation_matrix``.
        """
        matrix = self.df.corr(method=method)
        self.correlation_matrix = matrix
        return matrix

    def plot_correlation_matrix(self):
        """Draw the cached correlation matrix as an annotated heatmap.

        Raises:
            ValueError: If ``calculate_correlation()`` has not been run yet.
        """
        matrix = self.correlation_matrix
        if matrix is None:
            raise ValueError("Correlation matrix is not calculated. Please run 'calculate_correlation()' first.")

        heatmap_options = {
            "annot": True,
            "fmt": ".2f",
            "cmap": 'coolwarm',
            "square": True,
        }
        plt.figure(figsize=(10, 10))
        sns.heatmap(matrix, **heatmap_options)
        plt.title("Correlation Matrix Heatmap")
        plt.show()

# Compute the correlation matrix of df's numeric columns (default method),
# print it, save it to 'corr_matrix.csv' (relative path, so it is resolved
# against the current working directory), then show it as a heatmap.
corr_analysis = CorrelationAnalysis(df)
corr_matrix = corr_analysis.calculate_correlation()
print(corr_matrix)
corr_matrix.to_csv('corr_matrix.csv')
corr_analysis.plot_correlation_matrix()

# Draw one 30-bin histogram per numeric column, each in its own figure.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    # Open a fresh figure first so each column's histogram is shown separately.
    plt.figure(figsize=(10,6))
    df[col].hist(bins=30, edgecolor='black')
    plt.title(f'Distribution of {col}', fontsize=15)
    plt.xlabel(col, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.show()

class CategoricalDataAnalysis:
    """Frequency counts and bar charts for individual DataFrame columns."""

    def __init__(self, data: pd.DataFrame):
        """Store the frame to analyse (kept by reference, not copied).

        Args:
            data: DataFrame whose columns will be summarised.
        """
        self.data = data

    def count_categories(self, column_name: str) -> pd.Series:
        """Return how often each distinct value occurs in a column.

        Args:
            column_name: Name of the column to summarise.

        Returns:
            The result of ``Series.value_counts()`` (default arguments) for
            that column.
        """
        column = self.data[column_name]
        return column.value_counts()

    def visualize_categories(self, column_name: str, figsize: tuple = (10, 6)):
        """Show a bar chart of the value counts of a column.

        Args:
            column_name: Name of the column to plot.
            figsize: Figure size forwarded to the pandas plot call.
        """
        counts = self.data[column_name].value_counts()
        counts.plot(kind='bar', figsize=figsize)
        plt.title(f'Distribution of {column_name}')
        plt.xlabel('Category')
        plt.ylabel('Count')
        plt.show()