In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class CSVDataAnalyzer:
    """Run basic exploratory checks and plots on a pandas DataFrame.

    The analyzer keeps two frames:

    * ``df`` - the working frame; the ``remove=True`` options rebind it to a
      reduced frame (the caller's object is never modified in place).
    * ``original_df`` - a copy taken at construction time, used only to
      report the original shape.
    """

    # Number of subplot columns used by ``plot_distributions``.
    _GRID_COLUMNS = 3

    def __init__(self, dataframe):
        """Store *dataframe* as the working frame and keep a pristine copy."""
        self.df = dataframe
        self.original_df = dataframe.copy()

    def missing_values_analysis(self):
        """Return ``{column label: number of missing values}``.

        Only columns with at least one missing value are included, in column
        order. Counts are plain ``int`` values.
        """
        null_counts = self.df.isnull().sum()
        return {
            col: int(count)
            for col, count in null_counts.items()
            if count > 0
        }

    def categorize_columns(self):
        """Group column labels by dtype family.

        Returns:
            dict with the keys ``'Numeric'`` (int64/float64), ``'Categorical'``
            (object/category) and ``'DateTime'`` (datetime64), each mapping to
            a list of column labels.
        """
        return {
            'Numeric': self.df.select_dtypes(include=['int64', 'float64']).columns.tolist(),
            'Categorical': self.df.select_dtypes(include=['object', 'category']).columns.tolist(),
            'DateTime': self.df.select_dtypes(include=['datetime64']).columns.tolist()
        }

    def duplicate_column_analysis(self, remove=False):
        """Find columns whose values repeat those of an earlier column.

        Args:
            remove: when true, drop the detected duplicate columns from the
                working frame (the first occurrence is kept).

        Returns:
            dict with ``'duplicate_columns'`` (labels of the repeated
            columns), ``'original_shape'`` and ``'new_shape'``.
        """
        if self.df.empty:
            # With no rows or no columns there is no content to compare, and
            # ``df.T.duplicated()`` would return a zero-length mask that does
            # not line up with the column index.
            is_duplicate = np.zeros(self.df.shape[1], dtype=bool)
        else:
            is_duplicate = self.df.T.duplicated().to_numpy()

        duplicate_cols = self.df.columns[is_duplicate].tolist()

        if remove and duplicate_cols:
            # Remove with the same content-based mask used for detection.
            # Filtering on duplicated column *names* would leave
            # identically-valued columns with different names in place.
            self.df = self.df.loc[:, ~is_duplicate]

        return {
            'duplicate_columns': duplicate_cols,
            'original_shape': self.original_df.shape,
            'new_shape': self.df.shape
        }

    def constant_column_analysis(self, remove=False):
        """Find columns holding at most one distinct non-null value.

        Args:
            remove: when true, drop the detected columns from the working
                frame.

        Returns:
            dict with ``'constant_columns'``, ``'original_shape'`` and
            ``'new_shape'``.
        """
        # Positional access keeps this working when column labels repeat
        # (``self.df[label]`` would then return a DataFrame, not a Series).
        is_constant = np.array(
            [
                self.df.iloc[:, position].nunique() <= 1
                for position in range(self.df.shape[1])
            ],
            dtype=bool,
        )
        constant_cols = self.df.columns[is_constant].tolist()

        if remove and constant_cols:
            self.df = self.df.loc[:, ~is_constant]

        return {
            'constant_columns': constant_cols,
            'original_shape': self.original_df.shape,
            'new_shape': self.df.shape
        }

    def plot_outliers(self, save_path='outliers_boxplot.png'):
        """Save a box plot of every int64/float64 column to *save_path*.

        Nothing is drawn or written when the frame has no such columns.
        """
        numeric_frame = self.df.select_dtypes(include=['int64', 'float64'])
        if numeric_frame.shape[1] == 0:
            # DataFrame.boxplot raises when there is nothing numeric to plot.
            return

        fig = plt.figure(figsize=(15, 6))
        try:
            numeric_frame.boxplot()
            plt.title('Outliers in Numeric Columns')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(save_path)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

    def plot_distributions(self, num_cols=6, save_path='distributions.png'):
        """Save one distribution plot per column for the first *num_cols* columns.

        Numeric columns get a histogram with a KDE overlay; all other columns
        get a bar chart of value counts. The grid is three plots wide and has
        as many rows as needed (at least two, so the default layout is the
        2x3 grid).

        Args:
            num_cols: how many leading columns to plot.
            save_path: file the figure is written to.
        """
        columns = self.df.columns[:num_cols]
        grid_cols = self._GRID_COLUMNS
        # Ceiling division without importing math; never fewer than 2 rows.
        grid_rows = max(2, -(-len(columns) // grid_cols))

        fig = plt.figure(figsize=(15, 5 * grid_rows))
        try:
            for position, col in enumerate(columns):
                # ``columns`` is a leading slice, so its positions match the
                # frame's; positional access is safe with repeated labels.
                series = self.df.iloc[:, position]
                plt.subplot(grid_rows, grid_cols, position + 1)
                if pd.api.types.is_numeric_dtype(series):
                    sns.histplot(series, kde=True)
                else:
                    series.value_counts().plot(kind='bar')
                plt.title(f'Distribution of {col}')
                plt.xticks(rotation=45)

            plt.tight_layout()
            plt.savefig(save_path)
        finally:
            plt.close(fig)

    def generate_comprehensive_analysis(self):
        """Print every tabular analysis, then save both plots.

        The duplicate and constant column checks run with ``remove=False``,
        so the working frame is left unchanged.
        """
        print("Missing Values Analysis:")
        print(self.missing_values_analysis())
        print("\nColumn Categorization:")
        print(self.categorize_columns())
        print("\nDuplicate Column Analysis:")
        print(self.duplicate_column_analysis())
        print("\nConstant Column Analysis:")
        print(self.constant_column_analysis())

        self.plot_outliers()
        self.plot_distributions()


# Load the workbook from the current working directory into a DataFrame.
df = pd.read_excel("data.xlsx")
# Wrap the frame in the analyzer; it keeps its own copy for shape reporting.
analyzer = CSVDataAnalyzer(df)
# Print the tabular analyses and save the outlier/distribution plots.
analyzer.generate_comprehensive_analysis()

Missing Values Analysis:
{'STATE': 59, 'TENURE_IN_MONTHS': 66, 'CLOSESTSTOREDISTANCE': 1545, 'AGE': 5945, 'INCOME': 3259, 'LENGTH OF RESIDENCE': 2872, 'NUMBER OF PERSONS IN LIVING UNIT': 2872, 'NUMBER OF ADULTS IN LIVING UNIT': 2872, 'MOSAIC': 1868, 'CAPE: AGE: POP: MEDIAN AGE': 2872, 'CAPE: AGE: POP: % 0-17': 2872, 'CAPE: AGE: POP: % 18-99+': 2872, 'CAPE: AGE: POP: % 65-99+': 2872, 'CAPE: ETHNIC: POP: % WHITE ONLY': 2872, 'CAPE: ETHNIC: POP: % BLACK ONLY': 2872, 'CAPE: ETHNIC: POP: % ASIAN ONLY': 2872, 'CAPE: ETHNIC: POP: % HISPANIC': 2872, 'CAPE: DENSITY: PERSONS PER HH FOR POP IN HH': 2872, 'CAPE: HHSIZE: HH: AVERAGE HOUSEHOLD SIZE': 2872, 'CAPE: TYP: HH: % MARRIED COUPLE FAMILY': 2872, 'CAPE: CHILD: HH: % WITH PERSONS LT18': 2872, 'CAPE: CHILD: HH: % MARR COUPLE FAMW- PERSONS LT18': 2872, 'CAPE: CHILD: HH: % MARR COUPLE FAMW-O PERSONS LT18': 2872, 'CAPE: LANG: HH: % SPANISH SPEAKING': 2872, 'CAPE: EDUC: POP25+: MEDIAN EDUCATION ATTAINED': 2872, 'CAPE: HOMVAL: OOHU: MEDIAN HOME VALU