<a href="https://colab.research.google.com/github/sudhan670/Acadia/blob/main/Acadia_DataScience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy seaborn matplotlib plotly reportlab

Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 28.2 MB/s eta 0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.2.5


In [18]:
!pip install ydata-profiling




In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from fpdf import FPDF
from ydata_profiling import ProfileReport
import os

# Define the main analysis class
class DataAnalyzer:
    """Load a tabular dataset and run basic exploratory checks on it.

    The loaded frame is stored on ``self.data``. Note that
    ``list_duplicate_columns`` and ``list_constant_columns`` replace
    ``self.data`` with a reduced frame, so column lists obtained before
    calling them may refer to columns that no longer exist.
    """

    def __init__(self, file_path):
        """Load the dataset found at ``file_path`` into ``self.data``."""
        self.data = self.load_data(file_path)

    @staticmethod
    def _safe_filename(name):
        """Return ``name`` as text with path separators replaced by ``_``.

        Column names are embedded in image file names; a ``/`` or ``\\`` in
        a column name would otherwise point outside the target directory.
        """
        return str(name).replace('/', '_').replace('\\', '_')

    def load_data(self, file_path):
        """Load data from a CSV or Excel file.

        The extension check is case-insensitive; ``.csv`` is read with
        ``pd.read_csv`` and ``.xlsx``/``.xls`` with ``pd.read_excel``.

        Raises:
            ValueError: if the extension is not one of the supported ones.
            Exception: any error raised by the pandas reader is printed and
                re-raised unchanged.
        """
        try:
            lowered = os.fspath(file_path).lower()
            if lowered.endswith('.csv'):
                return pd.read_csv(file_path)
            if lowered.endswith(('.xlsx', '.xls')):
                return pd.read_excel(file_path)
            raise ValueError("Unsupported file format. Use CSV or Excel.")
        except Exception as e:
            print(f"Error loading file: {e}")
            raise

    def list_missing_values(self):
        """Return the per-column null counts, restricted to columns with nulls."""
        missing = self.data.isnull().sum()
        return missing[missing > 0]

    def categorize_columns(self):
        """Group column names by dtype.

        Returns:
            dict with keys ``'numeric'``, ``'categorical'`` (object dtype)
            and ``'datetime'``, each mapping to a list of column names.
        """
        numeric = self.data.select_dtypes(include=['number']).columns.tolist()
        categorical = self.data.select_dtypes(include=['object']).columns.tolist()
        datetime_cols = self.data.select_dtypes(include=['datetime']).columns.tolist()
        return {'numeric': numeric, 'categorical': categorical, 'datetime': datetime_cols}

    def list_duplicate_columns(self):
        """List and remove duplicate columns, keeping the first occurrence.

        A column counts as a duplicate when its values repeat an earlier
        column or when its name repeats an earlier column name. The same
        mask is used for reporting and for removal, so every reported
        column is actually dropped from ``self.data``.

        Returns:
            tuple ``(duplicates, before, after)``: the labels of the removed
            columns and the frame shape before and after removal.
        """
        same_content = self.data.T.duplicated().to_numpy()
        same_name = self.data.columns.duplicated()
        drop_mask = same_content | same_name
        duplicates = self.data.columns[drop_mask]
        before = self.data.shape
        self.data = self.data.loc[:, ~drop_mask]
        after = self.data.shape
        return duplicates, before, after

    def list_constant_columns(self):
        """List and remove constant columns.

        A column is constant when it has exactly one distinct non-null
        value (``nunique()`` ignores nulls by default).

        Returns:
            tuple ``(constant_cols, before, after)``: the removed column
            names and the frame shape before and after removal.
        """
        constant_cols = [col for col in self.data.columns if self.data[col].nunique() == 1]
        before = self.data.shape
        self.data = self.data.drop(columns=constant_cols)
        after = self.data.shape
        return constant_cols, before, after

    def visualize_outliers(self, save_path):
        """Save one box plot per numeric column as ``boxplot_<col>.png``.

        Files are written into the existing directory ``save_path``.
        """
        numeric_cols = self.categorize_columns()['numeric']
        for col in numeric_cols:
            fig = plt.figure(figsize=(10, 6))
            try:
                sns.boxplot(x=self.data[col])
                plt.title(f'Boxplot of {col}')
                file_name = f"boxplot_{self._safe_filename(col)}.png"
                plt.savefig(os.path.join(save_path, file_name))
            finally:
                # Close even when plotting or saving fails so figures do not pile up.
                plt.close(fig)

    def visualize_distributions(self, save_path, columns):
        """Save one distribution chart per column as ``distribution_<col>.png``.

        Numeric columns of any width get a histogram with a KDE overlay;
        everything else (including booleans) gets a count plot.
        """
        for col in columns:
            series = self.data[col]
            is_numeric = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            fig = plt.figure(figsize=(10, 6))
            try:
                if is_numeric:
                    sns.histplot(series, kde=True)
                else:
                    sns.countplot(y=series)
                plt.title(f'Distribution of {col}')
                file_name = f"distribution_{self._safe_filename(col)}.png"
                plt.savefig(os.path.join(save_path, file_name))
            finally:
                # Close even when plotting or saving fails so figures do not pile up.
                plt.close(fig)

    def generate_report(self, output_path):
        """Generate a profiling report with ydata-profiling and write it to ``output_path``."""
        profile = ProfileReport(self.data, title="Data Analysis Report", explorative=True)
        profile.to_file(output_path)

    def save_as_pdf(self, output_path, images_path):
        """Save analysis results as a PDF report.

        The first page carries the title; every ``.png`` file found in
        ``images_path`` is then placed on its own page. Files are added in
        sorted name order so the page order is reproducible.
        """
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)
        pdf.cell(200, 10, txt="Data Analysis Report", ln=True, align='C')

        # os.listdir returns entries in arbitrary order; sort for stable output.
        for img_file in sorted(os.listdir(images_path)):
            if img_file.endswith('.png'):
                pdf.add_page()
                pdf.image(os.path.join(images_path, img_file), x=10, y=20, w=190)

        pdf.output(output_path)

# Example usage
def main():
    """Run the full analysis interactively and write reports to ``output/``.

    Prompts for a dataset path, prints missing-value counts and column
    categories, removes duplicate and constant columns, saves box plots and
    distribution charts under ``output/images``, then writes an HTML
    profiling report and a PDF of the saved images.
    """
    file_path = input("Enter the path to your dataset (CSV/Excel): ")
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    analyzer = DataAnalyzer(file_path)

    # Missing values
    missing_values = analyzer.list_missing_values()
    print("Missing Values:")
    print(missing_values)

    # Categorize columns
    column_categories = analyzer.categorize_columns()
    print("Column Categories:")
    print(column_categories)

    # Duplicates
    duplicates, before, after = analyzer.list_duplicate_columns()
    print(f"Duplicate Columns: {duplicates}")
    print(f"Shape before: {before}, Shape after: {after}")

    # Constants
    constants, before, after = analyzer.list_constant_columns()
    print(f"Constant Columns: {constants}")
    print(f"Shape before: {before}, Shape after: {after}")

    # Visualizations
    images_path = os.path.join(output_dir, "images")
    os.makedirs(images_path, exist_ok=True)

    analyzer.visualize_outliers(images_path)
    # The two cleanup steps above may have dropped columns, so the categories
    # printed earlier can name columns that no longer exist. Recompute the
    # numeric list from the current frame before plotting.
    remaining_numeric = analyzer.categorize_columns()['numeric']
    analyzer.visualize_distributions(images_path, columns=remaining_numeric[:6])

    # Reports
    html_report_path = os.path.join(output_dir, "report.html")
    analyzer.generate_report(html_report_path)

    pdf_report_path = os.path.join(output_dir, "report.pdf")
    analyzer.save_as_pdf(pdf_report_path, images_path)

    print(f"Reports generated at {output_dir}")

if __name__ == "__main__":
    main()


Enter the path to your dataset (CSV/Excel): /content/Acadia.csv
Missing Values:
STATE                                                   59
TENURE_IN_MONTHS                                        66
CLOSESTSTOREDISTANCE                                  1545
AGE                                                   5945
INCOME                                                3259
LENGTH OF RESIDENCE                                   2872
NUMBER OF PERSONS IN LIVING UNIT                      2872
NUMBER OF ADULTS IN LIVING UNIT                       2872
MOSAIC                                                1868
CAPE: AGE: POP: MEDIAN AGE                            2872
CAPE: AGE: POP: % 0-17                                2872
CAPE: AGE: POP: % 18-99+                              2872
CAPE: AGE: POP: % 65-99+                              2872
CAPE: ETHNIC: POP: % WHITE ONLY                       2872
CAPE: ETHNIC: POP: % BLACK ONLY                       2872
CAPE: ETHNIC: POP: % ASIAN ONLY    

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]