In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO
import os
import textwrap
from io import StringIO
from tabulate import tabulate
from collections import Counter
from wordcloud import WordCloud

In [2]:
def Save_Genreal_Report(corr_matrix, dup, head, tail, des, null, info, shape):
    """Save the general-analysis report and its correlation heatmap to a folder.

    Prompts for a destination folder (created when missing), writes
    ``report.txt`` there and, when ``corr_matrix`` is not empty, also writes
    ``correlation_heatmap.png``.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Correlation matrix of the numeric columns; may be empty.
    dup : int
        Number of duplicate rows.
    head, tail : pandas.DataFrame
        First and last rows of the analysed frame.
    des : pandas.DataFrame
        Descriptive statistics of the numeric columns.
    null : pandas.Series
        Missing-value count per column.
    info : str
        Text captured from ``DataFrame.info``.
    shape : tuple
        Shape of the analysed frame.

    Returns
    -------
    None
        Failures are printed as ``Error: ...`` instead of being raised.
    """
    try:
        folder_path = input("Enter the full path or name of the folder to save the report and heatmap: ").strip()
        os.makedirs(folder_path, exist_ok=True)
        report = os.path.join(folder_path, "report.txt")
        heatmap_file = os.path.join(folder_path, "correlation_heatmap.png")

        rule = "-" * 50 + "\n"

        def titled(title, body, gap="\n\n\n"):
            # One report section: ruled title, body, then blank-line padding.
            return f"{rule}{title}\n{rule}{str(body)}\n{gap}"

        text = "".join([
            "GENERAL ANALYSIS OF THE DATAFRAME\n",
            "*" * 50 + "\n\n",
            f"{rule}Shape of DataFrame: {shape}\n{rule}\n\n",
            titled("Column Information:", info),
            titled("Missing Values in Each Column:", null),
            titled("Descriptive Statistics (Numerical):", des),
            titled("First 5 Rows of the DataFrame:", head),
            titled("Last 5 Rows of the DataFrame:", tail),
            f"{rule}Number of Duplicate Rows: {dup}\n{rule}\n\n",
            titled("Correlation Matrix (Numerical Columns):", corr_matrix, gap="\n\n"),
        ])

        # The text report is written first so that a plotting problem can no
        # longer prevent it from being saved.
        with open(report, 'w', encoding="utf-8") as f:
            f.write(text)
        print(f"Report successfully saved as '{report}'.")

        rows, cols = corr_matrix.shape
        if rows == 0 or cols == 0:
            print("No numeric columns to correlate; heatmap not saved.")
            return

        # Column labels are not guaranteed to be strings (e.g. integer headers).
        x_labels = [str(label) for label in corr_matrix.columns]
        y_labels = [str(label) for label in corr_matrix.index]
        fig_width = max(8, cols * 0.5, max(len(label) for label in x_labels) * 0.5)
        fig_height = max(6, rows * 0.5)

        fig = plt.figure(figsize=(fig_width, fig_height))
        try:
            plt.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
            plt.colorbar()
            plt.xticks(np.arange(cols), x_labels, rotation=45, ha="right", fontsize=12)
            plt.yticks(np.arange(rows), y_labels, rotation=0, fontsize=12)
            plt.title('Correlation Matrix Heatmap', fontsize=14)
            plt.tight_layout()
            plt.savefig(heatmap_file)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)
    except Exception as e:
        # Deliberate best effort: the interactive menu must keep running.
        print(f"Error: {e}")

    

In [3]:

def General_Analysis(df):
    """Print a general overview of ``df`` and optionally save it as a report.

    Prints, in order: shape, ``DataFrame.info`` output, missing values per
    column, descriptive statistics and correlation matrix of the numeric
    columns, the first and last five rows and the duplicate-row count, then
    shows a correlation heatmap. Finally asks whether the report should be
    saved and, on "y", delegates to ``Save_Genreal_Report``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to analyse. It is not modified.

    Returns
    -------
    None
    """
    def heading(title):
        print(title.center(60, "-"))

    print("\n" + "=" * 60)
    print("GENERAL ANALYSIS OF THE DATAFRAME".center(60))
    print("=" * 60 + "\n")

    # NOTE: these pandas display options are set globally and remain in
    # effect after this function returns.
    pd.options.display.float_format = '{:,.2f}'.format
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 200)

    shape = df.shape
    print("-" * 60)
    print(f"Shape of DataFrame: {shape}")
    print("-" * 60 + "\n")

    heading("COLUMN INFORMATION")
    buffer = StringIO()
    df.info(buf=buffer)
    info = buffer.getvalue()
    print(info)
    print("\n")

    heading("MISSING VALUES IN EACH COLUMN")
    null_values = df.isnull().sum()
    print(null_values)
    print("\n")

    # Missing values are intentionally NOT replaced by 0: describe() and
    # corr() already skip NaN, whereas imputed zeros would bias the results.
    numdf = df.select_dtypes(include=['number'])
    has_numeric = numdf.shape[1] > 0

    heading("DESCRIPTIVE STATISTICS (NUMERICAL)")
    if has_numeric:
        des = numdf.describe()
        print(tabulate(des, headers='keys', tablefmt='psql'))
    else:
        # describe() raises on a frame without columns, so skip it.
        des = pd.DataFrame()
        print("No numeric columns found.")
    print("\n")

    heading("FIRST 5 ROWS OF THE DATAFRAME")
    head = df.head()
    print(tabulate(head, headers='keys', tablefmt='psql'))
    print("\n")

    heading("LAST 5 ROWS OF THE DATAFRAME")
    tail = df.tail()
    print(tabulate(tail, headers='keys', tablefmt='psql'))
    print("\n")

    heading("DUPLICATE ROWS")
    dup = df.duplicated().sum()
    print(f"Number of Duplicate Rows: {dup}")
    print("\n")

    heading("CORRELATION MATRIX (NUMERICAL COLUMNS)")
    if has_numeric:
        corr_matrix = numdf.corr()
        print(tabulate(corr_matrix, headers='keys', tablefmt='psql'))
    else:
        corr_matrix = pd.DataFrame()
        print("No numeric columns found.")
    print("\n")

    if has_numeric:
        rows, cols = corr_matrix.shape
        # Column labels are not guaranteed to be strings.
        labels = [str(label) for label in corr_matrix.columns]
        fig_width = max(8, cols * 0.5, max(len(label) for label in labels) * 0.5)
        fig_height = max(6, rows * 0.5)
        plt.figure(figsize=(fig_width, fig_height))
        plt.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
        plt.colorbar()
        plt.xticks(np.arange(cols), labels, rotation=45, ha="right", fontsize=12)
        plt.yticks(np.arange(rows), labels, rotation=0, fontsize=12)
        plt.title('Correlation Matrix Heatmap', fontsize=14)
        plt.tight_layout()
        plt.show()

    save = input('Save General Report? (Y/N): ')
    if save.strip().lower() == "y":
        Save_Genreal_Report(corr_matrix, dup, head, tail, des, null_values, info, shape)



 
    


In [4]:


def save_custom(df, col1, col2):
    """Save a two-column numeric report (text + 2x2 figure) to a folder.

    Column names are matched case-insensitively. Only rows in which BOTH
    selected columns hold a value are used. The passed frame is not modified.
    The user is prompted for a destination folder; ``report.txt`` and
    ``visualization.png`` are written there.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col1, col2 : str
        Names of the two numeric columns (any letter case).

    Returns
    -------
    None
        Problems (unknown or non-numeric columns, no usable rows, I/O errors)
        are printed instead of raised.
    """
    col_names = {str(col).lower(): col for col in df.columns}
    col1_actual = col_names.get(col1.lower())
    col2_actual = col_names.get(col2.lower())

    if col1_actual is None or col2_actual is None:
        print(f"❌ Error: One or both columns '{col1}' or '{col2}' not found in DataFrame.")
        return

    if not pd.api.types.is_numeric_dtype(df[col1_actual]) or not pd.api.types.is_numeric_dtype(df[col2_actual]):
        print("❌ Error: One or both selected columns are not numeric.")
        return

    # Work on private copies. The previous df.dropna(inplace=True) modified the
    # caller's frame and dropped rows with a NaN in ANY column, which could
    # empty the whole dataset when some unrelated column is entirely missing.
    series1 = pd.to_numeric(df[col1_actual], errors='coerce')
    series2 = pd.to_numeric(df[col2_actual], errors='coerce')
    complete = series1.notna() & series2.notna()
    series1, series2 = series1[complete], series2[complete]
    if series1.empty:
        print(f"❌ Error: No rows have values in both '{col1_actual}' and '{col2_actual}'; nothing to save.")
        return

    # Generate report text
    report_lines = [
        "=============================== Preview ================================\n",
        f"Data for {col1_actual}:\n{tabulate(series1.head().to_frame(), headers='keys', tablefmt='psql')}\n",
        f"Data for {col2_actual}:\n{tabulate(series2.head().to_frame(), headers='keys', tablefmt='psql')}\n",
        f"================================ Summary Statistics for {col1_actual} ======================================\n",
        tabulate(series1.describe().to_frame().reset_index(), headers=['Statistic', col1_actual], tablefmt='psql') + "\n",
        f"================================ Summary Statistics for {col2_actual} ======================================\n",
        tabulate(series2.describe().to_frame().reset_index(), headers=['Statistic', col2_actual], tablefmt='psql') + "\n",
        f"Correlation between {col1_actual} and {col2_actual}: {series1.corr(series2)}\n"
    ]

    folder_path = input("Enter the full path or name of the folder to save the report and heatmap: ").strip()
    try:
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"❌ Error: could not create folder '{folder_path}': {e}")
        return

    new_folder_path = os.path.join(folder_path, "report.txt")
    visualization_file = os.path.join(folder_path, "visualization.png")

    fig = plt.figure(figsize=(12, 10))
    try:
        # Panel 1: scatter plot; a little noise (2% of each range) separates
        # overlapping points.
        plt.subplot(2, 2, 1)
        jitter_x = np.random.normal(0, (series1.max() - series1.min()) * 0.02, size=len(series1))
        jitter_y = np.random.normal(0, (series2.max() - series2.min()) * 0.02, size=len(series2))
        plt.scatter(series1 + jitter_x, series2 + jitter_y, alpha=0.4, edgecolor='k', linewidth=0.5)
        plt.xlabel(col1_actual)
        plt.ylabel(col2_actual)
        plt.title("Scatter Plot with Jitter")
        plt.grid(True, linestyle="--", alpha=0.5)

        # Panel 2: rolling mean of column 2 against the row index.
        plt.subplot(2, 2, 2)
        plt.plot(series2.index, series2.rolling(window=500, min_periods=1).mean(), linestyle='-')
        plt.xlabel("Row index")  # the x values are the row index, not column 1
        plt.ylabel(col2_actual)
        plt.title("Line Plot")

        # Panel 3: overlaid density histograms.
        plt.subplot(2, 2, 3)
        plt.hist(series1, bins=30, alpha=0.7, density=True, color='blue', label=col1_actual)
        plt.hist(series2, bins=30, alpha=0.7, density=True, color='red', label=col2_actual)
        plt.legend()
        plt.xlabel("Value")
        plt.ylabel("Density")
        plt.title("Histogram")

        # Panel 4: 2x2 correlation heatmap. Positional keys keep this working
        # even when the same column was selected twice.
        plt.subplot(2, 2, 4)
        plt.imshow(pd.DataFrame({0: series1, 1: series2}).corr(), cmap='coolwarm', vmin=-1, vmax=1)
        plt.colorbar()
        plt.xticks([0, 1], [col1_actual, col2_actual])
        plt.yticks([0, 1], [col1_actual, col2_actual])
        plt.title("Correlation Heatmap")

        plt.tight_layout()

        with open(new_folder_path, "w", encoding="utf-8") as file:
            file.writelines(report_lines)
        plt.savefig(visualization_file)
    except OSError as e:
        print(f"❌ Error: could not save report: {e}")
        return
    finally:
        # Release the figure whether or not saving succeeded.
        plt.close(fig)

    print(f"✅ Report saved at: {new_folder_path}")
    print(f"✅ Visualization saved at: {visualization_file}")


In [15]:
def Custom_Analysis(df, col1, col2, num):
    """Print and plot a comparison of two numeric columns of ``df``.

    Column names are matched case-insensitively. Prints a preview, summary
    statistics, null/empty counts and the correlation, shows a 2x2 figure
    (jittered scatter, rolling mean, histograms, correlation heatmap) and
    finally offers to save the report through ``save_custom``. The passed
    frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    col1, col2 : str
        Names of the two numeric columns (any letter case).
    num : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    None
        Invalid selections are reported with a printed message.
    """
    col_names = {str(col).lower(): col for col in df.columns}
    col1_actual = col_names.get(col1.lower())
    col2_actual = col_names.get(col2.lower())
    if col1_actual is None or col2_actual is None:
        print(f"❌ Error: One or both columns '{col1}' or '{col2}' not found in DataFrame.")
        return

    for actual in (col1_actual, col2_actual):
        if not pd.api.types.is_numeric_dtype(df[actual]):
            print(f"❌ Error: Column '{actual}' is not numeric.")
            return

    # Look the columns up by their resolved names (the typed name may differ
    # in letter case) and stop for EITHER column when it holds no data.
    for typed, actual in ((col1, col1_actual), (col2, col2_actual)):
        if df[actual].isna().all():
            print(f'column {typed} is completely Empty !')
            return

    # Rows are deliberately not dropped: other columns of the dataset may be
    # entirely empty, which would remove every row.
    column1_data = pd.to_numeric(df[col1_actual], errors='coerce')
    column2_data = pd.to_numeric(df[col2_actual], errors='coerce')

    print("=============================== Preview ================================")
    column1_data_df = column1_data.head().to_frame()
    column2_data_df = column2_data.head().to_frame()
    print(f"Data for {col1_actual}:\n", tabulate(column1_data_df, headers='keys', tablefmt='psql'), "\n")
    print(f"Data for {col2_actual}:\n", tabulate(column2_data_df, headers='keys', tablefmt='psql'), "\n")

    print(f"================================ Summary Statistics for {col1_actual} ======================================")
    column1_stats_df = column1_data.describe().to_frame().reset_index()
    print(tabulate(column1_stats_df, headers=['Statistic', col1_actual], tablefmt='psql'), "\n")

    print(f"====================================== Summary Statistics for {col2_actual} ========================================")
    column2_stats_df = column2_data.describe().to_frame().reset_index()
    print(tabulate(column2_stats_df, headers=['Statistic', col2_actual], tablefmt='psql'), "\n")

    null_count_col1 = column1_data.isnull().sum()
    empty_count_col1 = (column1_data.eq('')).sum()
    null_count_col2 = column2_data.isnull().sum()
    empty_count_col2 = (column2_data.eq('')).sum()
    print(f"Null values in {col1_actual}: {null_count_col1}")
    print(f"Empty values in {col1_actual}: {empty_count_col1}\n")

    print(f"Null values in {col2_actual}: {null_count_col2}")
    print(f"Empty values in {col2_actual}: {empty_count_col2}\n")
    correlation = column1_data.corr(column2_data)
    print(f"Correlation between {col1_actual} and {col2_actual}: {correlation}\n")

    plt.figure(figsize=(12, 10))

    # Panel 1: scatter plot; a little noise (2% of each range) separates
    # overlapping points.
    plt.subplot(2, 2, 1)
    jitter_x = np.random.normal(0, (column1_data.max() - column1_data.min()) * 0.02, size=len(column1_data))
    jitter_y = np.random.normal(0, (column2_data.max() - column2_data.min()) * 0.02, size=len(column2_data))
    plt.scatter(column1_data + jitter_x, column2_data + jitter_y, alpha=0.4, edgecolor='k', linewidth=0.5)
    plt.xlabel(col1_actual)
    plt.ylabel(col2_actual)
    plt.title("Scatter Plot with Jitter")
    plt.grid(True, linestyle="--", alpha=0.5)

    # Panel 2: rolling mean of column 2 against the row index. It is kept in
    # a local variable so no helper column is added to the user's DataFrame.
    plt.subplot(2, 2, 2)
    rolling_window = 500
    rolling_mean = column2_data.rolling(window=rolling_window, min_periods=1).mean()
    plt.plot(rolling_mean.index, rolling_mean, linestyle='-')
    plt.xlabel("Row index")  # the x values are the row index, not column 1
    plt.ylabel(col2_actual)
    plt.title(f"Line Plot (Smoothed, Window={rolling_window})")
    plt.grid(True, linestyle='--', alpha=0.5)

    # Panel 3: overlaid density histograms (missing values excluded).
    plt.subplot(2, 2, 3)
    plt.hist(column1_data.dropna(), bins=30, alpha=0.7, density=True, color='darkblue', label=col1_actual)
    plt.hist(column2_data.dropna(), bins=30, alpha=0.7, density=True, color='darkred', label=col2_actual)
    plt.legend()
    plt.xlabel("Value")
    plt.ylabel("Density")
    plt.title("Histogram (Optimized)")

    # Panel 4: 2x2 correlation heatmap. Positional keys keep this working
    # even when the same column was selected twice.
    plt.subplot(2, 2, 4)
    corr_matrix = pd.DataFrame({0: column1_data, 1: column2_data}).corr()
    plt.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
    plt.colorbar()
    plt.xticks([0, 1], [col1_actual, col2_actual], rotation=45)
    plt.yticks([0, 1], [col1_actual, col2_actual])
    plt.title("Correlation Heatmap")

    plt.tight_layout()
    plt.show()

    save = input("Do you want to save this report ?")
    if save.strip().lower() == "y":
        save_custom(df, col1, col2)



        

        


In [6]:


def save_string_report(df, column_name, output_folder="output", top_n=10):
    """Save a frequency report and a 2x2 figure for one categorical column.

    Counts the non-missing values of ``column_name``, keeps the ``top_n`` most
    frequent ones (the remainder is summed under "Others"), prompts for a
    destination folder and writes ``report.txt`` and ``visualization.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    column_name : str
        Column to summarise (exact name).
    output_folder : str, optional
        Currently unused: the destination is always asked interactively. The
        parameter is kept so existing calls keep working.
    top_n : int, optional
        Number of most frequent categories listed individually.

    Returns
    -------
    None
        Problems are printed instead of raised.
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in DataFrame.")
        return

    values = df[column_name].dropna()
    if values.empty:
        print(f"Column '{column_name}' has no non-missing values; nothing to save.")
        return

    value_counts = Counter(values)
    # Remember the real number of distinct values before any truncation.
    unique_total = len(value_counts)

    if len(value_counts) > top_n:
        most_common = dict(value_counts.most_common(top_n))
        others_count = sum(value_counts.values()) - sum(most_common.values())
        # Add to (rather than overwrite) a genuine category named "Others".
        most_common["Others"] = most_common.get("Others", 0) + others_count
        value_counts = most_common

    # String labels keep bar positions 0..n-1 aligned with the text labels
    # even when the categories are numbers.
    labels = [str(key) for key in value_counts.keys()]
    counts = list(value_counts.values())

    folder_path = input("Enter the full path or name of the folder to save the report and heatmap: ").strip()
    try:
        os.makedirs(folder_path, exist_ok=True)
    except OSError as e:
        print(f"Error: could not create folder '{folder_path}': {e}")
        return

    new_folder_path = os.path.join(folder_path, "report.txt")
    visualization_file = os.path.join(folder_path, "visualization.png")

    fig = plt.figure(figsize=(14, 8))
    try:
        plt.subplot(2, 2, 1)
        plt.bar(labels, counts, color='darkblue')
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.title(f"Distribution of {column_name} (Top {top_n} Categories)")
        plt.xticks(rotation=45, ha="right")

        for i, count in enumerate(counts):
            plt.text(i, count + 0.1, str(count), ha='center', fontsize=12, fontweight='bold', color='black')

        plt.subplot(2, 2, 2)
        plt.pie(counts, labels=labels, autopct='%1.1f%%', colors=plt.cm.Paired.colors)
        plt.title(f"Distribution of {column_name} (Pie Chart)")

        plt.subplot(2, 2, 3)
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            # astype(str) protects the join against stray non-string entries.
            text = " ".join(values.astype(str))
            try:
                wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
            except ValueError:
                # WordCloud raises ValueError when the text has no usable words.
                pass
            else:
                plt.imshow(wordcloud, interpolation="bilinear")
                plt.title(f"Word Cloud for {column_name}")
        plt.axis('off')

        plt.subplot(2, 2, 4)
        heatmap_data = np.array(counts).reshape(1, -1)
        plt.imshow(heatmap_data, aspect='auto', cmap='Blues')
        plt.colorbar(label="Count")
        plt.title(f"Heatmap for {column_name}")
        plt.xticks(ticks=np.arange(len(labels)), labels=labels, rotation=90)

        plt.tight_layout()

        with open(new_folder_path, "w", encoding="utf-8") as file:
            file.write(f"Report for Column: {column_name}\n")
            file.write("=" * 40 + "\n\n")
            file.write(f"Total Unique Values: {unique_total}\n")
            file.write("\nTop Occurring Values:\n")

            for label, count in zip(labels, counts):
                file.write(f"{label}: {count}\n")

        plt.savefig(visualization_file)
    except OSError as e:
        print(f"Error: could not save report: {e}")
        return
    finally:
        # Release the figure whether or not saving succeeded.
        plt.close(fig)

    print(f"Report saved: {new_folder_path}")
    print(f"Visualization saved: {visualization_file}")


In [7]:
def plot_string_column(df, column_name, top_n=10):
    """Plot and summarise the value distribution of one categorical column.

    Counts the non-missing values of ``column_name``, keeps the ``top_n`` most
    frequent ones (the remainder is summed under "Others"), shows a 2x2 figure
    (bar chart, pie chart, word cloud, count heatmap), prints the counts and
    offers to save the report through ``save_string_report``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    column_name : str
        Column to analyse (exact name).
    top_n : int, optional
        Number of most frequent categories shown individually.

    Returns
    -------
    None
    """
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found in DataFrame.")
        return

    values = df[column_name].dropna()
    if values.empty:
        print(f"Column '{column_name}' has no non-missing values; nothing to analyze.")
        return

    value_counts = Counter(values)

    if len(value_counts) > top_n:
        most_common = dict(value_counts.most_common(top_n))
        others_count = sum(value_counts.values()) - sum(most_common.values())
        # Add to (rather than overwrite) a genuine category named "Others".
        most_common["Others"] = most_common.get("Others", 0) + others_count
        value_counts = most_common

    # String labels keep bar positions 0..n-1 aligned with the text labels
    # even when the categories are numbers.
    labels = [str(key) for key in value_counts.keys()]
    counts = list(value_counts.values())

    plt.figure(figsize=(14, 8))

    plt.subplot(2, 2, 1)
    plt.bar(labels, counts, color='darkblue')
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.title(f"Distribution of {column_name} (Top {top_n} Categories)")
    plt.xticks(rotation=45, ha="right")

    for i, count in enumerate(counts):
        plt.text(i, count + 0.1, str(count), ha='center', fontsize=12, fontweight='bold', color='black')

    plt.subplot(2, 2, 2)
    plt.pie(counts, labels=labels, autopct='%1.1f%%', colors=plt.cm.Paired.colors)
    plt.title(f"Distribution of {column_name} (Pie Chart)")

    plt.subplot(2, 2, 3)
    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        # astype(str) protects the join against stray non-string entries.
        text = " ".join(values.astype(str))
        try:
            wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)
        except ValueError:
            # WordCloud raises ValueError when the text has no usable words.
            pass
        else:
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.title(f"Word Cloud for {column_name}")
    plt.axis('off')

    plt.subplot(2, 2, 4)
    heatmap_data = np.array(counts).reshape(1, -1)
    plt.imshow(heatmap_data, aspect='auto', cmap='Blues')
    plt.colorbar(label="Count")
    plt.title(f"Heatmap for {column_name}")
    plt.xticks(ticks=np.arange(len(labels)), labels=labels, rotation=90)

    # Show the plot
    plt.tight_layout()
    plt.show()

    # Basic Analysis
    print(f"\n====================================== Basic Analysis of '{column_name}' ===========================================")
    print(f"Total unique values (Top {top_n} with 'Others'): {len(value_counts)}")
    for label, count in value_counts.items():
        print(f"{label}: {count} occurrences")
    choice = input("want to save this repiort (y/n) ?")
    if choice.strip().lower() == 'y':
        # top_n must be passed by keyword: the third positional parameter of
        # save_string_report is output_folder, not top_n.
        save_string_report(df, column_name, top_n=top_n)


In [8]:
def Analyze_Data(df):
    """Run the interactive analysis menu for an already loaded dataset.

    The main menu offers the general analysis, a sub-menu of custom analyses
    (two numeric columns, or one string column) and "Return".

    Parameters
    ----------
    df : pandas.DataFrame or None
        The dataset to analyse. ``None`` (no file was loaded) is reported and
        the function returns immediately.

    Returns
    -------
    None
    """
    if df is None:
        # Load_Data returns None when the user backs out without loading a file.
        print("No dataset loaded. Please load a file first.")
        return

    while True:
        print("1. General Analysis")
        print("2. Custom Analysis")
        print("3. Return")
        choice = input("Enter Choice: ").strip()

        if choice == "1":
            General_Analysis(df)
        elif choice == "2":
            while True:
                print("1. Analyze Two Numeric Columns")
                print("2. Analyze String Column")
                print("3. Return")
                sub_choice = input("Enter Choice: ").strip()

                if sub_choice == "1":
                    # Strip stray spaces so that "price " still matches "price".
                    col1 = input('Enter 1st Numeric Column').strip()
                    col2 = input('Enter 2nd Numeric Column').strip()
                    Custom_Analysis(df, col1, col2, 1)
                elif sub_choice == "2":
                    col2 = input('Enter String Column').strip()
                    plot_string_column(df, col2)
                elif sub_choice == "3":
                    break
                else:
                    print("Please Enter Valid Choice !")
        elif choice == "3":
            return
        else:
            print("Please Enter Valid Choice !")

In [9]:
def _read_csv_with_fallback(path):
    """Read a CSV as UTF-8, falling back to Latin-1 for legacy encodings."""
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # Latin-1 can decode any byte sequence. Unlike 'unicode_escape' it does
        # not interpret backslash sequences that appear inside the data.
        return pd.read_csv(path, encoding="latin-1")


def Load_Data():
    """Ask which kind of file to load and return it as a DataFrame.

    Shows a menu (CSV, Parquet, Excel, Return), prompts for the file path and
    keeps asking until a file loads or the user chooses "Return".

    Returns
    -------
    pandas.DataFrame or None
        The loaded data, or ``None`` when the user chose "Return".
    """
    readers = {
        "1": ("CSV", _read_csv_with_fallback),
        "2": ("Parquet", pd.read_parquet),
        "3": ("Excel", pd.read_excel),
    }

    while True:
        print("1. Analyze CSV File")
        print("2. Analyze Parquet File")
        print("3. Analyze Excel File")
        print("4. Return")
        choice = input("Enter Choice: ").strip()

        if choice == "4":
            return None
        if choice not in readers:
            print("Please Enter Valid Choice !")
            continue

        kind, reader = readers[choice]
        # Pasted paths often carry surrounding spaces or quotes.
        path = input(f"Enter File Path of {kind} File: ").strip().strip("\"'")
        try:
            return reader(path)
        except FileNotFoundError:
            print(f"Error: {kind} File not found: {path}. Try again !")
        except Exception as e:
            # Deliberately broad: parser errors, missing optional engines and
            # permission problems must not end the interactive menu.
            print(f"Error: could not read {kind} File: {e}")


In [10]:
def save_dataset(df):
    """Ask for a destination path and write ``df`` there as CSV (without the index).

    Any error raised while writing is reported on stdout rather than propagated.
    """
    destination = input("Enter the path to save the edited CSV file: ")
    try:
        df.to_csv(destination, index=False)
    except Exception as error:
        outcome = f"Error saving dataset: {error}"
    else:
        outcome = f"\nDataset saved successfully at {destination}!\n"
    print(outcome)

In [11]:
def _coerce_like(series, raw):
    """Best-effort conversion of typed text to the kind of value ``series`` holds.

    For numeric (non-boolean) columns the text is parsed as int or float when
    possible; in every other case the text is returned unchanged.
    """
    if pd.api.types.is_bool_dtype(series) or not pd.api.types.is_numeric_dtype(series):
        return raw
    text = raw.strip()
    if pd.api.types.is_integer_dtype(series):
        try:
            return int(text)
        except ValueError:
            pass
    try:
        return float(text)
    except ValueError:
        return raw


def _ask_row_position(df, prompt):
    """Prompt for a row position; return it, or None after reporting bad input."""
    try:
        position = int(input(prompt).strip())
    except ValueError:
        print("\nInvalid row index!")
        return None
    if not 0 <= position < len(df):
        print("\nInvalid row index!")
        return None
    return position


def Edit_Data(df):
    """Run the interactive editing menu and save the result on "Return".

    Rows are addressed by position (0 = first row of the current view). After
    sorting, filtering, deleting or de-duplicating, the index is renumbered so
    positions keep matching what the user sees.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataset to edit. Some operations modify this object in place while
        others produce a new frame, so callers should use the return value.

    Returns
    -------
    pandas.DataFrame
        The edited dataset, i.e. the frame that was handed to ``save_dataset``.
    """
    menu = (
        "1. Add a New Row",
        "2. Update an Existing Row",
        "3. Delete a Row",
        "4. Edit Specific Column Data",
        "5. Fill Missing Data",
        "6. Rename Columns",
        "7. Change Data Type of a Column",
        "8. Sort Data",
        "9. Filter Data",
        "10. Remove Duplicates",
        "11. Return",
    )
    casts = {"int": int, "float": float, "str": str}
    comparisons = {"1": "eq", "2": "gt", "3": "lt"}

    while True:
        for line in menu:
            print(line)

        choice = input("Enter Choice: ").strip()

        if choice == "1":
            print("\nAdding a new row:")
            # Typed text is converted per column so numeric columns stay numeric.
            new_row = {col: _coerce_like(df[col], input(f"Enter value for {col}: ")) for col in df.columns}
            # ignore_index avoids overwriting an existing row label.
            df = pd.concat([df, pd.DataFrame([new_row], columns=df.columns)], ignore_index=True)
            print("\nRow added successfully!")
        elif choice == "2":
            print("\nUpdating an existing row:")
            position = _ask_row_position(df, "Enter row index to update: ")
            if position is not None:
                col_name = input("Enter column name to update: ").strip()
                if col_name in df.columns:
                    new_value = _coerce_like(df[col_name], input("Enter new value: "))
                    df.iloc[position, df.columns.get_loc(col_name)] = new_value
                    print("\nRow updated successfully!")
                else:
                    print("\nInvalid column name!")
        elif choice == "3":
            print("\nDeleting a row:")
            position = _ask_row_position(df, "Enter row index to delete: ")
            if position is not None:
                df = df.drop(index=df.index[position]).reset_index(drop=True)
                print("\nRow deleted successfully!")
        elif choice == "4":
            print("\nEditing specific column data:")
            col_name = input("Enter column name: ").strip()
            if col_name in df.columns:
                column = df[col_name]
                # One prompt per row, in row order.
                df[col_name] = [_coerce_like(column, input(f"Enter new value for {x}: ")) for x in column]
                print("\nColumn updated successfully!")
            else:
                print("\nInvalid column name!")
        elif choice == "5":
            print("\nFilling missing data:")
            fill_method = input("Enter 'mean', 'median', 'mode', or a specific value: ")
            keyword = fill_method.strip().lower()
            if keyword == "mean":
                # numeric_only=True: text columns have no mean and would raise.
                df.fillna(df.mean(numeric_only=True), inplace=True)
            elif keyword == "median":
                df.fillna(df.median(numeric_only=True), inplace=True)
            elif keyword == "mode":
                modes = df.mode()
                if not modes.empty:
                    df.fillna(modes.iloc[0], inplace=True)
            else:
                # Convert the typed value per column so numeric columns are not
                # filled with text.
                df.fillna({col: _coerce_like(df[col], fill_method) for col in df.columns}, inplace=True)
            print("\nMissing values filled successfully!")
        elif choice == "6":
            print("\nRenaming columns:")
            print("Current columns:", list(df.columns))
            old_col = input("Enter the column name to rename: ").strip()
            new_col = input("Enter the new column name: ").strip()
            if old_col not in df.columns:
                print("\nInvalid column name!")
            elif not new_col or new_col in df.columns:
                print("\nInvalid new column name (empty or already in use)!")
            else:
                df.rename(columns={old_col: new_col}, inplace=True)
                print("\nColumn renamed successfully!")
        elif choice == "7":
            print("\nChanging data type of a column:")
            col_name = input("Enter column name: ").strip()
            new_type = input("Enter new data type (int, float, str): ").strip().lower()
            if col_name not in df.columns:
                print("\nInvalid column name!")
            elif new_type not in casts:
                print("\nInvalid data type! Choose int, float or str.")
            else:
                try:
                    df[col_name] = df[col_name].astype(casts[new_type])
                    print("\nData type changed successfully!")
                except Exception as e:
                    # e.g. missing values or text that cannot be converted
                    print(f"Error changing data type: {e}")
        elif choice == "8":
            print("\nSorting data:")
            col_name = input("Enter column name to sort by: ").strip()
            if col_name not in df.columns:
                print("\nInvalid column name!")
            else:
                order = input("Enter 'asc' for ascending or 'desc' for descending: ").strip().lower()
                try:
                    # As before, anything other than 'asc' sorts descending.
                    df = df.sort_values(by=col_name, ascending=(order == "asc")).reset_index(drop=True)
                    print("\nData sorted successfully!")
                except TypeError as e:
                    print(f"Error sorting data: {e}")
        elif choice == "9":
            print("\nFiltering data:")
            col_name = input("Enter column name to filter by: ").strip()

            # Check if column exists
            if col_name not in df.columns:
                print("Error: Column does not exist!")
            else:
                print("\nChoose filter type:")
                print("1. Equal to (==)")
                print("2. Greater than (>)")
                print("3. Smaller than (<)")
                filter_type = input("Enter choice (1/2/3): ").strip()

                if filter_type not in comparisons:
                    print("Invalid filter choice!")
                else:
                    filter_value = input(f"Enter value to filter {col_name} by: ")
                    try:
                        column = df[col_name]
                        # Numeric columns need a numeric value to compare with.
                        if pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column):
                            text = filter_value.strip()
                            try:
                                filter_value = int(text)
                            except ValueError:
                                filter_value = float(text)

                        mask = getattr(column, comparisons[filter_type])(filter_value)
                        df = df[mask].reset_index(drop=True)
                        print("\nData filtered successfully!")
                    except (ValueError, TypeError):
                        print("Error: Filter value type does not match column type!")
        elif choice == "10":
            print("\nRemoving duplicates:")
            df.drop_duplicates(inplace=True)
            df.reset_index(drop=True, inplace=True)
            print("\nDuplicates removed successfully!")
        elif choice == "11":
            save_dataset(df)
            return df
        else:
            print("Please Enter a Valid Choice!")


In [None]:
# Top-level interactive menu: load a dataset, then analyse or edit it.
b = True
while b:
    print("1. Analyze Data")
    print("2. Edit Data")
    print("3. Cleaning Data")
    print("4. Sort Folder")
    print("5. Exit")
    choice = input("Enter Choice: ").strip()
    if choice == "1":
        df = Load_Data()
        # Load_Data returns None when the user backs out without loading a file.
        if df is not None:
            Analyze_Data(df)
    elif choice == "2":
        df = Load_Data()
        if df is not None:
            Edit_Data(df)
    elif choice == "3":
        # TODO: "Cleaning Data" is not implemented yet; placeholder output.
        print(3)
    elif choice == "4":
        # TODO: "Sort Folder" is not implemented yet; placeholder output.
        print(4)
    elif choice == "5":
        print("Exiting Program...")
        b = False
    else:
        print("Please Enter Valid Choice !")
# C:\\Users\\Shivam\\Downloads\\Diwali Sales Data.csv
# C:\\Users\\Shivam\\Downloads\\road_accident_dataset.csv
# D:\\shivam
