In [13]:
# Install python-docx, the package that provides the `docx` module imported by the next cell.
# NOTE(review): consider `%pip install` with a pinned version so the install targets the kernel's environment.
!pip install python-docx



In [14]:
import io
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from docx import Document
from docx.shared import Inches

def load_data(file_path):
    """Load a tabular data file into a DataFrame, choosing the reader by extension.

    The extension is matched case-insensitively, so ``DATA.CSV`` is handled
    the same way as ``data.csv``. ``file_path`` may be a string or any object
    whose ``str()`` is the path (for example ``pathlib.Path``).

    Args:
        file_path: Path to a ``.csv``, ``.json`` or ``.xlsx`` file.

    Returns:
        pandas.DataFrame: The parsed contents of the file.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Insertion order matters only for readability; the suffixes do not overlap.
    readers = {
        '.csv': pd.read_csv,
        '.json': pd.read_json,
        '.xlsx': pd.read_excel,
    }

    # Lower-case only the copy used for matching; the reader gets the real path.
    lowered_path = str(file_path).lower()
    for extension, reader in readers.items():
        if lowered_path.endswith(extension):
            return reader(file_path)

    raise ValueError(
        f"Unsupported file format: {file_path!r} (expected .csv, .json or .xlsx)"
    )

def clean_data(data):
    """Return ``data`` without incomplete rows and without repeated rows.

    Rows containing any missing value are dropped first, then exact duplicate
    rows are removed. The result is a new DataFrame; ``data`` is reassigned
    only locally and the surviving rows keep their index labels.
    """
    return data.dropna().drop_duplicates()

def preprocess_data(data):
    """One-hot encode the categorical columns of ``data``.

    Delegates to ``pd.get_dummies`` with its default settings, so each
    categorical column is replaced by one indicator column per distinct value
    while the remaining columns are passed through.
    """
    return pd.get_dummies(data)

def plot_distribution(data, column):
    """Draw a histogram with a KDE overlay for one column on a new figure.

    The figure is left open as matplotlib's current figure so the caller can
    save or show it; nothing is returned.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.histplot(data[column], kde=True, ax=axes)
    axes.set(
        title=f'Distribution of {column}',
        xlabel=column,
        ylabel='Frequency',
    )

def plot_heatmap(data):
    """Draw an annotated correlation heatmap of the numeric columns on a new figure.

    Only numeric and boolean columns take part in the correlation. Calling
    ``DataFrame.corr()`` on a frame that still holds text columns raises under
    pandas 2.x, so those columns are filtered out first.

    Args:
        data: DataFrame whose numeric/boolean columns are correlated.

    Returns:
        None. The figure is left open as matplotlib's current figure so the
        caller can save or show it.
    """
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')

def _add_current_figure(document):
    """Embed matplotlib's current figure in ``document`` and then close it.

    The figure is rendered to an in-memory PNG buffer, so no temporary image
    file is written to the working directory.
    """
    buffer = io.BytesIO()
    try:
        plt.savefig(buffer, format='png')
        buffer.seek(0)  # rewind so the image is read from its first byte
        document.add_picture(buffer, width=Inches(6))
    finally:
        # Always release the figure, even if saving or embedding failed.
        plt.close()


def generate_report(data, report_filename='analysis_report.docx'):
    """Build a Word report describing ``data`` and save it to ``report_filename``.

    The report contains the frame's shape and dtypes, its ``describe()``
    summary, one histogram per numeric column, one pie chart per
    object/category column, and a correlation heatmap of the numeric and
    boolean columns (skipped when there are none).

    Frames that were already one-hot encoded normally have no object/category
    columns left, in which case no pie charts are added.

    Args:
        data: DataFrame to describe.
        report_filename: Path of the ``.docx`` file to write.

    Returns:
        None. The document is written to disk and a confirmation is printed.
    """
    document = Document()
    document.add_heading('Data Analysis Report', level=0)

    document.add_heading('Data Overview', level=1)
    document.add_paragraph(f"The dataset contains {data.shape[0]} rows and {data.shape[1]} columns.")
    document.add_paragraph(f"Data types: \n{data.dtypes}")

    document.add_heading('Descriptive Statistics', level=1)
    document.add_paragraph(f"Summary statistics: \n{data.describe()}")

    document.add_heading('Visualizations', level=1)

    for column in data.select_dtypes(include=['number']).columns:
        plot_distribution(data, column)
        _add_current_figure(document)

    # Pie charts belong to categorical columns. Iterating explicitly avoids
    # reusing the loop variable left over from the numeric loop above, which
    # is undefined when the frame has no numeric columns.
    for column in data.select_dtypes(include=['object', 'category']).columns:
        plt.figure(figsize=(8, 6))
        data[column].value_counts().plot.pie(autopct='%1.1f%%', colors=['skyblue', 'orange', 'green'], startangle=90, wedgeprops={'linewidth': 1, 'edgecolor': 'black'})
        plt.title(f'Proportion of {column}')
        plt.ylabel('')  # Removes the default label
        _add_current_figure(document)

    # A correlation matrix needs numeric input; skip the heatmap when there is none.
    numeric_data = data.select_dtypes(include=['number', 'bool'])
    if not numeric_data.columns.empty:
        plot_heatmap(numeric_data)
        _add_current_figure(document)

    document.add_heading('Summary', level=1)
    document.add_paragraph("This report provides a basic overview of the data, including descriptive statistics and visualizations. "
                             "Further analysis can be performed to gain deeper insights.")

    document.save(report_filename)
    print(f"Report saved to {report_filename}")

def user_interface():
    """Run an interactive prompt that turns a data file into a Word report.

    The loop accepts two commands, matched case-insensitively:

    * ``exit`` ends the loop.
    * any command containing ``load data`` asks for a file path, then loads,
      cleans and preprocesses the file, asks for a report name, and writes
      the report.

    A failure while loading the file or writing the report (missing file,
    unsupported format, unreadable content, missing optional reader
    dependency) is reported to the user and the prompt continues instead of
    terminating. A blank report name falls back to ``generate_report``'s
    default filename.

    Returns:
        None.
    """
    print("Welcome to the AI Employee!")
    while True:
        command = input("Enter your command (type 'exit' to quit, 'load data' to generate a report): ").strip()
        if command.lower() == 'exit':
            break
        if not re.search("load data", command, re.IGNORECASE):
            print("Command not recognized.")
            continue

        # Stray whitespace around a typed or pasted path is never intended.
        file_path = input("Enter file path: ").strip()
        try:
            data = load_data(file_path)
            print("Data loaded successfully.")

            data = clean_data(data)
            print("Data cleaned successfully.")

            data = preprocess_data(data)
            print("Data preprocessed successfully.")

            report_name = input("Enter report name: ").strip()
            if report_name:
                generate_report(data, report_name)
            else:
                # An empty filename cannot be saved; use the default name.
                generate_report(data)
            print("Report generated successfully.")
        except (OSError, ValueError, ImportError) as error:
            # Keep the session alive so the user can retry with another file.
            print(f"Could not generate the report: {error}")

# Start the interactive prompt; this call blocks on input() until the user types 'exit'.
user_interface()


Welcome to the AI Employee!
Enter your command (type 'exit' to quit, 'load data' to generate a report): load data
Enter file path: /content/olympics2024.csv - olympics2024.csv.csv
Data loaded successfully.
Data cleaned successfully.
Data preprocessed successfully.
Enter report name: report4.docx
Report saved to report4.docx
Report generated successfully.
Enter your command (type 'exit' to quit, 'load data' to generate a report): exit
