### Import Required Libraries
Import the necessary libraries, such as pandas, numpy, and matplotlib.

In [None]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook display settings: show up to 100 rows, and never truncate columns
# (None removes the column limit) when a DataFrame is rendered.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

# Enable inline plotting for matplotlib.
# NOTE: this is an IPython/Jupyter magic command, not plain Python; it only
# works when the cell is run inside a notebook/IPython kernel.
%matplotlib inline

# Load the Dataset
Load the dataset into a pandas DataFrame.

In [None]:
# Load the Dataset
# Read the recommendations CSV into `data`, the DataFrame every later cell uses.
# Replace the path below with your dataset path.
data = pd.read_csv(filepath_or_buffer='../output/advisorRecommendations.csv')

# Optional spot check: rows where category is 'Cost' and impact is 'Low'
# data[(data['category'] == 'Cost') & (data['impact'] == 'Low')].head()

# Preview the first five rows as the cell output
data.head(n=5)

# Explore the Dataset
Perform initial exploration of the dataset, including checking for missing values and basic statistics.

In [None]:
# Explore the Dataset
# Gather the four overview summaries first, then print them in a fixed order.

missing_values = data.isna().sum()  # count of missing entries per column
basic_stats = data.describe()       # descriptive statistics from DataFrame.describe()
data_types = data.dtypes            # dtype of every column
data_shape = data.shape             # (rows, columns)

print("Missing values in each column:\n", missing_values)
print("\nBasic statistics of the dataset:\n", basic_stats)
print("\nData types of each column:\n", data_types)
print("\nShape of the dataset:", data_shape)

### Analysis 1 - Recommendations per Category

In [None]:
# Prompt: Create a summary of the dataset with the columns: Category, HighPriority, MediumPriority, LowPriority, and TotalRecommendations. Also add a row at the end that shows the sum of each column.

# One counter per impact level, applied to the 'impact' column of each category.
# `level=level` binds the level when the lambda is created; without it every
# lambda would see the last value of the loop variable.
priority_columns = ['HighPriority', 'MediumPriority', 'LowPriority']
summary_df = data.groupby('category').agg(**{
    f'{level}Priority': ('impact', lambda impacts, level=level: (impacts == level).sum())
    for level in ('High', 'Medium', 'Low')
}).reset_index()

# Per-category total across the three impact columns
summary_df['TotalRecommendations'] = summary_df[priority_columns].sum(axis=1)

# Build a one-row frame holding the column sums, label it 'Total', and append it
total_row = summary_df[priority_columns + ['TotalRecommendations']].sum().to_frame().T
total_row['category'] = 'Total'
summary_df = pd.concat([summary_df, total_row], ignore_index=True)

# Show the finished summary as the cell output
summary_df


In [None]:
# Bar colour for each impact level
colors = {'High': 'red', 'Medium': 'orange', 'Low': 'blue'}

# Column order fixes the stacking order: High, then Medium, then Low
impact_columns = ['HighPriority', 'MediumPriority', 'LowPriority']

# Keep only the real categories (drop the appended 'Total' row) and index by category
is_category_row = summary_df['category'] != 'Total'
category_impact_counts = summary_df.loc[is_category_row].set_index('category')[impact_columns]

# 'HighPriority' -> 'High', etc., so each column can be looked up in `colors`
bar_colors = [colors[name.partition('Priority')[0]] for name in category_impact_counts.columns]

# Horizontal stacked bars: one bar per category, one segment per impact level
category_impact_counts.plot.barh(stacked=True, figsize=(13, 4), color=bar_colors)
plt.title('Count per Category and per Impact')
plt.xlabel('Count')
plt.ylabel('Category')
plt.legend(title='Impact')
plt.show()

# The plotting frame is no longer needed; drop the name
del category_impact_counts

### Analysis for Cost Recommendations

In [None]:
# Keep only the recommendations whose category is 'Cost'
summary_cost_df = data.loc[data['category'].eq('Cost')]



In [None]:
# Sum annualSavingsAmount within each impact level; the resulting Series is
# the cell output, shown under the printed heading.
print("Total annual savings amount for each impact level:")
summary_cost_df.groupby('impact')['annualSavingsAmount'].agg('sum')

In [None]:
# Group by "impact" and "problem" and sum the "annualSavingsAmount"
grouped_summary = summary_cost_df.groupby(['impact', 'problem'])['annualSavingsAmount'].sum().reset_index()

# Calculate each row's share of the total savings (in percent).
# This is done on the numeric sums BEFORE they are formatted as text. Parsing
# the formatted "$1,234.00" strings back with .str.replace('$', '') breaks on
# pandas < 2.0, where '$' is treated as a regex end-of-string anchor and is
# never stripped, and it would also compute the share from rounded values.
total_savings = grouped_summary['annualSavingsAmount'].sum()
grouped_summary['TotalSavingsPct'] = (grouped_summary['annualSavingsAmount'] / total_savings) * 100

# Sort the grouped summary by the TotalSavingsPct column in descending order
grouped_summary = grouped_summary.sort_values(by='TotalSavingsPct', ascending=False)

# Format "annualSavingsAmount" as money (display-only; the column becomes text)
grouped_summary['annualSavingsAmount'] = grouped_summary['annualSavingsAmount'].map("${:,.2f}".format)

# Display the updated grouped summary
grouped_summary

### Recommendations per Subscription

In [None]:
# Count recommendations for every (subscription, category) pair, then pivot the
# categories into columns; pairs that never occur are filled with 0.
subscription_category_counts = (
    data.groupby(['subscriptionName', 'category'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# Sum every numeric column per row into TotalRecommendations (appended last),
# then order the subscriptions from most to fewest recommendations.
numeric_columns = subscription_category_counts.select_dtypes(include=[np.number]).columns
subscription_category_counts = (
    subscription_category_counts
    .assign(TotalRecommendations=subscription_category_counts[numeric_columns].sum(axis=1))
    .sort_values(by='TotalRecommendations', ascending=False)
)

# Left disabled from the original cell: attempt to drop a 'category' column
# subscription_category_counts.drop(columns='category', inplace=True, axis=1)

# Show up to the first 100 subscriptions as the cell output
subscription_category_counts.head(100)