# Business Analyst Portfolio Project
This notebook performs exploratory data analysis (EDA) and builds predictive models using a synthetic sales dataset.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Apply seaborn's 'whitegrid' theme to every figure drawn later in the notebook.
# set_theme() is the preferred entry point; sns.set() is only an alias for it
# that the seaborn documentation says may be removed in a future release.
sns.set_theme(style='whitegrid')


In [None]:

# Read the synthetic sales CSV into a DataFrame.
# OrderDate is converted to datetimes while the file is being parsed.
file_path = 'synthetic_sales_data.csv'
data = pd.read_csv(
    file_path,
    parse_dates=['OrderDate'],
)

# Preview the top of the frame as this cell's rich output
data.head()


In [None]:

# Display summary statistics
# include='all' makes describe() report on every column, not only the numeric
# ones; statistics that do not apply to a column's dtype come back as NaN.
summary = data.describe(include='all')
# Bare last expression so the notebook renders the summary table.
summary


In [None]:

# Bar chart of mean Profit per Region, with bars ordered from the highest
# average to the lowest.
fig, ax = plt.subplots(figsize=(8, 5))
avg_profit_region = (
    data.groupby('Region')['Profit']
    .mean()
    .sort_values(ascending=False)
)
sns.barplot(x=avg_profit_region.index, y=avg_profit_region.values, ax=ax)
ax.set(
    title='Average Profit by Region',
    xlabel='Region',
    ylabel='Average Profit',
)
fig.tight_layout()
plt.show()


In [None]:

# Histogram of UnitsSold in 20 bins; kde=True overlays a kernel density
# estimate on top of the bars.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data['UnitsSold'], bins=20, kde=True, ax=ax)
ax.set(
    title='Distribution of Units Sold',
    xlabel='Units Sold',
    ylabel='Frequency',
)
fig.tight_layout()
plt.show()


In [None]:

# One point per row of `data`: MarketingSpend on the x-axis against Profit
# on the y-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=data, x='MarketingSpend', y='Profit', ax=ax)
ax.set(
    title='Marketing Spend vs Profit',
    xlabel='Marketing Spend',
    ylabel='Profit',
)
fig.tight_layout()
plt.show()


In [None]:

# Build the modelling inputs: Profit is the target, predicted from three
# categorical and four numeric columns of `data`.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Feature groups; each group gets its own preprocessing below
categorical_cols = ['Region', 'CustomerSegment', 'ProductCategory']
numerical_cols = ['UnitCost', 'UnitPrice', 'UnitsSold', 'MarketingSpend']

# Feature matrix (categorical columns first, then numeric) and target vector
X = data[categorical_cols + numerical_cols]
y = data['Profit']

# Categorical columns are one-hot encoded with the first level of each feature
# dropped; categories unseen during fit are ignored rather than raising.
# Numeric columns are forwarded to the model unchanged.
category_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('cat', category_encoder, categorical_cols),
    ('num', 'passthrough', numerical_cols),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Linear regression: preprocessing and estimator are chained in a single
# Pipeline, and the whole chain is fitted on the training split.
linreg_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]).fit(X_train, y_train)

# Score the held-out rows
y_pred_lr = linreg_model.predict(X_test)
mse_lr, r2_lr = (
    mean_squared_error(y_test, y_pred_lr),
    r2_score(y_test, y_pred_lr),
)

# Report both metrics rounded to two decimals
print(f"Linear Regression MSE: {mse_lr:.2f}")
print(f"Linear Regression R^2: {r2_lr:.2f}")


In [None]:

# Random forest: 200 trees with a fixed seed, behind the `preprocessor`
# column transformer, fitted on the training split.
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42)),
]).fit(X_train, y_train)

# Score the held-out rows
y_pred_rf = rf_model.predict(X_test)
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)

# Report both metrics rounded to two decimals
print(f"Random Forest MSE: {mse_rf:.2f}")
print(f"Random Forest R^2: {r2_rf:.2f}")


In [None]:

# Conclusion
# The message spans several physical lines, so it must be a triple-quoted
# string: a plain "..." literal cannot contain a raw line break, which made
# this cell fail with a SyntaxError before anything was printed.
# The text begins and ends with a newline so it prints with a blank line
# above and below it.
conclusion_text = """
The synthetic dataset provides a snapshot of sales-related metrics. Through exploratory analysis, we observe patterns such as average profits across regions, distribution of units sold, and the relationship between marketing spend and profit. The predictive models demonstrate that both linear regression and random forest can estimate profits based on the available features.
"""
print(conclusion_text)
