In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# Train an XGBoost regressor that maps (year, quarter, one-hot product category)
# to total sales, using only the three most frequent product categories.
# `spark` is not defined in this cell; presumably it is the session object
# supplied by the notebook runtime -- confirm before running elsewhere.

# Read data
lakehouse_path = "Files/Bronze/FabPredictRetail_Final.csv"
df = spark.read.csv(lakehouse_path, header=True, inferSchema=True).toPandas()

# Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Extract year and quarter from Date
df['year'] = df['Date'].dt.year
df['quarter'] = df['Date'].dt.quarter

# Filter top 3 categories. "Top" here means most rows (value_counts), not
# highest revenue.
top_categories = df['Product_Category'].value_counts().head(3).index
# .copy() detaches the filtered frame from the original so that the column
# assignment below writes to an independent frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
df = df[df['Product_Category'].isin(top_categories)].copy()

# Calculate sales
df['sales'] = df['Quantity'] * df['Price']

# Aggregate sales by quarter and category: one row per (year, quarter, category)
df = df.groupby(['year', 'quarter', 'Product_Category']).agg({'sales': 'sum'}).reset_index()

# Prepare X (features: year, quarter, category) and y (target: sales)
X = df[['year', 'quarter', 'Product_Category']]
y = df['sales']

# Encode categorical variables (one indicator column per category)
X = pd.get_dummies(X, columns=['Product_Category'])

# Ensure the columns are sorted so the feature order is deterministic
X = X.reindex(sorted(X.columns), axis=1)

# Split data into train and test sets.
# NOTE: X_test / y_test are held out but are not used by any code shown here,
# so the fitted model is never evaluated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

# Initialize XGBoost regressor (library defaults)
model = XGBRegressor()

# Fit the model
model.fit(X_train, y_train)

import numpy as np

# Forecast 2024 sales for every top category and quarter with the fitted model.

# Feature grid for 2024, category-major: all four quarters of the first
# category, then the second, and so on. The slicing further down relies on
# exactly this ordering.
future_data = [
    {'year': 2024, 'quarter': quarter, 'Product_Category': category}
    for category in top_categories
    for quarter in range(1, 5)  # Q1 to Q4
]

# One-hot encode the category and align the columns (names and order) with
# the training matrix; any indicator column missing here is filled with 0.
future_df = (
    pd.get_dummies(pd.DataFrame(future_data), columns=['Product_Category'])
    .reindex(columns=X_train.columns, fill_value=0)
)

# Make predictions for 2024
predictions = model.predict(future_df)

# Organize predictions by category: each category owns a run of four
# consecutive predictions (one per quarter).
results = {
    category: predictions[i * 4:(i + 1) * 4]
    for i, category in enumerate(top_categories)
}

# Print results: a header line, one line per quarter, then a blank line.
quarter_labels = ['Q1', 'Q2', 'Q3', 'Q4']
for category, preds in results.items():
    report_lines = [f"Predicted sales for {category} in 2024:"]
    report_lines.extend(
        f"{quarter}: {pred}" for quarter, pred in zip(quarter_labels, preds)
    )
    print("\n".join(report_lines), end="\n\n")

In [None]:
# Pearson correlation between Age and Quantity, computed separately per gender.
# `spark` is not defined in this cell; presumably it is the session object
# supplied by the notebook runtime -- confirm before running elsewhere.

# pearsonr was called below without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
from scipy.stats import pearsonr

# Read data
lakehouse_path = "Files/Bronze/FabPredictRetail_Final.csv"
df = spark.read.csv(lakehouse_path, header=True, inferSchema=True).toPandas()

# Calculate correlation coefficients for Age and Quantity by Gender
genders = ['Male', 'Female', 'Other']
correlations = {}

for gender in genders:
    gender_data = df[df['Gender'] == gender]
    # pearsonr does not skip missing values, so keep only complete
    # (Age, Quantity) pairs.
    complete_pairs = gender_data[['Age', 'Quantity']].dropna()
    # pearsonr needs at least two observations; with fewer (including no
    # rows at all for this gender) record None instead of raising.
    if len(complete_pairs) >= 2:
        corr, _ = pearsonr(complete_pairs['Age'], complete_pairs['Quantity'])
        correlations[gender] = corr
    else:
        correlations[gender] = None  # Handle cases with no usable data for a gender

for gender, corr in correlations.items():
    print(f"Correlation coefficient for {gender.lower()}s: {corr}")