%md
<div class="alert alert-block alert-info">
<center> <h1> Customer Segmentation and Sales Forecast</h1> </center> <br>
<center> Big Data Analytics 2025</center><br>
<center> NOVA IMS MDSAA</center>

### [NOTE]
In this project, we are going to work on 3 notebooks:
- 1. **Data Preprocessing**: For EDA, Data Preprocessing, Creating DataFrames, and Feature Engineering
- 2. **Clustering**: For clustering 
- 3. **Sales Forecasting**: For Sales Forecast. <br>
##### **This notebook is 2. Clustering.**

%md
# Group 77

|   | Student Name          |  Student ID | 
|---|-----------------------|    ---      |
| 1 | Hassan Bhatti       |  20241023 |
| 2 | Moeko Mitani          |   20240670  | 
| 3 | Oumayma Ben Hfaiedh   |   20240699  | 
| 4 | Ricardo Pereira      |  20240745  | 

# 1. Data Integration 

## Import Libraries

In [0]:
# ─────────────────────────────────────────────
# Spark Core
# ─────────────────────────────────────────────
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import (
    col, lit, to_timestamp, to_date, year, month, dayofmonth,
    count, countDistinct, sum, avg, min, max,
    round, when, datediff, current_date, concat_ws,
    monotonically_increasing_id
)
from pyspark.sql.functions import max as spark_max

# ─────────────────────────────────────────────
# Spark MLlib
# ─────────────────────────────────────────────
from pyspark.ml.feature import (
    VectorAssembler, PCA, StringIndexer, StandardScaler,
    MinMaxScaler
)
from pyspark.ml.clustering import KMeans
from pyspark.ml.stat import Correlation
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline

# ─────────────────────────────────────────────
# Python Built-ins and Data Science Libraries
# ─────────────────────────────────────────────
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
from sklearn.metrics import (
    silhouette_samples, silhouette_score,
    confusion_matrix
)

# ─────────────────────────────────────────────
# Utilities
# ─────────────────────────────────────────────
from itertools import combinations




In [0]:
# Start Spark session
spark = SparkSession.builder.appName("Project_Group77").getOrCreate()

## Import CSV File (from previous Notebook: 1. Data Preprocessing)

In [0]:
# File location and type
file_location = "/FileStore/df_final.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_final = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

df_final.limit(10).display()
df_final.schema


Customer ID,num_invoices,num_products,total_quantity,total_price,avg_unit_price,first_purchase_date,last_purchase_date,purchase_span_days,avg_quantity_per_invoice,recency_days,2022-12,2023-1,2023-10,2023-11,2023-12,2023-2,2023-3,2023-4,2023-5,2023-6,2023-7,2023-8,2023-9,2024-1,2024-10,2024-11,2024-12,2024-2,2024-3,2024-4,2024-5,2024-6,2024-7,2024-8,2024-9,prediction
13285,6,182,2457,3364.59,2.33,2023-03-25,2024-11-16,602,409.5,23,0,0,0,0,0,0,21,23,0,0,0,0,0,0,0,50,0,52,0,30,0,0,55,0,0,1
14570,3,64,431,613.75,3.29,2023-09-22,2024-03-04,164,143.67,280,0,0,0,0,0,0,0,0,0,0,0,0,39,0,0,0,0,0,29,0,0,0,0,0,0,2
15846,1,27,79,107.01,1.82,2023-11-19,2023-11-19,0,79.0,386,0,0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
25462,1,158,278,1281.03,6.11,2024-05-24,2024-05-24,0,278.0,199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,158,0,0,0,0,1
17420,8,43,444,943.68,3.41,2023-11-03,2024-10-20,352,55.5,50,0,0,0,22,7,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,14,0,0,0,1
24171,1,13,22,43.25,2.49,2024-03-18,2024-03-18,0,22.0,266,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,2
16386,3,127,700,1068.16,2.05,2023-11-17,2024-11-11,360,233.33,28,0,0,0,58,0,0,0,0,0,0,0,0,0,0,0,77,0,0,4,0,0,0,0,0,0,1
18024,3,21,148,236.78,2.66,2024-07-10,2024-07-10,0,49.33,152,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,0,0,1
15727,15,426,5908,9371.71,3.57,2023-01-24,2024-11-23,669,393.87,16,0,99,0,50,66,0,0,0,87,13,116,19,0,32,0,39,0,0,0,46,14,0,30,74,0,1
16339,1,17,21,94.05,4.96,2024-02-28,2024-02-28,0,21.0,285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,2


Out[9]: StructType([StructField('Customer ID', IntegerType(), True), StructField('num_invoices', IntegerType(), True), StructField('num_products', IntegerType(), True), StructField('total_quantity', IntegerType(), True), StructField('total_price', DoubleType(), True), StructField('avg_unit_price', DoubleType(), True), StructField('first_purchase_date', DateType(), True), StructField('last_purchase_date', DateType(), True), StructField('purchase_span_days', IntegerType(), True), StructField('avg_quantity_per_invoice', DoubleType(), True), StructField('recency_days', IntegerType(), True), StructField('2022-12', IntegerType(), True), StructField('2023-1', IntegerType(), True), StructField('2023-10', IntegerType(), True), StructField('2023-11', IntegerType(), True), StructField('2023-12', IntegerType(), True), StructField('2023-2', IntegerType(), True), StructField('2023-3', IntegerType(), True), StructField('2023-4', IntegerType(), True), StructField('2023-5', IntegerType(), True), Struct

In [0]:
# In Databricks
df_graph = df_final.select("Customer ID", "prediction")

df_graph.limit(10).display()
df_graph.schema

Customer ID,prediction
13285,1
14570,2
15846,2
25462,1
17420,1
24171,2
16386,1
18024,1
15727,1
16339,2


Out[11]: StructType([StructField('Customer ID', IntegerType(), True), StructField('prediction', IntegerType(), True)])

In [0]:
# Save to DBFS
df_graph.toPandas().to_csv('/tmp/customer_clusters.csv', index=False)

In [0]:
~/Neo4j/your-project-name/import/customer_clusters.csv

[0;36m  File [0;32m<command-3617033263597512>:1[0;36m[0m
[0;31m    ~/Neo4j/your-project-name/import/customer_clusters.csv[0m
[0m     ^[0m
[0;31mSyntaxError[0m[0;31m:[0m invalid syntax
