# MySQL via Shell and Python

# Installing and Configuring MySQL Server

In [None]:
# Refresh the apt package index, then install the MySQL server package.
# stdout and stderr are both discarded, so a failed install is silent in this cell.
!apt-get update > /dev/null 2>&1
!apt-get install -y mysql-server > /dev/null 2>&1

In [None]:
# Set the home directory of the `mysql` system account to /var/lib/mysql
# (presumably to avoid a missing-home-directory complaint when the service starts — confirm).
!usermod -d /var/lib/mysql mysql

In [None]:
# Start the MySQL server daemon through its init script.
!service mysql start

 * Starting MySQL database server mysqld
   ...done.


In [None]:
# Verify that the MySQL service is running (prints server version, socket and uptime)
!service mysql status

 * /usr/bin/mysqladmin  Ver 8.0.42-0ubuntu0.22.04.1 for Linux on x86_64 ((Ubuntu))
Copyright (c) 2000, 2025, Oracle and/or its affiliates.

Oracle is a registered trademark of Oracle Corporation and/or its
affiliates. Other names may be trademarks of their respective
owners.

Server version		8.0.42-0ubuntu0.22.04.1
Protocol version	10
Connection		Localhost via UNIX socket
UNIX socket		/var/run/mysqld/mysqld.sock
Uptime:			2 sec

Threads: 2  Questions: 8  Slow queries: 0  Opens: 119  Flush tables: 3  Open tables: 38  Queries per second avg: 4.000


In [None]:
# Create ~/.my.cnf so the MySQL client tools pick up the root credentials automatically
!rm -f ~/.my.cnf /root/.my.cnf                                     # remove any existing client config
!printf '[client]\nuser=root\npassword=pw\n' > ~/.my.cnf            # write the [client] section to the file (printf works in any POSIX shell, unlike echo -e)
!chmod 600 ~/.my.cnf                                               # restrict the file to owner read/write because it stores a password

In [None]:
# Check that the mysql client can connect (-N suppresses the column-name header)
!mysql -N -e "SELECT 1;" || echo "Failed to connect"

+---+
| 1 |
+---+


In [None]:
# Switch root@localhost to the mysql_native_password plugin with password 'pw'
# (the same password written to ~/.my.cnf), presumably so that the Python drivers
# used later can log in with a password — confirm.
# NOTE(review): mysql_native_password is deprecated in recent MySQL releases; verify it is
# still available if the installed server version changes.
!sudo mysql -e "ALTER USER 'root'@'localhost' IDENTIFIED WITH mysql_native_password BY 'pw'; FLUSH PRIVILEGES;"

In [None]:
# List the databases visible to the configured client account
!mysql -e "SHOW DATABASES;"

+--------------------+
| Database           |
+--------------------+
| information_schema |
| mysql              |
| performance_schema |
| sys                |
+--------------------+


# Importing a Sample Database

In [None]:
# Load the sample database by feeding the SQL script to the mysql client.
# mysqlsampledatabase.sql is read from the current working directory, so it must be there first.
!mysql < mysqlsampledatabase.sql

In [None]:
# Confirm the import by listing the databases again
!mysql -e "SHOW DATABASES;"

+--------------------+
| Database           |
+--------------------+
| classicmodels      |
| information_schema |
| mysql              |
| performance_schema |
| sys                |
+--------------------+


# Interacting with Text and CSV Files

In [None]:
# Dump classicmodels to a plain-text SQL file. Credentials come from ~/.my.cnf, as in
# every other client call, which keeps the password off the command line.
!mysqldump classicmodels > text_db.txt
# Rewrite any back-quoted classicmodels identifier in the dump so it refers to text_db
!sed -i 's/`classicmodels`/`text_db`/g' text_db.txt



In [None]:
# Create the target database (IF NOT EXISTS keeps the cell re-runnable) and load the dump.
# Credentials come from ~/.my.cnf, so they are not repeated on the command line.
!mysql -e "CREATE DATABASE IF NOT EXISTS text_db;"
!mysql text_db < text_db.txt



In [None]:
# Confirm that text_db now exists alongside classicmodels
!mysql -e "SHOW DATABASES;"

+--------------------+
| Database           |
+--------------------+
| classicmodels      |
| information_schema |
| mysql              |
| performance_schema |
| sys                |
| text_db            |
+--------------------+


In [None]:
# Install the PyMySQL driver. %pip installs into the running kernel's environment and
# -q keeps the output short without hiding errors.
%pip install -q pymysql

In [None]:
from sqlalchemy import create_engine, text
import pandas as pd
import os

# Connection setup: SQLAlchemy engine on the PyMySQL driver
engine = create_engine("mysql+pymysql://root:pw@localhost/classicmodels")

# Create the 'csv_db' folder on first run; report only when it was actually created
output_dir = 'csv_db'
if not os.path.exists(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created directory: {output_dir}")

try:
    # One connection serves both the table listing and every export
    with engine.connect() as conn:
        tables = [row[0] for row in conn.execute(text("SHOW TABLES;"))]

        # Export each table to '<output_dir>/<table>.csv'
        for table in tables:
            # Back-quote the identifier, doubling any embedded back-quote
            quoted_table = "`" + table.replace("`", "``") + "`"
            table_df = pd.read_sql(text(f"SELECT * FROM {quoted_table}"), conn)
            file_path = os.path.join(output_dir, f"{table}.csv")
            table_df.to_csv(file_path, index=False)
finally:
    # Release pooled connections even if an export fails part-way
    engine.dispose()

print(f"Export complete. All CSV files are in the '{output_dir}' directory.")

Created directory: csv_db
Export complete. All CSV files are in the 'csv_db' directory.


# Inspecting the Data

In [None]:
# Schema name that the shell cells below interpolate via {database}
database = 'classicmodels'

In [None]:
# List the tables of the selected schema ({database} is expanded by IPython before the shell runs)
!mysql -e "USE {database}; SHOW TABLES;"

+-------------------------+
| Tables_in_classicmodels |
+-------------------------+
| customers               |
| employees               |
| offices                 |
| orderdetails            |
| orders                  |
| payments                |
| productlines            |
| products                |
+-------------------------+


In [None]:
# Show the column definitions (name, type, nullability, key, default) of the customers table
!mysql -e "USE {database}; DESCRIBE customers;"

+------------------------+---------------+------+-----+---------+-------+
| Field                  | Type          | Null | Key | Default | Extra |
+------------------------+---------------+------+-----+---------+-------+
| customerNumber         | int           | NO   | PRI | NULL    |       |
| customerName           | varchar(50)   | NO   |     | NULL    |       |
| contactLastName        | varchar(50)   | NO   |     | NULL    |       |
| contactFirstName       | varchar(50)   | NO   |     | NULL    |       |
| phone                  | varchar(50)   | NO   |     | NULL    |       |
| addressLine1           | varchar(50)   | NO   |     | NULL    |       |
| addressLine2           | varchar(50)   | YES  |     | NULL    |       |
| city                   | varchar(50)   | NO   |     | NULL    |       |
| state                  | varchar(50)   | YES  |     | NULL    |       |
| postalCode             | varchar(15)   | YES  |     | NULL    |       |
| country                | varchar(50)

In [None]:
# Show the indexes defined on the customers table
!mysql -e "USE {database}; SHOW INDEXES FROM customers;"

+-----------+------------+------------------------+--------------+------------------------+-----------+-------------+----------+--------+------+------------+---------+---------------+---------+------------+
| Table     | Non_unique | Key_name               | Seq_in_index | Column_name            | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
+-----------+------------+------------------------+--------------+------------------------+-----------+-------------+----------+--------+------+------------+---------+---------------+---------+------------+
| customers |          0 | PRIMARY                |            1 | customerNumber         | A         |         122 |     NULL |   NULL |      | BTREE      |         |               | YES     | NULL       |
| customers |          1 | salesRepEmployeeNumber |            1 | salesRepEmployeeNumber | A         |          16 |     NULL |   NULL | YES  | BTREE      |         |     

In [None]:
# Row count per table of the selected schema, taken from the data dictionary.
# The schema name comes from the `database` variable, like the other inspection cells.
# NOTE: for InnoDB tables, table_rows is an estimate rather than an exact count;
# use SELECT COUNT(*) on the table when an exact figure matters.
query = f"""
SELECT table_name, table_rows
FROM information_schema.tables
WHERE table_schema = '{database}'
"""

!mysql -e "{query}"

+--------------+------------+
| TABLE_NAME   | TABLE_ROWS |
+--------------+------------+
| customers    |        122 |
| employees    |         23 |
| offices      |          7 |
| orderdetails |       2996 |
| orders       |        326 |
| payments     |        273 |
| productlines |          7 |
| products     |        110 |
+--------------+------------+


In [None]:
# Count the columns of each table in the selected schema using information_schema.columns
!mysql -e "SELECT table_name, COUNT(*) AS column_count FROM information_schema.columns WHERE table_schema = '{database}' GROUP BY table_name;"

+--------------+--------------+
| TABLE_NAME   | column_count |
+--------------+--------------+
| customers    |           13 |
| employees    |            8 |
| offices      |            9 |
| orderdetails |            5 |
| orders       |            7 |
| payments     |            4 |
| productlines |            4 |
| products     |            9 |
+--------------+--------------+


# Magic Commands

In [None]:
# Install the MySQL Connector/Python driver into the running kernel's environment
# (-q trims the download log but still shows errors), then load the SQL magic extension.
%pip install -q mysql-connector-python
%load_ext sql

Collecting mysql-connector-python
  Downloading mysql_connector_python-9.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Downloading mysql_connector_python-9.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (33.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.9/33.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mysql-connector-python
Successfully installed mysql-connector-python-9.3.0


In [None]:
# Open the SQL-magic connection to classicmodels through the mysqlconnector driver
%sql mysql+mysqlconnector://root:pw@localhost/classicmodels

In [None]:
# NOTE(review): the style override presumably works around a table-style lookup error in the
# installed ipython-sql / prettytable combination — confirm it is still needed.
%config SqlMagic.style = '_DEPRECATED_DEFAULT'
# Return query results as pandas DataFrames (later cells call .head() on them)
%config SqlMagic.autopandas = True

In [None]:
%%sql

-- List the first five columns of the classicmodels schema together with their types
SELECT TABLE_NAME, COLUMN_NAME, COLUMN_TYPE
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = 'classicmodels'
ORDER BY TABLE_NAME, ORDINAL_POSITION
LIMIT 5;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
5 rows affected.


Unnamed: 0,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE
0,customers,customerNumber,int
1,customers,customerName,varchar(50)
2,customers,contactLastName,varchar(50)
3,customers,contactFirstName,varchar(50)
4,customers,phone,varchar(50)


In [None]:
# Capture the SQL result in a Python variable using the %sql line magic.
# A line magic is a single logical line, so every continued line ends with a backslash.
result = %sql SELECT \
TABLE_NAME, COLUMN_NAME, COLUMN_TYPE \
FROM information_schema.COLUMNS \
WHERE TABLE_SCHEMA = 'classicmodels' \
ORDER BY TABLE_NAME, ORDINAL_POSITION;

# Inspect the top rows; `result` supports .head() because SqlMagic.autopandas is enabled above
result.head()

 * mysql+mysqlconnector://root:***@localhost/classicmodels
59 rows affected.


Unnamed: 0,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE
0,customers,customerNumber,int
1,customers,customerName,varchar(50)
2,customers,contactLastName,varchar(50)
3,customers,contactFirstName,varchar(50)
4,customers,phone,varchar(50)


In [None]:
%%sql result <<
-- capture SQL result in a Python variable
SELECT
TABLE_NAME, COLUMN_NAME, COLUMN_TYPE
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = 'classicmodels'
ORDER BY TABLE_NAME, ORDINAL_POSITION;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
59 rows affected.
Returning data to local variable result


In [None]:
# Inspect the top rows of the DataFrame stored in `result` by the previous cell
result.head()

Unnamed: 0,TABLE_NAME,COLUMN_NAME,COLUMN_TYPE
0,customers,customerNumber,int
1,customers,customerName,varchar(50)
2,customers,contactLastName,varchar(50)
3,customers,contactFirstName,varchar(50)
4,customers,phone,varchar(50)


# Creating a Table

In [None]:
%%sql

CREATE TABLE IF NOT EXISTS simple_table (  -- will not attempt creation if table exists
    first_name VARCHAR(50),
    last_name VARCHAR(50),
    PRIMARY KEY (first_name, last_name)    -- the key is what allows INSERT IGNORE to detect existing rows
);

INSERT IGNORE INTO simple_table (first_name, last_name) -- rows that would duplicate the primary key are skipped
VALUES ('Alice', 'Anderson'),
       ('Bob', 'Browning');

 * mysql+mysqlconnector://root:***@localhost/classicmodels
0 rows affected.
2 rows affected.


In [None]:
%%sql
-- Read back every row of simple_table through the SQL magic
SELECT * FROM simple_table;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
2 rows affected.


Unnamed: 0,first_name,last_name
0,Alice,Anderson
1,Bob,Browning


In [None]:
# Run the same query from the shell client to confirm the table is visible there as well
!mysql -e "USE classicmodels; SELECT * FROM simple_table;"

+------------+-----------+
| first_name | last_name |
+------------+-----------+
| Alice      | Anderson  |
| Bob        | Browning  |
+------------+-----------+


# Constructing Field Lists

In [None]:
# Comma-separated table list: drop the header row (tail -n +2), then join all lines with commas (paste -sd,)
!mysql -e "USE classicmodels; SHOW TABLES;" | tail -n +2 | paste -sd, -

customers,employees,offices,orderdetails,orders,payments,productlines,products,simple_table


In [None]:
# Same list, with sed adding a space after every comma
!mysql -e "USE classicmodels; SHOW TABLES;" | tail -n +2 | paste -sd, - | sed 's/,/, /g'

customers, employees, offices, orderdetails, orders, payments, productlines, products, simple_table


In [None]:
# Table names five per line: xargs groups the names (space separated, newline after each
# group, including the last), then sed turns the spaces into ", ". Unlike setting awk's ORS,
# this leaves no dangling ", " after the final name.
!mysql -e "USE classicmodels; SHOW TABLES;" | tail -n +2 | xargs -n 5 | sed 's/ /, /g'

customers, employees, offices, orderdetails, orders
payments, productlines, products, simple_table, 

In [None]:
import mysql.connector

In [None]:
def list_table_columns(host, user, password, database, table, per_line=5):
  """Print the column names of a MySQL table as comma-separated rows.

  Connects with mysql.connector, runs ``DESCRIBE`` on the table and prints the
  column names, ``per_line`` names per output line.

  Args:
    host: MySQL server host name.
    user: Account name used to connect.
    password: Password for ``user``.
    database: Schema selected for the connection.
    table: Table name, optionally qualified as ``schema.table``.
    per_line: Number of column names printed per line (default 5, must be >= 1).

  Returns:
    None. The names are written to stdout.

  Raises:
    ValueError: If ``per_line`` is 0 (raised by ``range``).
    mysql.connector.Error: If connecting or describing the table fails.
  """
  # Local import keeps the function usable regardless of which cells ran before it
  import mysql.connector

  # Back-quote each identifier part (doubling embedded back-quotes) so reserved
  # words or unusual characters in the name cannot change the statement
  quoted_table = ".".join(
      "`" + part.replace("`", "``") + "`" for part in table.split(".")
  )

  conn = mysql.connector.connect(host=host,
                                 user=user,
                                 password=password,
                                 database=database)
  try:
    cursor = conn.cursor()
    try:
      cursor.execute("DESCRIBE " + quoted_table + ";")
      # The first field of every DESCRIBE row is the column name
      column_names = [row[0] for row in cursor.fetchall()]
    finally:
      cursor.close()
  finally:
    # Always release the connection, even when the query fails
    conn.close()

  # Print the names in groups of `per_line`
  for start in range(0, len(column_names), per_line):
    print(', '.join(column_names[start:start + per_line]))

In [None]:
# !mysql -e "GRANT ALL PRIVILEGES ON classicmodels.* TO 'root'@'localhost'; FLUSH PRIVILEGES;"

In [None]:
# Connection settings reused by the column-listing calls in the following cells.
# NOTE(review): the password is hard-coded here; acceptable only for a throwaway sandbox server.
host = "localhost"
user = "root"
password = "pw"
database = "classicmodels"
table = 'offices'
list_table_columns(host, user, password, database, table)

officeCode, city, phone, addressLine1, addressLine2
state, country, postalCode, territory


In [None]:
# Column list for the customers table
table = 'customers'
list_table_columns(host, user, password, database, table)

customerNumber, customerName, contactLastName, contactFirstName, phone
addressLine1, addressLine2, city, state, postalCode
country, salesRepEmployeeNumber, creditLimit


In [None]:
# Column list for the employees table
table = 'employees'
list_table_columns(host, user, password, database, table)

employeeNumber, lastName, firstName, extension, email
officeCode, reportsTo, jobTitle


In [None]:
# Column list for the payments table
table = 'payments'
list_table_columns(host, user, password, database, table)

customerNumber, checkNumber, paymentDate, amount


# Query Examples

In [None]:
%%sql

-- Summary statistics over every payment amount in the payments table
SELECT
    COUNT(amount) AS count,
    AVG(amount) AS average,
    MIN(amount) AS minimum,
    MAX(amount) AS maximum,
    SUM(amount) AS total
FROM payments;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
1 rows affected.


Unnamed: 0,count,average,minimum,maximum,total
0,273,32431.645531,615.45,120166.58,8853839.23


In [None]:
%%sql

-- Total amount paid by each customer, one row per customerNumber
SELECT
    customerNumber,
    SUM(amount) as amount
 FROM payments
 GROUP BY customerNumber;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
98 rows affected.


Unnamed: 0,customerNumber,amount
0,103,22314.36
1,112,80180.98
2,114,180585.07
3,119,116949.68
4,121,104224.79
...,...,...
93,486,77726.59
94,487,42570.37
95,489,29586.15
96,495,65541.74


In [None]:
%%sql

-- Total paid per customer with the customer name attached, largest totals first
-- The INNER JOIN leaves out customers that have no payments
SELECT
    customers.customerNumber,
    customers.customerName,
    SUM(payments.amount) as amount
FROM customers
INNER JOIN payments ON customers.customerNumber = payments.customerNumber
GROUP BY customers.customerNumber, customers.customerName
ORDER BY amount DESC;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
98 rows affected.


Unnamed: 0,customerNumber,customerName,amount
0,141,Euro+ Shopping Channel,715738.98
1,124,Mini Gifts Distributors Ltd.,584188.24
2,114,"Australian Collectors, Co.",180585.07
3,151,Muscle Machine Inc,177913.95
4,148,"Dragon Souveniers, Ltd.",156251.03
...,...,...,...
93,381,Royale Belge,29217.18
94,473,Frau da Collezione,25358.32
95,103,Atelier graphique,22314.36
96,198,Auto-Moto Classics Inc.,21554.26


In [None]:
%%sql

SELECT
    offices.officeCode,
    COUNT(DISTINCT customers.customerNumber) AS customer_count,
    COUNT(DISTINCT employees.employeeNumber) AS employee_count,
    -- count payments by customerNumber and checkNumber (assumed to identify a payment in this sample schema)
    -- counting distinct amounts would merge separate payments that happen to share the same amount
    COUNT(DISTINCT payments.customerNumber, payments.checkNumber) AS payment_count,
    CONVERT(SUM(payments.amount), SIGNED) AS payment_total
FROM offices
LEFT JOIN employees ON offices.officeCode = employees.officeCode
LEFT JOIN customers ON employees.employeeNumber = customers.salesRepEmployeeNumber
-- the year filter sits in the join condition so customers without 2003 payments still count as customers
LEFT JOIN payments ON customers.customerNumber = payments.customerNumber
                   AND YEAR(payments.paymentDate) = 2003
GROUP BY offices.officeCode
ORDER BY payment_total DESC;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
7 rows affected.


Unnamed: 0,officeCode,customer_count,employee_count,payment_count,payment_total
0,4,29,5,30,969960
1,1,12,6,15,532681
2,7,17,2,16,505385
3,3,15,2,14,391176
4,2,12,2,9,301781
5,6,10,4,11,281986
6,5,5,2,5,267249


In [None]:
# Drop any earlier version of the view so the cell can be re-run
%sql DROP VIEW IF EXISTS payment_view_with_dates;

# The view exposes every payments column plus year, month, week (WEEK mode 0) and
# day-of-week values derived from paymentDate.
# A line magic is a single logical line, hence the trailing backslashes.
%sql CREATE VIEW payment_view_with_dates AS \
SELECT \
    p.*, \
    YEAR(p.paymentDate) AS payment_year, \
    MONTH(p.paymentDate) AS payment_month, \
    WEEK(p.paymentDate, 0) AS payment_week, \
    DAYOFWEEK(p.paymentDate) AS payment_dayofweek \
FROM payments p;

# Read the view back and preview the first rows
result = %sql SELECT * FROM payment_view_with_dates;
result.head()

 * mysql+mysqlconnector://root:***@localhost/classicmodels
0 rows affected.
 * mysql+mysqlconnector://root:***@localhost/classicmodels
0 rows affected.
 * mysql+mysqlconnector://root:***@localhost/classicmodels
273 rows affected.


Unnamed: 0,customerNumber,checkNumber,paymentDate,amount,payment_year,payment_month,payment_week,payment_dayofweek
0,103,HQ336336,2004-10-19,6066.78,2004,10,42,3
1,103,JM555205,2003-06-05,14571.44,2003,6,22,5
2,103,OM314933,2004-12-18,1676.14,2004,12,50,7
3,112,BO864823,2004-12-17,14191.12,2004,12,50,6
4,112,HQ55022,2003-06-06,32641.98,2003,6,22,6


In [None]:
import mysql.connector
import pandas as pd

# Step 1: Manually define one pivot column per known year.
# Each column sums the amounts of that year and contributes 0 for payments of other years.
manual_columns = """
    SUM(CASE WHEN YEAR(paymentDate) = 2003 THEN amount ELSE 0 END) AS `2003`,
    SUM(CASE WHEN YEAR(paymentDate) = 2004 THEN amount ELSE 0 END) AS `2004`,
    SUM(CASE WHEN YEAR(paymentDate) = 2005 THEN amount ELSE 0 END) AS `2005`
"""

# Step 2: Build the final SQL query (one row per customer, one column per year)
sql = f"""
    SELECT customerNumber AS customer, {manual_columns}
    FROM payments
    GROUP BY customer
    ORDER BY customer;
"""

# Step 3: Run the query; try/finally releases the cursor and connection even on failure
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="pw",
    database="classicmodels"
)
try:
    cursor = conn.cursor()
    try:
        cursor.execute(sql)
        # Column names come from the cursor metadata, rows from the result set
        columns = [desc[0] for desc in cursor.description]
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

# Step 4: Create a pandas DataFrame
df = pd.DataFrame(rows, columns=columns)

# Display the result
df.head()

Unnamed: 0,customer,2003,2004,2005
0,103,14571.44,7742.92,0.0
1,112,32641.98,47539.0,0.0
2,114,53429.11,127155.96,0.0
3,119,0.0,67426.01,49523.67
4,121,51710.33,52514.46,0.0


In [None]:
import mysql.connector
import pandas as pd

# Connect to the MySQL database; try/finally below releases cursor and connection on any error
conn = mysql.connector.connect(
    host="localhost",
    user="root",
    password="pw",
    database="classicmodels"
)
try:
    cursor = conn.cursor()
    try:
        # Step 1: Increase group_concat limit (optional but helpful for many years)
        cursor.execute("SET SESSION group_concat_max_len = 100000;")

        # Step 2: Let MySQL generate one "SUM(CASE ...) AS `year`" expression per distinct
        # payment year, concatenated into a single comma-separated string
        cursor.execute("""
            SELECT GROUP_CONCAT(
                DISTINCT CONCAT(
                    'SUM(CASE WHEN YEAR(paymentDate) = ', YEAR(paymentDate),
                    ' THEN amount ELSE 0 END) AS `', YEAR(paymentDate), '`'
                )
                ORDER BY YEAR(paymentDate)
            )
            FROM payments;
        """)
        pivot_columns = cursor.fetchone()[0]

        # GROUP_CONCAT yields NULL (None) when payments is empty; in that case select only
        # the customer column instead of interpolating "None" into the statement
        select_list = "customerNumber AS customer"
        if pivot_columns:
            select_list += f", {pivot_columns}"

        # Step 3: Build and execute the full dynamic SQL
        sql = f"""
            SELECT {select_list}
            FROM payments
            GROUP BY customer
            ORDER BY customer;
        """
        cursor.execute(sql)

        # Step 4: Fetch column names and results
        columns = [desc[0] for desc in cursor.description]
        rows = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

# Step 5: Create a pandas DataFrame
df = pd.DataFrame(rows, columns=columns)

df.head()

Unnamed: 0,customer,2003,2004,2005
0,103,14571.44,7742.92,0.0
1,112,32641.98,47539.0,0.0
2,114,53429.11,127155.96,0.0
3,119,0.0,67426.01,49523.67
4,121,51710.33,52514.46,0.0


In [None]:
import pandas as pd
from sqlalchemy import create_engine, text

# pandas officially supports SQLAlchemy connectables for MySQL; passing a raw
# mysql.connector connection to read_sql triggers a UserWarning, so use an engine here.
engine = create_engine("mysql+mysqlconnector://root:pw@localhost/classicmodels")

# Step 1: Pull the long-format data (one row per payment) from the date-enriched view
query = """
    SELECT customerNumber, payment_year, amount
    FROM payment_view_with_dates;
"""
try:
    with engine.connect() as conn:
        # Rename for readability without mutating a frame in place
        df_raw = pd.read_sql(text(query), conn).rename(
            columns={'customerNumber': 'customer'}
        )
finally:
    # Release pooled connections even if the query fails
    engine.dispose()

# Step 2: Create pivot using pandas (customers as rows, years as columns, summed amounts)
df_pivot = pd.pivot_table(
    df_raw,
    index='customer',
    columns='payment_year',
    values='amount',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Remove the 'payment_year' label that pivot_table leaves on the column axis
df_pivot.columns.name = None

df_pivot.head()

  df_raw = pd.read_sql(query, conn)


Unnamed: 0,customer,2003,2004,2005
0,103,14571.44,7742.92,0.0
1,112,32641.98,47539.0,0.0
2,114,53429.11,127155.96,0.0
3,119,0.0,67426.01,49523.67
4,121,51710.33,52514.46,0.0


In [None]:
%%sql

-- Total payments per customer together with the sales rep of the customer and the office of that rep
-- The INNER JOINs drop customers without a sales rep and customers without payments
SELECT
    o.officeCode AS office,
    e.employeeNumber AS salesRep,
    c.customerNumber,
    c.customerName,
    SUM(p.amount) AS totalPayments
FROM offices o
INNER JOIN employees e ON o.officeCode = e.officeCode
INNER JOIN customers c ON e.employeeNumber = c.salesRepEmployeeNumber
INNER JOIN payments p ON c.customerNumber = p.customerNumber
GROUP BY o.officeCode, e.employeeNumber, c.customerNumber, c.customerName
ORDER BY o.officeCode, e.employeeNumber, c.customerNumber;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
98 rows affected.


Unnamed: 0,office,salesRep,customerNumber,customerName,totalPayments
0,1,1165,124,Mini Gifts Distributors Ltd.,584188.24
1,1,1165,129,Mini Wheels Co.,66710.56
2,1,1165,161,Technics Stores Inc.,104545.22
3,1,1165,321,Corporate Gift Ideas Co.,132340.78
4,1,1165,450,The Sharp Gifts Warehouse,59551.38
...,...,...,...,...,...
93,7,1504,189,"Clover Collections, Co.",49898.27
94,7,1504,259,"Toms Spezialitäten, Ltd",89223.14
95,7,1504,299,"Norway Gifts By Mail, Co.",69059.04
96,7,1504,415,"Bavarian Collectables Imports, Co.",31310.09


In [None]:
%%sql

-- Per-customer payment totals compared against office-level and sales-rep-level totals and ranks
SELECT
    officeCode,
    customerNumber,
    employeeNumber,
    customerName,
    ROUND(SUM(amount), 0) AS amount,
    -- window functions run after grouping, so SUM(amount) OVER (...) adds up the per-customer totals of the partition
    ROUND(SUM(amount) OVER (PARTITION BY officeCode), 0) AS 'total payments for office',
    RANK() OVER (PARTITION BY officeCode ORDER BY SUM(amount) DESC) AS 'rank in office',
    ROUND(SUM(amount) OVER (PARTITION BY employeeNumber), 0) AS 'total payments for employee',
    RANK() OVER (PARTITION BY employeeNumber ORDER BY SUM(amount) DESC) AS 'rank for employee',
    ROUND(SUM(amount) OVER (PARTITION BY customerNumber), 0) AS 'total payments for customer',
    ROUND((SUM(amount) / SUM(amount) OVER (PARTITION BY officeCode)) * 100, 2) AS 'percent of office total',
    ROUND((SUM(amount) / SUM(amount) OVER (PARTITION BY employeeNumber)) * 100, 2) AS 'percent of employee total'
FROM (
    -- inner query yields one row per office, sales rep and customer holding the total payments of that customer
    SELECT
        o.officeCode,
        c.customerNumber,
        e.employeeNumber,
        c.customerName,
        SUM(p.amount) AS amount
    FROM offices o
    INNER JOIN employees e ON o.officeCode = e.officeCode
    INNER JOIN customers c ON e.employeeNumber = c.salesRepEmployeeNumber
    INNER JOIN payments p ON c.customerNumber = p.customerNumber
    GROUP BY o.officeCode, e.employeeNumber, c.customerNumber, c.customerName
) AS aggregated_data

-- the outer grouping uses the same keys as the inner query, so every group holds exactly one row
GROUP BY officeCode, employeeNumber, customerNumber, customerName
ORDER BY officeCode, employeeNumber, customerNumber;

 * mysql+mysqlconnector://root:***@localhost/classicmodels
98 rows affected.


Unnamed: 0,officeCode,customerNumber,employeeNumber,customerName,amount,total payments for office,rank in office,total payments for employee,rank for employee,total payments for customer,percent of office total,percent of employee total
0,1,124,1165,Mini Gifts Distributors Ltd.,584188,1337440,1,989907,1,584188,43.68,59.01
1,1,129,1165,Mini Wheels Co.,66711,1337440,7,989907,4,66711,4.99,6.74
2,1,161,1165,Technics Stores Inc.,104545,1337440,3,989907,3,104545,7.82,10.56
3,1,321,1165,Corporate Gift Ideas Co.,132341,1337440,2,989907,2,132341,9.90,13.37
4,1,450,1165,The Sharp Gifts Warehouse,59551,1337440,8,989907,5,59551,4.45,6.02
...,...,...,...,...,...,...,...,...,...,...,...,...
93,7,189,1504,"Clover Collections, Co.",49898,1324326,14,637673,7,49898,3.77,7.83
94,7,259,1504,"Toms Spezialitäten, Ltd",89223,1324326,7,637673,3,89223,6.74,13.99
95,7,299,1504,"Norway Gifts By Mail, Co.",69059,1324326,12,637673,6,69059,5.21,10.83
96,7,415,1504,"Bavarian Collectables Imports, Co.",31310,1324326,16,637673,9,31310,2.36,4.91


# Integrating Machine Learning

In [None]:
# Embed product names, cluster the embeddings with K-Means, project them to 3-D with UMAP
# and draw an interactive Plotly scatter plot.
# NOTE(review): `np` and `plt` are imported but not referenced in this cell.
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Get data: one row per product, with the product name and product line concatenated
# into the text that gets embedded (line magic, hence the trailing backslashes)
df = %sql SELECT \
    products.productCode, \
    productlines.productLine, \
    products.productName, \
    CONCAT(products.productName, ' ', productlines.productLine) as name_line_concat \
FROM products \
INNER JOIN productlines \
ON products.productLine = productlines.productLine

# Load the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding vector per product and keep it alongside the row
embeddings = model.encode(df['name_line_concat'].tolist())
df['embedding'] = list(embeddings)

# K-Means clustering; the cluster count is capped at the number of rows
n_clusters = min(6, len(df))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)
df['cluster'] = cluster_labels

# UMAP dimensionality reduction to three components (fixed random_state)
umap_reducer = umap.UMAP(n_components=3, random_state=42)
embeddings_3d = umap_reducer.fit_transform(embeddings)

# Plotly 3D scatter with hover labels
# One seaborn "husl" colour per cluster, converted to the 'rgb(r,g,b)' strings Plotly accepts
palette = sns.color_palette("husl", n_clusters)
palette_hex = [f'rgb({int(c[0]*255)},{int(c[1]*255)},{int(c[2]*255)})' for c in palette]

fig = go.Figure()

# One trace per cluster so each cluster gets its own colour and legend entry
for cluster_id in range(n_clusters):
    mask = df['cluster'] == cluster_id
    fig.add_trace(
        go.Scatter3d(
            x=embeddings_3d[mask, 0],
            y=embeddings_3d[mask, 1],
            z=embeddings_3d[mask, 2],
            mode='markers',
            marker=dict(size=8, color=palette_hex[cluster_id], opacity=0.6),
            name=f'Cluster {cluster_id}',
            text=df.loc[mask, 'productName'],  # Hover text
            hoverinfo='text'
        )
    )

fig.update_layout(
    title='3D UMAP Projection of Product Embeddings with K-Means Clusters',
    scene=dict(
        xaxis_title='UMAP Component 1',
        yaxis_title='UMAP Component 2',
        zaxis_title='UMAP Component 3',
        xaxis=dict(showspikes=False),
        yaxis=dict(showspikes=False),
        zaxis=dict(showspikes=False)
    ),
    showlegend=True,
    width=800,
    height=600
)

# Save a standalone HTML copy of the figure, then render it inline
fig.write_html('product_similarity_umap.html')
fig.show()

 * mysql+mysqlconnector://root:***@localhost/classicmodels
110 rows affected.



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
# run the above cell twice if the chart doesn't appear the first time