In [1]:
from pymongo import MongoClient

In [2]:
# Connection string for the MongoDB server this notebook talks to.
MONGO_URI = "mongodb://mongo:27017/"

# Single client object reused by every cell below.
client = MongoClient(MONGO_URI)

In [3]:
# Ask the server for the names of its databases; the list is shown as the cell output.
client.list_database_names()

['admin', 'config', 'local']

In [7]:
# choose the database to connect to.
# to create a database from the mongo shell, `use <name>` will create the database if it doesn't exist
db = client["example"]

In [13]:
# once we have switched into a database, we can create a collection from the mongo shell with
# $ db.createCollection("students")
# (from pymongo the method is spelled in snake_case)
#db.create_collection("students")

# to list all collections from pymongo
#db.list_collection_names()
students = db["students"]
db.list_collection_names() # collections are not created until we actually add data

[]


In [6]:
# list
# we pass an empty dictionary to indicate that we want all records.
# if we want to count only the students matching a filter, we pass that filter as a dict.
# nr students: students.count_documents({}) 

In [14]:
# insert a new record: any python dict / JSON-like object can be passed as the document
new_student = {"id":1, "name" : "seba", "country" : "cl"}
students.insert_one(new_student)

# to insert several records at once, build a list of dicts and hand it to insert_many:
# data_to_insert = [
# { dict 1},
# { dict 2},
# etc
#]
#students.insert_many(data_to_insert)

<pymongo.results.InsertOneResult at 0x7f58a4057c80>

In [15]:
# now that we have added data, the collection exists and shows up when we list the collections
db.list_collection_names()

['students']

In [10]:
# selection and retrieval
# we use find() to retrieve the records
# students.find_one() # for a single one that matches some criteria
# students.find() # for all records
# for student in students.find():
#  do something with student

# to perform a "select columns", we can pass as 2nd argument to find a dict of 1/0 flags keyed by "column" name
# (apart from "_id", a projection cannot mix inclusions (1) and exclusions (0), so we only list what we want to keep)
# for student in students.find({}, {"_id":0, "name":1}):
#  do something with student. # student var has only name data but no country

# to find a "matching record", specify the matching value as 1st parameter
# for student in students.find({"country":"cl"}, {"_id":0, "name":1}):
#  do something with student. # student belongs to cl country

# we can specify more complex matching patterns as regex (we pass them as dict)
# for student in students.find({"name":{"$regex":"^seb"}}):
#  do something with student. # student's name starts with 'seb'

In [None]:
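# update a record
# result = students.update_one({"id":1}, {"$set":{"country":"CL"}})

# update multiple records at once
# query = {"country":{"$regex":"^MEX"}}
# values = {"$set":{"country":"MX"}} # replace MEX... -> MX ($set stores the value literally, so no regex anchor here)
# result = students.update_many(query, values)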

In [4]:
# pandas section
import pandas as pd

# Location of the retail CSV, relative to this notebook.
ONLINE_RETAIL_CSV = "../datasets/OnlineRetail.csv"

# Load the whole file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(ONLINE_RETAIL_CSV)

In [5]:
# Preview the first rows of the loaded frame to check the columns and their values.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [17]:
# Group the rows by invoice number.
# NOTE(review): groupby on its own only builds a GroupBy object; an aggregation
# (e.g. .size(), .sum()) is needed to get a table back - confirm what this cell is meant to show.
df.groupby(['InvoiceNo'])

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541894,581587,22631,CIRCUS PARADE LUNCH BOX,12,12/9/2011 12:50,1.95,12680.0,France
541895,581587,22556,PLASTERS IN TIN CIRCUS PARADE,12,12/9/2011 12:50,1.65,12680.0,France
541896,581587,22555,PLASTERS IN TIN STRONGMAN,12,12/9/2011 12:50,1.65,12680.0,France
541897,581587,22728,ALARM CLOCK BAKELIKE PINK,4,12/9/2011 12:50,3.75,12680.0,France


In [37]:
# Per-invoice variant, kept for reference (not executed).
# NOTE: .max() is evaluated column by column, so the StockCode it reports is not
# necessarily the one that carries the maximum Quantity.
# df.groupby(['InvoiceNo'])[["StockCode", "Quantity"]].max()

# Total number of items sold per product: one row per StockCode, single "Quantity" column.
aggregated_by_quantity = df.groupby("StockCode")[["Quantity"]].sum()

# Number of distinct products (= rows of the aggregated frame).
n_items = len(aggregated_by_quantity)
print(f"nr different items: {n_items}")
aggregated_by_quantity.head()



nr different items: 4070


Unnamed: 0_level_0,Quantity
StockCode,Unnamed: 1_level_1
10002,1037
10080,495
10120,193
10123C,-13
10123G,-38


In [39]:
# be careful: idxmax doesn't return a positional "index" (a number) but the index *label* of the maximum entry.
# Here the frame is indexed by StockCode, so the value returned is a particular StockCode, not a row position.
aggregated_by_quantity.idxmax()

Quantity    22197
dtype: object