## Initialise and read the data file

In [1]:
import pandas as pd
import openpyxl
from pathlib import Path
# Build the path to the raw workbook relative to the current working directory.
input_file = Path.cwd().joinpath('data', 'raw', 'sample_sales.xlsx')
# Load the workbook with the openpyxl engine and preview the first rows.
df = pd.read_excel(input_file, engine='openpyxl')
df.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [2]:
# Normalise the column names in one pass: spaces become underscores, then lower-case.
df.columns = [name.replace(' ', '_').lower() for name in df.columns]

In [3]:
# Preview again to confirm the column names are now lower-case with underscores.
df.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


## Dataframe Operations

**Date operations** - the `.dt` accessor exposes attributes such as `month`, `year`, `dayofweek`, `days_in_month` and `is_month_end`

In [4]:
# Number of days in the calendar month of each purchase date (via the .dt accessor).
df['purchase_date'].dt.days_in_month

0      31
1      31
2      31
3      30
4      30
       ..
995    30
996    30
997    31
998    31
999    30
Name: purchase_date, Length: 1000, dtype: int64

Adding this date operation as a new column

In [5]:
# Store the weekday number (Monday=0 ... Sunday=6) as a new column on df.
df['day_of_week']=df['purchase_date'].dt.dayofweek

In [6]:
# Preview the frame to confirm the new day_of_week column is present.
df.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,1
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420,1
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161,0
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203,0
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,3


**String operations** - can do upper, lower, title, len

In [7]:
# Returns a new upper-cased Series; the result is not assigned, so df['product'] is unchanged.
df['product'].str.upper()

0       SHIRT
1        BOOK
2      POSTER
3         PEN
4      POSTER
        ...  
995       PEN
996       PEN
997    POSTER
998     SHIRT
999     SHIRT
Name: product, Length: 1000, dtype: object

**Math operations**

In [8]:
# Element-wise multiplication (e.g. the price after a 10% discount); the result is a
# new float64 Series and the 'price' column itself is not modified.
df['price']*0.9

0      15.3
1      12.6
2      20.7
3      26.1
4      17.1
       ... 
995    30.6
996    28.8
997    30.6
998    16.2
999    30.6
Name: price, Length: 1000, dtype: float64

## Boolean Indexing

Make a Boolean series out of the df

In [9]:
# Boolean mask: True for every row whose company is exactly 'Viva'.
viva = df['company']=='Viva'

In [10]:
# Display the mask: one True/False entry per row of df.
viva

0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996     True
997    False
998    False
999    False
Name: company, Length: 1000, dtype: bool

Then use df.loc to get the df applying that Boolean

In [11]:
# Keep the rows where the mask is True; ':' selects every column.
df.loc[viva,:]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
186,CM-205-86,Viva,2019-05-19,book,8,14,112,6
705,QU-986-45,Viva,2019-06-09,book,6,35,210,6
840,RF-796-61,Viva,2019-02-19,poster,46,16,736,1
995,ZM-628-88,Viva,2019-09-11,pen,-5,34,-170,2
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544,3


Which is the same as

In [12]:
# Shorthand: indexing df directly with a Boolean mask also filters rows.
df[viva]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
186,CM-205-86,Viva,2019-05-19,book,8,14,112,6
705,QU-986-45,Viva,2019-06-09,book,6,35,210,6
840,RF-796-61,Viva,2019-02-19,poster,46,16,736,1
995,ZM-628-88,Viva,2019-09-11,pen,-5,34,-170,2
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544,3


Let's do another Boolean for quantity > 10

In [13]:
# Boolean mask: True for rows whose quantity is strictly greater than 10.
q_10=df['quantity']>10

In [14]:
# Display the quantity mask.
q_10

0       True
1       True
2      False
3      False
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: quantity, Length: 1000, dtype: bool

In [15]:
# Keep only the rows with quantity > 10.
df[q_10]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,1
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420,1
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,3
6,MQ-907-02,Babbleset,2019-10-27,poster,30,21,630,6
7,NX-102-26,Fliptune,2019-10-16,book,40,28,1120,2
...,...,...,...,...,...,...,...,...
994,ON-247-90,Photobug,2019-09-29,shirt,40,31,1240,6
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544,3
997,RA-147-40,Dabfeed,2019-03-24,poster,17,34,578,6
998,VT-754-54,Photobean,2019-12-30,shirt,15,18,270,0


And now let's mix the two, company Viva, quantity > 10

In [16]:
# '&' combines the two masks element-wise, so a row is kept only when both are True.
df[viva & q_10]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
840,RF-796-61,Viva,2019-02-19,poster,46,16,736,1
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544,3


Which can also be done somewhat less elegantly but somewhat more generally as

In [17]:
# Same selection via .loc; the ':' column selector can be swapped for specific columns.
df.loc[viva & q_10,:]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
840,RF-796-61,Viva,2019-02-19,poster,46,16,736,1
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544,3


## Random mind blowing command
Count how many times each product appears among the invoices whose code starts with 'S'

In [18]:
# Keep the rows whose invoice code starts with 'S', then tally their 'product' values.
# na=False treats a missing invoice as "no match"; without it the mask would contain
# NaN for such rows and Boolean indexing can raise instead of filtering.
df.loc[df['invoice'].str.startswith('S', na=False), 'product'].value_counts()

pen       10
shirt     10
poster     7
book       6
Name: product, dtype: int64

## Dates

Purchased on or after 1st December 2019

In [19]:
# Rows dated on or after 1 December 2019. The ISO 8601 literal is unambiguous,
# whereas '12-1-2019' can be read as either 12 January or 1 December.
df[df['purchase_date'] >= '2019-12-01']

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
32,MX-599-18,Vimbo,2019-12-12,book,36,33,1188,3
33,NN-163-23,Skyble,2019-12-27,pen,45,12,540,4
34,FP-453-25,Zava,2019-12-05,poster,0,13,0,3
54,SI-328-34,Babbleblab,2019-12-13,shirt,3,34,102,4
57,JF-213-00,Centimia,2019-12-22,pen,6,17,102,6
...,...,...,...,...,...,...,...,...
951,HY-069-99,Oba,2019-12-12,poster,18,25,450,3
956,WO-091-32,Eire,2019-12-05,shirt,15,20,300,3
966,FO-483-27,Cogibox,2019-12-10,poster,25,15,375,1
985,DX-716-83,Aimbo,2019-12-17,pen,38,15,570,1


Books purchased in November

In [20]:
# Build one mask per condition, then select the rows that satisfy both.
november = df['purchase_date'].dt.month.eq(11)
book = df['product'].eq('book')
df.loc[november & book]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
16,IG-811-84,Yadel,2019-11-15,book,46,28,1288,4
51,MM-603-72,Snaptags,2019-11-14,book,13,25,325,3
67,DB-217-08,Dynabox,2019-11-17,book,17,26,442,6
77,TL-625-84,Snaptags,2019-11-02,book,28,33,924,5
148,UE-108-04,Yodo,2019-11-24,book,29,27,783,6
194,CP-917-92,Kamba,2019-11-13,book,41,16,656,2
248,LZ-428-00,Photojam,2019-11-22,book,0,25,0,4
303,PE-226-53,Gabvine,2019-11-22,book,19,18,342,4
338,EP-683-72,Npath,2019-11-22,book,1,15,15,4
528,QP-747-38,Ainyx,2019-11-19,book,9,31,279,1


In [21]:
# Boolean mask for small orders: quantity below 5, which also matches zero and
# negative quantities.
small_orders = df['quantity']<5

In [22]:
# Rows flagged by the small-orders mask.
df[small_orders]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
5,MI-696-11,Zooveo,2019-10-17,pen,-1,30,-30,3
8,LE-516-00,Buzzbean,2019-06-17,poster,-3,16,-48,0
19,JU-472-72,Babbleblab,2019-10-23,shirt,-1,13,-13,2
24,BB-457-52,Mynte,2019-11-25,pen,3,25,75,0
27,BS-100-95,DabZ,2019-01-23,pen,2,26,52,2
...,...,...,...,...,...,...,...,...
980,NB-919-33,Roomm,2019-09-23,pen,-1,33,-33,0
981,HN-685-51,Dabfeed,2019-03-25,poster,0,13,0,0
984,ZS-595-22,Reallinks,2019-06-24,pen,2,16,32,0
986,CX-078-12,Jamia,2019-11-06,shirt,-3,15,-45,2


Variation: using df.loc and limiting the columns

In [23]:
# .loc label slices include both endpoints, so this keeps 'company' through 'product'.
df.loc[small_orders, 'company':'product']

Unnamed: 0,company,purchase_date,product
5,Zooveo,2019-10-17,pen
8,Buzzbean,2019-06-17,poster
19,Babbleblab,2019-10-23,shirt
24,Mynte,2019-11-25,pen
27,DabZ,2019-01-23,pen
...,...,...,...
980,Roomm,2019-09-23,pen
981,Dabfeed,2019-03-25,poster
984,Reallinks,2019-06-24,pen
986,Jamia,2019-11-06,shirt


Same as before, but instead of using Boolean indexing, using df.query

In [24]:
# Same filter written as a query string; 'quantity' refers to the column of that name.
df.query('quantity < 5')

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,day_of_week
5,MI-696-11,Zooveo,2019-10-17,pen,-1,30,-30,3
8,LE-516-00,Buzzbean,2019-06-17,poster,-3,16,-48,0
19,JU-472-72,Babbleblab,2019-10-23,shirt,-1,13,-13,2
24,BB-457-52,Mynte,2019-11-25,pen,3,25,75,0
27,BS-100-95,DabZ,2019-01-23,pen,2,26,52,2
...,...,...,...,...,...,...,...,...
980,NB-919-33,Roomm,2019-09-23,pen,-1,33,-33,0
981,HN-685-51,Dabfeed,2019-03-25,poster,0,13,0,0
984,ZS-595-22,Reallinks,2019-06-24,pen,2,16,32,0
986,CX-078-12,Jamia,2019-11-06,shirt,-3,15,-45,2
