# OK Here We Go
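This notebook loads a sample sales workbook with pandas and walks through basic inspection, column operations, adding fields, cleaning column names, and selecting data with `loc` and `iloc`.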

In [1]:
import pandas as pd
import openpyxl
from pathlib import Path

In [2]:
# Path to the sample workbook: <current working directory>/data/raw/sample_sales.xlsx
input_file = Path.cwd().joinpath('data', 'raw', 'sample_sales.xlsx')

In [3]:
# Load the workbook into a DataFrame, using openpyxl as the Excel engine.
df = pd.read_excel(input_file, engine='openpyxl')

## Basic details

In [4]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [5]:
# And the last five rows.
df.tail(n=5)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
995,ZM-628-88,Viva,2019-09-11,pen,-5,34,-170
996,DQ-810-46,Viva,2019-09-05,pen,17,32,544
997,RA-147-40,Dabfeed,2019-03-24,poster,17,34,578
998,VT-754-54,Photobean,2019-12-30,shirt,15,18,270
999,LS-463-74,Mybuzz,2019-11-12,shirt,24,34,816


In [6]:
# Column labels exactly as they were read from the workbook.
df.columns

Index(['invoice', 'company', 'purchase_date', 'product', 'quantity', 'price',
       'extended amount'],
      dtype='object')

In [7]:
# (number of rows, number of columns)
df.shape

(1000, 7)

In [8]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   product          1000 non-null   object        
 4   quantity         1000 non-null   int64         
 5   price            1000 non-null   int64         
 6   extended amount  1000 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 54.8+ KB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,quantity,price,extended amount
count,1000.0,1000.0,1000.0
mean,22.421,22.816,510.27
std,16.246714,7.537039,426.411667
min,-5.0,10.0,-170.0
25%,9.0,16.0,168.75
50%,22.0,23.0,435.0
75%,36.0,29.0,798.5
max,50.0,35.0,1715.0


## Extracting a column

In [10]:
# Indexing with a single column label returns that column as a Series.
df['invoice']

0      ZN-870-29
1      JQ-501-63
2      FI-165-58
3      XP-005-55
4      NB-917-18
         ...    
995    ZM-628-88
996    DQ-810-46
997    RA-147-40
998    VT-754-54
999    LS-463-74
Name: invoice, Length: 1000, dtype: object

## Operations on columns

In [11]:
# Sum of the quantity column over every row.
df['quantity'].sum()

22421

In [12]:
# Number of distinct values in the product column.
df['product'].nunique()

4

In [13]:
# Number of rows for each product, most frequent first.
df['product'].value_counts()

shirt     271
poster    269
book      234
pen       226
Name: product, dtype: int64

In [14]:
# A list of labels selects several columns at once; mean() is then computed per column.
df[['price', 'quantity']].mean()

price       22.816
quantity    22.421
dtype: float64

## Adding & populating a field

### Constant

In [15]:
# Assigning a scalar to a new column label fills every row with that value.
df['Country'] = 'Australia'

In [16]:
# Confirm the new Country column was added.
df.head(n=5)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,Country
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,Australia
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420,Australia
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161,Australia
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203,Australia
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,Australia


### Calculation

In [17]:
# Fee is a fixed fraction of each row's extended amount; the rate is named so it
# is easy to find and change.
# NOTE: this cell reads the original 'extended amount' label (with a space), so it
# must run before the column-name cleanup further down renames that column.
FEE_RATE = 0.15
df['Fee'] = df['extended amount'] * FEE_RATE

In [18]:
# Confirm the calculated Fee column was added.
df.head(n=5)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount,Country,Fee
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,Australia,48.45
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420,Australia,63.0
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161,Australia,24.15
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203,Australia,30.45
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,Australia,102.6


## Cleaning up column names

In [19]:
# Swap spaces for underscores in every column label (literal match, not a regex).
df.columns = df.columns.str.replace(' ', '_', regex=False)

In [20]:
# Lower-case every column label.
df.columns = df.columns.str.lower()

In [21]:
# Check the cleaned-up labels: lower case, underscores instead of spaces.
df.columns

Index(['invoice', 'company', 'purchase_date', 'product', 'quantity', 'price',
       'extended_amount', 'country', 'fee'],
      dtype='object')

## Targeting cells or sets of cells using `loc`

In [22]:
# Row label 3, all columns: a single row comes back as a Series.
df.loc[3, :]

invoice                      XP-005-55
company                       Skipfire
purchase_date      2019-11-18 00:00:00
product                            pen
quantity                             7
price                               29
extended_amount                    203
country                      Australia
fee                              30.45
Name: 3, dtype: object

In [23]:
# A list of row labels selects exactly those rows; the result is a DataFrame.
df.loc[[0, 2, 4], :]

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended_amount,country,fee
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323,Australia,48.45
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161,Australia,24.15
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684,Australia,102.6


In [24]:
# Every row, one column label: returns the column as a Series.
df.loc[:, 'company']

0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         Viva
996         Viva
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object

In [25]:
# loc slices by label and includes the end label, so 0:2 yields rows 0, 1 and 2.
df.loc[0:2, ['company', 'product']]

Unnamed: 0,company,product
0,Realcube,shirt
1,Zooxo,book
2,Dabtype,poster


Note that `iloc` works like `loc` but selects by integer position rather than by label. It follows Python slice semantics: the end of a range is *not* included in the result (unlike `loc`, where the end label is included).

In [26]:
# iloc slices by position and excludes the end: rows 1-2, columns 3-4.
df.iloc[1:3, 3:5]

Unnamed: 0,product,quantity
1,book,30
2,poster,7


## Preferred way to copy a subset of a DataFrame

In [27]:
# First three rows and first four columns, taken as an explicit copy so that
# sub_df is its own object rather than a selection tied to df.
sub_df = df.iloc[:3, :4].copy()

In [28]:
# Display the copied subset.
sub_df

Unnamed: 0,invoice,company,purchase_date,product
0,ZN-870-29,Realcube,2019-03-05,shirt
1,JQ-501-63,Zooxo,2019-07-09,book
2,FI-165-58,Dabtype,2019-08-12,poster
