In [1]:
import numpy as np
import pandas as pd

In [9]:
# Show the installed pandas version.
pd.__version__

'1.0.3'

### In this tutorial we cover
* Reading data from a file
* Renaming columns
* Reversing the rows of a DataFrame
* Reversing the column order
* `select_dtypes` and filters

#### Read data from a file

In [161]:
# Load the first five data rows of players.csv, keeping only the Name and
# Country columns and treating '.', '?' and '-' as missing values.
d = pd.read_csv(
    "players.csv",
    skiprows=0,
    nrows=5,
    usecols=['Name', 'Country'],
    na_values=['.', '?', '-'],
    # parse_dates=['dob'],
)
d

Unnamed: 0,Name,Country
0,Bradman,Aus
1,Sobers,WI
2,Sangakarra,SL
3,Tendulkar,Ind
4,Lara,WI


In [None]:
# NOTE(review): `players` is only created further down, in the "Select columns
# by data types" section, so this cell raises NameError when the notebook is
# run top to bottom on a fresh kernel.
# index=False leaves the index out of the CSV; for `players` the index holds
# the country labels, so they are not written to the file.
players.to_csv('Cricketers.csv', index=False)

In [38]:
## Create a DataFrame

# Seed NumPy's legacy global RNG so the random values below (and every output
# derived from `df`) are the same on each Restart & Run All.
np.random.seed(42)

# Ten consecutive daily timestamps starting on 1 June 2020.
indx = pd.date_range(start='01-June-2020', periods=10, freq='D')
# 10 x 3 integers drawn from [5, 20); `high` is exclusive.
data = np.random.randint(low=5, high=20, size=(10, 3))
# The column labels deliberately contain spaces; they are renamed later on.
df = pd.DataFrame(data=data, index=indx, columns=['A B', 'C D', 'E F'])

df.head()

Unnamed: 0,A B,C D,E F
2020-06-01,6,16,17
2020-06-02,14,5,12
2020-06-03,19,17,18
2020-06-04,6,11,13
2020-06-05,16,17,10


#### Rename Columns

In [39]:
# Replace the spaces in the column labels with '&'. regex=False forces a
# literal match, so the result does not depend on the default value of
# `regex`, which differs between pandas major versions.
df.columns = df.columns.str.replace(' ', '&', regex=False)

In [40]:
# Confirm that the spaces in the column labels were replaced with '&'.
df.head()

Unnamed: 0,A&B,C&D,E&F
2020-06-01,6,16,17
2020-06-02,14,5,12
2020-06-03,19,17,18
2020-06-04,6,11,13
2020-06-05,16,17,10


#### Add Prefix or Suffix to Column names

In [41]:
# add_prefix returns a new frame with 'Company_' prepended to every column
# label; `df` itself is not modified.
df.add_prefix('Company_').head(3)

Unnamed: 0,Company_A&B,Company_C&D,Company_E&F
2020-06-01,6,16,17
2020-06-02,14,5,12
2020-06-03,19,17,18


In [42]:
# add_suffix returns a new frame with '_Company' appended to every column
# label; `df` itself is not modified.
df.add_suffix('_Company').head(3)

Unnamed: 0,A&B_Company,C&D_Company,E&F_Company
2020-06-01,6,16,17
2020-06-02,14,5,12
2020-06-03,19,17,18


#### Reverse rows of a dataframe

In [43]:
# A [::-1] slice on the row axis reverses the row order, so head(3) shows the
# last three dates first.
df.loc[::-1].head(3)

Unnamed: 0,A&B,C&D,E&F
2020-06-10,8,13,5
2020-06-09,11,14,12
2020-06-08,7,9,17


#### Reverse Columns order

In [44]:
# Keep all rows (:) and reverse the column axis with a [::-1] slice.
df.loc[:, ::-1].head(3)

Unnamed: 0,E&F,C&D,A&B
2020-06-01,17,16,6
2020-06-02,12,5,14
2020-06-03,18,17,19


#### Select columns by data types

In [163]:
# One record per player:
# (country, player, matches, batting average, centuries, runs).
records = [
    ('Aus', 'Bradman', 52, 99.94, 29, 6996),
    ('WI', 'Sobers', 93, 57.78, 26, 8032),
    ('SriLanka', 'Sangakarra', 134, 57.40, 38, 12400),
    ('Ind', 'Tendulkar', 200, 53.78, 51, 15921),
    ('WI', 'Lara', 131, 52.88, 34, 11953),
    ('SA', 'Kallis', 166, 55.37, 45, 13289),
    ('Aus', 'Ponting', 168, 51.85, 41, 13378),
    ('Ind', 'Gavaskar', 125, 51.12, 34, 10122),
    ('Ind', 'Dravid', 164, 52.31, 36, 13288),
]
stat_columns = ['Player', 'Matches', 'Bat_Avg', 'Centuries', 'Runs']

# Transpose the records into column lists; the country becomes the row index.
indx = [rec[0] for rec in records]
data = {
    col: [rec[pos] for rec in records]
    for pos, col in enumerate(stat_columns, start=1)
}

players = pd.DataFrame(data=data, index=indx)
players

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
WI,Sobers,93,57.78,26,8032
SriLanka,Sangakarra,134,57.4,38,12400
Ind,Tendulkar,200,53.78,51,15921
WI,Lara,131,52.88,34,11953
SA,Kallis,166,55.37,45,13289
Aus,Ponting,168,51.85,41,13378
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


In [164]:
# Inspect each column's dtype before selecting columns by type.
players.dtypes

Player        object
Matches        int64
Bat_Avg      float64
Centuries      int64
Runs           int64
dtype: object

In [165]:
# Keep only the numeric columns; here 'number' matches the int64 and float64 columns.
players.select_dtypes(include=['number']).head()

Unnamed: 0,Matches,Bat_Avg,Centuries,Runs
Aus,52,99.94,29,6996
WI,93,57.78,26,8032
SriLanka,134,57.4,38,12400
Ind,200,53.78,51,15921
WI,131,52.88,34,11953


In [167]:
# Drop the numeric columns, leaving only the object (string) column Player.
players.select_dtypes(exclude=['number']).head()

Unnamed: 0,Player
Aus,Bradman
WI,Sobers
SriLanka,Sangakarra
Ind,Tendulkar
WI,Lara


#### Strings to Numbers

In [100]:
# Each row is (name, age, height_cm, weight_kg). Real numbers are deliberately
# mixed with numeric strings and the placeholder strings '-' and '?'.
rows = [
    ('Avi', '16', 145, '?'),
    ('Ravi', 18, 150, 60),
    ('Kavi', 20, '-', 65),
    ('Tavi', 24, '160', '-'),
]
labels = ['name', 'age', 'height_cm', 'weight_kg']

# Transpose the rows into one list per column and build the frame from those.
df = pd.DataFrame({label: [row[i] for row in rows] for i, label in enumerate(labels)})
df

Unnamed: 0,name,age,height_cm,weight_kg
0,Avi,16,145,?
1,Ravi,18,150,60
2,Kavi,20,-,65
3,Tavi,24,160,-


In [101]:
# Every column is reported as object: the numeric-looking columns mix
# strings with ints.
df.dtypes

name         object
age          object
height_cm    object
weight_kg    object
dtype: object

In [102]:
# Cast only `age` to int; this works because its one string value, '16', is a
# valid integer. The result is displayed but not assigned, so `df` is unchanged.
df.astype({'age':'int'})

Unnamed: 0,name,age,height_cm,weight_kg
0,Avi,16,145,?
1,Ravi,18,150,60
2,Kavi,20,-,65
3,Tavi,24,160,-


In [103]:
# df.astype({'age':'int', 'height_cm':'int'})
# The cast above would raise an error because height_cm contains a dash ('-').

# errors='coerce' converts values that cannot be parsed into NaN instead of raising.
pd.to_numeric(df.height_cm, errors='coerce')

0    145.0
1    150.0
2      NaN
3    160.0
Name: height_cm, dtype: float64

In [104]:
# Same conversion for weight_kg: the '?' and '-' placeholders become NaN.
pd.to_numeric(df.weight_kg, errors='coerce')

0     NaN
1    60.0
2    65.0
3     NaN
Name: weight_kg, dtype: float64

In [105]:
# Convert as above, then replace the resulting NaN values with 0.
pd.to_numeric(df.weight_kg, errors='coerce').fillna(0)

0     0.0
1    60.0
2    65.0
3     0.0
Name: weight_kg, dtype: float64

In [112]:
# Make `name` the row index. Reassigning instead of using inplace=True, and
# checking that the column is still present, keeps this cell safe to re-run:
# a second in-place call would raise KeyError because `name` has already been
# moved out of the columns.
if 'name' in df.columns:
    df = df.set_index('name')

In [113]:
# Show the frame now that `name` is the row index.
df

Unnamed: 0_level_0,age,height_cm,weight_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avi,16,145,?
Ravi,18,150,60
Kavi,20,-,65
Tavi,24,160,-


In [115]:
# Apply pd.to_numeric to every column; errors='coerce' is forwarded to it, so
# the '?' and '-' placeholders become NaN. `df` is rebound to the converted frame.
df = df.apply(pd.to_numeric, errors='coerce')
df

Unnamed: 0_level_0,age,height_cm,weight_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avi,16,145.0,
Ravi,18,150.0,60.0
Kavi,20,,65.0
Tavi,24,160.0,


#### Memory usage

In [75]:
# Print the index summary, per-column non-null counts and dtypes, and the
# memory footprint; memory_usage='deep' requests the deep memory calculation.
players.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, Aus to Ind
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Player     9 non-null      object 
 1   Matches    9 non-null      int64  
 2   Bat_Avg    9 non-null      float64
 3   Centuries  9 non-null      int64  
 4   Runs       9 non-null      int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4 KB


#### Filters

In [78]:
# Include: keep the rows whose index label (country) is 'Aus' or 'Ind'.
players[players.index.isin(['Aus','Ind'])]

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
Ind,Tendulkar,200,53.78,51,15921
Aus,Ponting,168,51.85,41,13378
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


In [79]:
# Exclude: ~ inverts the boolean mask, keeping every row except 'Aus' and 'Ind'.
players[~players.index.isin(['Aus','Ind'])]

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
WI,Sobers,93,57.78,26,8032
SriLanka,Sangakarra,134,57.4,38,12400
WI,Lara,131,52.88,34,11953
SA,Kallis,166,55.37,45,13289


#### List players from the top 3 countries

In [87]:
# Count how many players share each index label (country).
players.index.value_counts()

Ind         3
WI          2
Aus         2
SriLanka    1
SA          1
dtype: int64

In [88]:
# Keep the three countries with the highest player counts.
players.index.value_counts().nlargest(3)

Ind    3
WI     2
Aus    2
dtype: int64

In [90]:
# The index of the counts Series holds the country labels; keep just those.
top3_countries = players.index.value_counts().nlargest(3).index
top3_countries

Index(['Ind', 'WI', 'Aus'], dtype='object')

In [92]:
# Keep only the rows whose index label (country) is one of the top three.
players[players.index.isin(top3_countries)]

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
WI,Sobers,93,57.78,26,8032
Ind,Tendulkar,200,53.78,51,15921
WI,Lara,131,52.88,34,11953
Aus,Ponting,168,51.85,41,13378
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


#### Missing Values

In [116]:
# `df` is the numeric frame from the "Strings to Numbers" section; values that
# could not be parsed are NaN.
df

Unnamed: 0_level_0,age,height_cm,weight_kg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avi,16,145.0,
Ravi,18,150.0,60.0
Kavi,20,,65.0
Tavi,24,160.0,


In [117]:
# Number of missing values in each column.
df.isna().sum()

age          0
height_cm    1
weight_kg    2
dtype: int64

In [118]:
# Fraction of missing values in each column (mean of the boolean mask).
df.isna().mean()

age          0.00
height_cm    0.25
weight_kg    0.50
dtype: float64

In [124]:
# Keep only the columns with at least 60% non-missing values (at most 40% missing).
# `thresh` is documented as an integer count of non-NA values, so the
# fractional row count is rounded up explicitly instead of passing a float.
min_non_na = int(np.ceil(len(df) * .60))
df.dropna(axis='columns', thresh=min_non_na)

Unnamed: 0_level_0,age,height_cm
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Avi,16,145.0
Ravi,18,150.0
Kavi,20,
Tavi,24,160.0


#### Split values into multiple columns

In [128]:
# Each entry is (country, "Surname, Firstname", centuries); the country
# becomes the row index.
entries = [
    ('Ind', 'Tendulkar, Sachin', 51),
    ('WI', 'Lara, Brian', 34),
    ('WI', 'Sobers, Garry', 26),
]
df = pd.DataFrame(
    {
        'name': [entry[1] for entry in entries],
        'centuries': [entry[2] for entry in entries],
    },
    index=[entry[0] for entry in entries],
)
df

Unnamed: 0,name,centuries
Ind,"Tendulkar, Sachin",51
WI,"Lara, Brian",34
WI,"Sobers, Garry",26


In [130]:
# Split "Surname, Firstname" at the first comma. The pattern also consumes the
# whitespace after the comma (a bare ',' would leave ' Sachin' with a leading
# space), and n=1 limits the split to two parts so there are never more than
# two columns.
df['name'].str.split(r',\s*', n=1, expand=True)

Unnamed: 0,0,1
Ind,Tendulkar,Sachin
WI,Lara,Brian
WI,Sobers,Garry


In [131]:
# Store the two name parts as new columns. The pattern consumes the whitespace
# after the comma so Firstname has no leading space, and n=1 limits the split
# to two parts so the two-column assignment cannot fail on a name that
# contains more than one comma.
df[['Surname', 'Firstname']] = df['name'].str.split(r',\s*', n=1, expand=True)

In [132]:
# Show the frame with the new Surname and Firstname columns.
df

Unnamed: 0,name,centuries,Surname,Firstname
Ind,"Tendulkar, Sachin",51,Tendulkar,Sachin
WI,"Lara, Brian",34,Lara,Brian
WI,"Sobers, Garry",26,Sobers,Garry


#### Expand a list of values into Columns

In [133]:
# Every cell of column A holds a two-element list.
pairs = [[160, 68], [158, 55], [172, 70]]
df = pd.DataFrame({'A': pairs})
df

Unnamed: 0,A
0,"[160, 68]"
1,"[158, 55]"
2,"[172, 70]"


In [134]:
# Expand each list in column A into its own column. Building the frame
# directly from the Python lists avoids creating one Series per row, which is
# what apply(pd.Series) does and is slow on large frames.
pd.DataFrame(df['A'].tolist(), index=df.index)

Unnamed: 0,0,1
0,160,68
1,158,55
2,172,70


In [142]:
# Expand the lists in column A into separate columns and label them
# Val_0, Val_1, ... Building the frame from the Python lists avoids the
# per-row Series construction of apply(pd.Series).
pd.DataFrame(df['A'].tolist(), index=df.index).add_prefix('Val_')

Unnamed: 0,Val_0,Val_1
0,160,68
1,158,55
2,172,70


In [144]:
# Show the original frame side by side with the expanded Val_* columns.
# The expansion is built from the Python lists instead of apply(pd.Series),
# which creates one Series per row. The result is only displayed, not stored.
pd.concat(
    [df, pd.DataFrame(df['A'].tolist(), index=df.index).add_prefix('Val_')],
    axis='columns',
)

Unnamed: 0,A,Val_0,Val_1
0,"[160, 68]",160,68
1,"[158, 55]",158,55
2,"[172, 70]",172,70


In [None]:
# The concat above is only displayed, never stored, so deleting A on its own
# would leave `df` with no columns at all. Store the expanded Val_* columns
# and drop the original list column in one step; the guard keeps the cell
# safe to re-run once A is gone.
if 'A' in df.columns:
    expanded = pd.DataFrame(df['A'].tolist(), index=df.index).add_prefix('Val_')
    df = pd.concat([df.drop(columns='A'), expanded], axis='columns')
df