In [1]:
import numpy as np

In [2]:
import pandas as pd

### In this tutorial we cover
* Create a DataFrame
* Access rows and columns: .loc[], .iloc[]
* reset_index(), rename(), set_index(), append()
* drop(), sort_values()


### Pandas - DataFrame

A DataFrame is a two-dimensional data structure, i.e., data is aligned in a tabular fashion in rows and columns.


#### Create a DataFrame 

In [4]:
# Batting figures, one list per column; position i in every list
# describes the same player.
data = {
    'Player': ['Bradman', 'Sobers', 'Sangakarra', 'Tendulkar', 'Lara',
               'Kallis', 'Ponting', 'Gavaskar', 'Dravid'],
    'Matches': [52, 93, 134, 200, 131, 166, 168, 125, 164],
    'Bat_Avg': [99.94, 57.78, 57.40, 53.78, 52.88, 55.37, 51.85, 51.12, 52.31],
    'Centuries': [29, 26, 38, 51, 34, 45, 41, 34, 36],
    'Runs': [6996, 8032, 12400, 15921, 11953, 13289, 13378, 10122, 13288],
}

# Row labels are country codes, so they repeat (e.g. three 'Ind' rows).
indx = ['Aus', 'WI', 'SriLanka', 'Ind', 'WI', 'SA', 'Aus', 'Ind', 'Ind']

df = pd.DataFrame(data, index=indx)
df

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
WI,Sobers,93,57.78,26,8032
SriLanka,Sangakarra,134,57.4,38,12400
Ind,Tendulkar,200,53.78,51,15921
WI,Lara,131,52.88,34,11953
SA,Kallis,166,55.37,45,13289
Aus,Ponting,168,51.85,41,13378
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


* Column labels : name of columns
* Row labels : index of rows

#### Column Labels

In [11]:
# Column labels, returned as a pandas Index.
df.columns

Index(['Player', 'Matches', 'Bat_Avg', 'Centuries', 'Runs'], dtype='object')

#### Row Labels

In [12]:
# Row labels; repeated labels such as 'Ind' are kept as-is.
df.index

Index(['Aus', 'WI', 'SriLanka', 'Ind', 'WI', 'SA', 'Aus', 'Ind', 'Ind'], dtype='object')

<b>Shape</b>: No. of Rows and Columns

In [14]:
# Tuple of (number of rows, number of columns).
df.shape

(9, 5)

#### .memory_usage() : 
It returns a Series with the column names as labels and the memory usage in bytes as data values

In [37]:
# One entry per column, plus an 'Index' entry for the row labels; values are in bytes.
df.memory_usage()

Index        392
Player        72
Matches       72
Bat_Avg       72
Centuries     72
Runs          72
dtype: int64

<b>head()</b>: first n records

In [17]:
# Show the first three rows.
df.head(3)

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
WI,Sobers,93,57.78,26,8032
SriLanka,Sangakarra,134,57.4,38,12400


<b>tail()</b> : last n records

In [18]:
# Show the last three rows.
df.tail(3)

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Ponting,168,51.85,41,13378
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


<b> Data types of columns </b>

In [24]:
# dtype of each column; the text column 'Player' is reported as 'object'.
df.dtypes

Player        object
Matches        int64
Bat_Avg      float64
Centuries      int64
Runs           int64
dtype: object

#### describe() : statistics related to columns

In [25]:
# Summary statistics for the numeric columns; the text column 'Player' is not part of the result.
df.describe()

Unnamed: 0,Matches,Bat_Avg,Centuries,Runs
count,9.0,9.0,9.0,9.0
mean,137.0,59.158889,37.111111,11708.777778
std,44.511235,15.474808,7.753136,2835.14615
min,52.0,51.12,26.0,6996.0
25%,125.0,52.31,34.0,10122.0
50%,134.0,53.78,36.0,12400.0
75%,166.0,57.4,41.0,13289.0
max,200.0,99.94,51.0,15921.0


<b> Access individual column using dot notation </b>

In [19]:
# Attribute-style access to the 'Player' column; the result keeps the row labels of df.
df.Player

Aus            Bradman
WI              Sobers
SriLanka    Sangakarra
Ind          Tendulkar
WI                Lara
SA              Kallis
Aus            Ponting
Ind           Gavaskar
Ind             Dravid
Name: Player, dtype: object

#### Access individual column using square brackets

In [20]:
# Bracket access by column name; gives the same result as df.Player.
df['Player']

Aus            Bradman
WI              Sobers
SriLanka    Sangakarra
Ind          Tendulkar
WI                Lara
SA              Kallis
Aus            Ponting
Ind           Gavaskar
Ind             Dravid
Name: Player, dtype: object

#### Access methods: .loc[] and .iloc[]

* .loc[] : access rows and columns by label
* .iloc[] : access rows and columns by integer position

In [26]:
# Select all rows labelled 'Ind' using .loc.
# The label occurs three times in the index, so three rows come back.

df.loc['Ind']

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Ind,Tendulkar,200,53.78,51,15921
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


In [27]:
# Select all rows labelled 'Aus' or 'Ind'.
# Rows come back grouped in the order the labels are listed ('Aus' rows first, then 'Ind').

df.loc[['Aus', 'Ind']]

Unnamed: 0,Player,Matches,Bat_Avg,Centuries,Runs
Aus,Bradman,52,99.94,29,6996
Aus,Ponting,168,51.85,41,13378
Ind,Tendulkar,200,53.78,51,15921
Ind,Gavaskar,125,51.12,34,10122
Ind,Dravid,164,52.31,36,13288


In [29]:
# Select specific columns: player name and number of centuries.
# The ':' in the row position keeps every row.

df.loc[:, ['Player', 'Centuries']]

Unnamed: 0,Player,Centuries
Aus,Bradman,29
WI,Sobers,26
SriLanka,Sangakarra,38
Ind,Tendulkar,51
WI,Lara,34
SA,Kallis,45
Aus,Ponting,41
Ind,Gavaskar,34
Ind,Dravid,36


In [30]:
# Select specific rows and columns in one call: row labels first, column labels second.
# Columns appear in the order they are listed, not in their original order.

df.loc[['Aus', 'Ind'], ['Player', 'Centuries', 'Bat_Avg']]

Unnamed: 0,Player,Centuries,Bat_Avg
Aus,Bradman,29,99.94
Aus,Ponting,41,51.85
Ind,Tendulkar,51,53.78
Ind,Gavaskar,34,51.12
Ind,Dravid,36,52.31


#### Using iloc[]

In [31]:
# Select the first 3 rows and the first 4 columns by position
# (the slice end is exclusive).

df.iloc[:3, :4]

Unnamed: 0,Player,Matches,Bat_Avg,Centuries
Aus,Bradman,52,99.94,29
WI,Sobers,93,57.78,26
SriLanka,Sangakarra,134,57.4,38


#### Extract data from DataFrame as numpy array

#### to_numpy() : Create 2 D array from DataFrame

In [33]:
# to_numpy() returns only the cell values; the resulting array has the same
# (rows, columns) shape as df.
arr = df.to_numpy()
arr.shape

(9, 5)

In [34]:
# The columns mix text, integers and floats, so the array is reported with dtype=object.
arr

array([['Bradman', 52, 99.94, 29, 6996],
       ['Sobers', 93, 57.78, 26, 8032],
       ['Sangakarra', 134, 57.4, 38, 12400],
       ['Tendulkar', 200, 53.78, 51, 15921],
       ['Lara', 131, 52.88, 34, 11953],
       ['Kallis', 166, 55.37, 45, 13289],
       ['Ponting', 168, 51.85, 41, 13378],
       ['Gavaskar', 125, 51.12, 34, 10122],
       ['Dravid', 164, 52.31, 36, 13288]], dtype=object)

#### .values : to create a 2D array from DataFrame

In [35]:
# Produces the same 2D array as df.to_numpy().
# NOTE(review): the pandas documentation recommends to_numpy() over .values in new code — confirm for the pandas version in use.
df.values

array([['Bradman', 52, 99.94, 29, 6996],
       ['Sobers', 93, 57.78, 26, 8032],
       ['Sangakarra', 134, 57.4, 38, 12400],
       ['Tendulkar', 200, 53.78, 51, 15921],
       ['Lara', 131, 52.88, 34, 11953],
       ['Kallis', 166, 55.37, 45, 13289],
       ['Ponting', 168, 51.85, 41, 13378],
       ['Gavaskar', 125, 51.12, 34, 10122],
       ['Dravid', 164, 52.31, 36, 13288]], dtype=object)

#### Copy DataFrame

In [10]:
# Work on a separate copy (deep=True also copies the data), so the edits made to
# `players` in the following cells are not applied to `df`.
players = df.copy(deep=True)

In [11]:
# False means `players` and `df` are two distinct objects.
players is df

False

#### reset_index()

In [12]:
# Without inplace=True this only returns a new DataFrame for display:
# the old row labels move into a column named 'index' and rows are numbered from 0.
players.reset_index().head()

Unnamed: 0,index,Player,Matches,Bat_Avg,Centuries,Runs
0,Aus,Bradman,52,99.94,29,6996
1,WI,Sobers,93,57.78,26,8032
2,SriLanka,Sangakarra,134,57.4,38,12400
3,Ind,Tendulkar,200,53.78,51,15921
4,WI,Lara,131,52.88,34,11953


In [13]:
# inplace=True applies the change to `players` itself instead of returning a new DataFrame.
players.reset_index(inplace=True)

In [14]:
# The former row labels are now in the 'index' column.
players.head()

Unnamed: 0,index,Player,Matches,Bat_Avg,Centuries,Runs
0,Aus,Bradman,52,99.94,29,6996
1,WI,Sobers,93,57.78,26,8032
2,SriLanka,Sangakarra,134,57.4,38,12400
3,Ind,Tendulkar,200,53.78,51,15921
4,WI,Lara,131,52.88,34,11953


#### rename column

In [15]:
# Give the column created by reset_index() a meaningful name; inplace=True changes `players` directly.
players.rename(columns={'index':'Country'}, inplace=True)

In [16]:
# The 'index' column now appears as 'Country'.
players.head()

Unnamed: 0,Country,Player,Matches,Bat_Avg,Centuries,Runs
0,Aus,Bradman,52,99.94,29,6996
1,WI,Sobers,93,57.78,26,8032
2,SriLanka,Sangakarra,134,57.4,38,12400
3,Ind,Tendulkar,200,53.78,51,15921
4,WI,Lara,131,52.88,34,11953


#### set_index()

In [17]:
# Use the player names as row labels; inplace=True changes `players` directly.
players.set_index('Player', inplace=True)

In [18]:
# 'Player' is now the index rather than a regular column.
players.head()

Unnamed: 0_level_0,Country,Matches,Bat_Avg,Centuries,Runs
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bradman,Aus,52,99.94,29,6996
Sobers,WI,93,57.78,26,8032
Sangakarra,SriLanka,134,57.4,38,12400
Tendulkar,Ind,200,53.78,51,15921
Lara,WI,131,52.88,34,11953


#### Data Modification

In [19]:
# Change the country code 'SriLanka' to 'SL'.
# The boolean mask selects the matching rows; 'Country' names the column to overwrite.

players.loc[players['Country'] == 'SriLanka', 'Country'] ='SL'

In [20]:
# Check the change: the label is passed inside a list, so the row is shown as a one-row table.
players.loc[['Sangakarra']]

Unnamed: 0_level_0,Country,Matches,Bat_Avg,Centuries,Runs
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sangakarra,SL,134,57.4,38,12400


#### Data Insert

In [21]:
# Two additional players, labelled by name and using the same columns as `players`.
new_data = pd.DataFrame(
    {
        'Country': ['Pak', 'Aus'],
        'Matches': [124, 156],
        'Bat_Avg': [52.57, 50.56],
        'Centuries': [23, 27],
        'Runs': [8832, 11174],
    },
    index=['Miandad', 'Border'],
)
new_data

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Runs
Miandad,Pak,124,52.57,23,8832
Border,Aus,156,50.56,27,11174


In [22]:
# Add the rows of `new_data` below the existing rows of `players`.
# pd.concat() is used because DataFrame.append() was removed in pandas 2.0
# (calling it there raises AttributeError); the row labels of both frames are kept.
players = pd.concat([players, new_data])

In [23]:
# Display the full table after adding the new rows.
players

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Runs
Bradman,Aus,52,99.94,29,6996
Sobers,WI,93,57.78,26,8032
Sangakarra,SL,134,57.4,38,12400
Tendulkar,Ind,200,53.78,51,15921
Lara,WI,131,52.88,34,11953
Kallis,SA,166,55.37,45,13289
Ponting,Aus,168,51.85,41,13378
Gavaskar,Ind,125,51.12,34,10122
Dravid,Ind,164,52.31,36,13288
Miandad,Pak,124,52.57,23,8832
Border,Aus,156,50.56,27,11174


#### Remove/ drop

In [24]:
# Find players with fewer than 10,000 runs and fewer than 25 centuries

players[(players.Runs < 10000) & (players.Centuries < 25)]

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Runs
Miandad,Pak,124,52.57,23,8832


In [25]:
# Remove the row labelled 'Miandad'; inplace=True changes `players` directly.
players.drop('Miandad', inplace=True)

In [26]:
# 'Miandad' is no longer listed; the remaining rows are unchanged.
players

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Runs
Bradman,Aus,52,99.94,29,6996
Sobers,WI,93,57.78,26,8032
Sangakarra,SL,134,57.4,38,12400
Tendulkar,Ind,200,53.78,51,15921
Lara,WI,131,52.88,34,11953
Kallis,SA,166,55.37,45,13289
Ponting,Aus,168,51.85,41,13378
Gavaskar,Ind,125,51.12,34,10122
Dravid,Ind,164,52.31,36,13288
Border,Aus,156,50.56,27,11174


#### Insert a new column

In [27]:
# Add 'Highest_Score' as the column at position 4 (directly before 'Runs').
# The 10 values are listed in the current row order of `players`, one per player.
players.insert(loc=4, column='Highest_Score', value=np.array([334, 365, 319, 248, 400, 224, 257, 236, 270, 205]))

In [28]:
# Add 'Ducks' at position 6, which places it after 'Runs' as the last column.
# The 10 values are listed in the current row order of `players`, one per player.
players.insert(loc=6, column='Ducks', value=np.array([7, 12, 11, 14, 17, 16, 17, 12, 8, 11]))

In [29]:
# Both new columns are visible: 'Highest_Score' before 'Runs' and 'Ducks' at the end.
players.head()

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs,Ducks
Bradman,Aus,52,99.94,29,334,6996,7
Sobers,WI,93,57.78,26,365,8032,12
Sangakarra,SL,134,57.4,38,319,12400,11
Tendulkar,Ind,200,53.78,51,248,15921,14
Lara,WI,131,52.88,34,400,11953,17


In [36]:
# Write the table to 'players.csv' with the player names as a regular 'Name' column.
# The export works on a derived copy: rename_axis() names the index 'Name',
# reset_index() turns it into the first column, and index=False then skips the
# new 0..n-1 row numbers. `players` itself is left untouched, so it still has
# the player names as row labels, as the outputs of the following cells show.
players.rename_axis('Name').reset_index().to_csv('players.csv', index=False)

### Drop a column

In [102]:
# `del` removes the 'Ducks' column from `players` directly.
del players['Ducks']

In [103]:
# 'Ducks' no longer appears among the columns.
players.head()

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs
Bradman,Aus,52,99.94,29,334,6996
Sobers,WI,93,57.78,26,365,8032
Sangakarra,SL,134,57.4,38,319,12400
Tendulkar,Ind,200,53.78,51,248,15921
Lara,WI,131,52.88,34,400,11953


#### sort_values(): Sort DataFrame

In [105]:
# Top three run scorers: sort by 'Runs' from highest to lowest, then keep the first three rows.
players.sort_values('Runs', ascending=False).head(3)

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs
Tendulkar,Ind,200,53.78,51,248,15921
Ponting,Aus,168,51.85,41,257,13378
Kallis,SA,166,55.37,45,224,13289


In [109]:
# Sort by 'Centuries' first and use 'Runs' to break ties, both from highest to lowest.
players.sort_values(['Centuries', 'Runs'], ascending=False).head(3)

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs
Tendulkar,Ind,200,53.78,51,248,15921
Kallis,SA,166,55.37,45,224,13289
Ponting,Aus,168,51.85,41,257,13378


### Filter Data

In [110]:
# Players with more than 150 matches.
# The comparison yields one True/False per row; only the True rows are kept.

players[players['Matches'] > 150]

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs
Tendulkar,Ind,200,53.78,51,248,15921
Kallis,SA,166,55.37,45,224,13289
Ponting,Aus,168,51.85,41,257,13378
Dravid,Ind,164,52.31,36,270,13288
Border,Aus,156,50.56,27,205,11174


In [111]:
# Players with more than 150 matches and at least 35 centuries.
# Each condition is parenthesised and the two are combined element-wise with '&'.

players.loc[(players['Matches'] > 150) & (players['Centuries'] >= 35)]

Unnamed: 0,Country,Matches,Bat_Avg,Centuries,Highest_Score,Runs
Tendulkar,Ind,200,53.78,51,248,15921
Kallis,SA,166,55.37,45,224,13289
Ponting,Aus,168,51.85,41,257,13378
Dravid,Ind,164,52.31,36,270,13288
