# Pandas

#### Installing Pandas using pip

In [3]:
# ! mark runs the command in terminal.
# we are installing pandas library here by using pip.
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# import 
import pandas as pd

#### Creating a DataFrame

In [2]:
# dataframe with no columns and no rows
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [3]:
# dataframe with columns
df = pd.DataFrame(columns=['Col1', 'Col2', 'Col3'])
print(df)

Empty DataFrame
Columns: [Col1, Col2, Col3]
Index: []


In [5]:
# build a dataframe from a dict: every key becomes a column, every list its values.
data = dict(
    Col1=[1, 2, 3],
    Col2=['Raj', 'Sameera', 'Ketan'],
    Col3=[20, 25, 22],
)
df = pd.DataFrame(data)
print(df)

   Col1     Col2  Col3
0     1      Raj    20
1     2  Sameera    25
2     3    Ketan    22


In [6]:
type(df)

pandas.core.frame.DataFrame

In [7]:
df['Col1']

0    1
1    2
2    3
Name: Col1, dtype: int64

In [8]:
type(df['Col1'])

pandas.core.series.Series

#### Datatypes in pandas

In [9]:
df.dtypes

Col1     int64
Col2    object
Col3     int64
dtype: object

#### Modifying data

In [10]:
# when the right side is a static value, it applies to all rows.
df['Newcol'] = 1
df.head()

Unnamed: 0,Col1,Col2,Col3,Newcol
0,1,Raj,20,1
1,2,Sameera,25,1
2,3,Ketan,22,1


In [11]:
# when the right side is not a static value, make sure that the shapes match:
# the length of the list on the right must be the same as the number of rows.
df['OneMoreCol'] = [1, 2, 3]
df.head()

Unnamed: 0,Col1,Col2,Col3,Newcol,OneMoreCol
0,1,Raj,20,1,1
1,2,Sameera,25,1,2
2,3,Ketan,22,1,3


In [12]:
# what happens when the lengths are not the same?
# pandas rejects the assignment with a ValueError. The error is caught and printed
# so that this demonstration does not abort "Restart & Run All".
try:
    df['OneMoreCol'] = [1, 2, 3, 4]
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Length of values (4) does not match length of index (3)

#### Check size of dataframe

In [13]:
df.shape
# the first value is the number of rows
# the second value is the number of columns

(3, 5)

#### Reading a csv file

In [14]:
# accepts the path of a csv file and returns its content as a pandas dataframe
def read_file_frm_source(pth, **read_csv_kwargs):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    pth : str, path-like or file-like
        Location of the CSV data; handed to ``pd.read_csv`` unchanged.
    **read_csv_kwargs
        Optional keyword arguments forwarded to ``pd.read_csv``
        (for example ``sep=';'``). When none are given the call is
        exactly ``pd.read_csv(pth)``, as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV source.
    """
    return pd.read_csv(pth, **read_csv_kwargs)

In [15]:
# note: inside a normal Python string "\" is an escape character, so Windows-style
# paths must be written with "/" or "\\" (or as a raw string such as r"data\ibm_hr.csv").
ibm_hr = read_file_frm_source("data/ibm_hr.csv")

In [16]:
type(ibm_hr)

pandas.core.frame.DataFrame

In [17]:
# shape of dataframe
# shape shows how many records and columns are present in dataframe.
ibm_hr.shape

(1470, 35)

In [18]:
# dataframe top 5 records
ibm_hr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [19]:
# index of the dataframe:
# it holds the label (name) of every record / row.
ibm_hr.index

RangeIndex(start=0, stop=1470, step=1)

In [28]:
# the RangeIndex(start, stop, step) shown above counts like Python's range(): stop is excluded.
list(range(0, 10, 1))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [20]:
# index need not be just continuous numbers.
# a dedicated, seeded generator draws the labels (1..10000, repeats allowed),
# so the same labels are produced on every run of the notebook.
import random
ibm_hr.index = random.Random(42).choices(range(1, 10001), k=ibm_hr.shape[0])
ibm_hr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1049,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
6368,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
1961,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
522,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
9305,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [24]:
# any column can serve as the index, including a text column
# whose values are category labels that repeat across rows.
# (set_index drops the column from the frame by default.)
ibm_hr2 = ibm_hr.set_index('EducationField')
ibm_hr2.head()

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,41,Yes,Travel_Rarely,1102,Sales,1,2,1,1,2,...,1,80,0,8,0,1,6,4,0,5
Life Sciences,49,No,Travel_Frequently,279,Research & Development,8,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Other,37,Yes,Travel_Rarely,1373,Research & Development,2,2,1,4,4,...,2,80,0,7,3,3,0,0,0,0
Life Sciences,33,No,Travel_Frequently,1392,Research & Development,3,4,1,5,4,...,3,80,0,8,3,3,8,7,3,0
Medical,27,No,Travel_Rarely,591,Research & Development,2,1,1,7,1,...,4,80,1,6,3,3,2,2,2,2


In [26]:
# name of column
ibm_hr.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [27]:
# show the datatypes of each column within dataframe
ibm_hr.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

#### Subsetting of data

In [28]:
# usage of .loc to subset matching rows.
ibm_hr2.loc['Life Sciences']

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,41,Yes,Travel_Rarely,1102,Sales,1,2,1,1,2,...,1,80,0,8,0,1,6,4,0,5
Life Sciences,49,No,Travel_Frequently,279,Research & Development,8,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Life Sciences,33,No,Travel_Frequently,1392,Research & Development,3,4,1,5,4,...,3,80,0,8,3,3,8,7,3,0
Life Sciences,32,No,Travel_Frequently,1005,Research & Development,2,2,1,8,4,...,3,80,0,8,2,2,7,7,3,6
Life Sciences,30,No,Travel_Rarely,1358,Research & Development,24,1,1,11,4,...,2,80,1,1,2,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Life Sciences,45,No,Travel_Rarely,374,Sales,20,3,1,2046,4,...,3,80,0,8,3,3,5,3,0,1
Life Sciences,40,No,Travel_Rarely,1322,Research & Development,2,4,1,2048,3,...,4,80,0,8,2,3,2,2,2,2
Life Sciences,35,No,Travel_Frequently,1199,Research & Development,18,4,1,2049,3,...,4,80,2,10,2,4,10,2,0,2
Life Sciences,35,No,Travel_Rarely,287,Research & Development,1,4,1,2052,3,...,4,80,1,4,5,3,4,3,1,1


In [30]:
# boolean mask: keep only the rows whose Department is 'Sales'.
ibm_hr2.loc[ibm_hr2['Department'] == 'Sales']

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,41,Yes,Travel_Rarely,1102,Sales,1,2,1,1,2,...,1,80,0,8,0,1,6,4,0,5
Life Sciences,53,No,Travel_Rarely,1219,Sales,2,4,1,23,1,...,3,80,0,31,3,3,25,8,3,7
Life Sciences,36,Yes,Travel_Rarely,1218,Sales,9,4,1,27,3,...,2,80,0,10,4,3,5,3,0,3
Marketing,42,No,Travel_Rarely,691,Sales,8,4,1,35,3,...,4,80,1,10,2,3,9,7,4,2
Marketing,46,No,Travel_Rarely,705,Sales,2,4,1,38,2,...,4,80,0,22,2,2,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Life Sciences,45,No,Travel_Rarely,374,Sales,20,3,1,2046,4,...,3,80,0,8,3,3,5,3,0,1
Marketing,50,Yes,Travel_Rarely,410,Sales,28,3,1,2055,4,...,2,80,1,20,3,3,3,2,2,0
Marketing,39,No,Travel_Rarely,722,Sales,24,1,1,2056,2,...,1,80,1,21,2,2,20,9,9,6
Other,26,No,Travel_Rarely,1167,Sales,5,3,1,2060,4,...,4,80,0,5,2,3,4,2,0,0


In [32]:
# we could also subset multiple indexes using .loc
ibm_hr2.loc[['Life Sciences', 'Marketing']]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,41,Yes,Travel_Rarely,1102,Sales,1,2,1,1,2,...,1,80,0,8,0,1,6,4,0,5
Life Sciences,49,No,Travel_Frequently,279,Research & Development,8,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Life Sciences,33,No,Travel_Frequently,1392,Research & Development,3,4,1,5,4,...,3,80,0,8,3,3,8,7,3,0
Life Sciences,32,No,Travel_Frequently,1005,Research & Development,2,2,1,8,4,...,3,80,0,8,2,2,7,7,3,6
Life Sciences,30,No,Travel_Rarely,1358,Research & Development,24,1,1,11,4,...,2,80,1,1,2,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Marketing,34,No,Travel_Rarely,704,Sales,28,3,1,2035,4,...,4,80,2,8,2,3,8,7,1,7
Marketing,36,No,Non-Travel,301,Sales,15,4,1,2036,4,...,1,80,1,15,4,2,15,12,11,11
Marketing,36,No,Travel_Rarely,1120,Sales,11,4,1,2045,2,...,1,80,1,8,2,2,6,3,0,0
Marketing,50,Yes,Travel_Rarely,410,Sales,28,3,1,2055,4,...,2,80,1,20,3,3,3,2,2,0


In [33]:
# above were the cases where all the columns were getting selected.
# notice that .loc actually takes both indexes and columns for subset.
ibm_hr2.loc[['Life Sciences', 'Marketing'], ['Age', 'Attrition']]


Unnamed: 0_level_0,Age,Attrition
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1
Life Sciences,41,Yes
Life Sciences,49,No
Life Sciences,33,No
Life Sciences,32,No
Life Sciences,30,No
...,...,...
Marketing,34,No
Marketing,36,No
Marketing,36,No
Marketing,50,Yes


In [34]:
# select all rows and columns using .loc
ibm_hr2.loc[:, :]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,41,Yes,Travel_Rarely,1102,Sales,1,2,1,1,2,...,1,80,0,8,0,1,6,4,0,5
Life Sciences,49,No,Travel_Frequently,279,Research & Development,8,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Other,37,Yes,Travel_Rarely,1373,Research & Development,2,2,1,4,4,...,2,80,0,7,3,3,0,0,0,0
Life Sciences,33,No,Travel_Frequently,1392,Research & Development,3,4,1,5,4,...,3,80,0,8,3,3,8,7,3,0
Medical,27,No,Travel_Rarely,591,Research & Development,2,1,1,7,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Medical,36,No,Travel_Frequently,884,Research & Development,23,2,1,2061,3,...,3,80,1,17,3,3,5,2,0,3
Medical,39,No,Travel_Rarely,613,Research & Development,6,1,1,2062,4,...,1,80,1,9,5,3,7,7,1,7
Life Sciences,27,No,Travel_Rarely,155,Research & Development,4,3,1,2064,2,...,2,80,1,6,0,3,6,2,0,3
Medical,49,No,Travel_Frequently,1023,Sales,2,3,1,2065,4,...,4,80,0,17,3,2,9,6,0,8


In [52]:
# let us view it in a simple example:
# one tuple per row, with the column names supplied separately.
temp_df = pd.DataFrame(
    [
        ('cat', '3', 'home', 'I101'),
        ('dog', '5', 'home', 'I102'),
        ('sheep', '10', 'farm', 'I103'),
        ('goat', '15', 'farm', 'I104'),
        ('cow', '10', 'farm', 'I105'),
    ],
    columns=['animal', 'count', 'location', 'id'],
)
temp_df

Unnamed: 0,animal,count,location,id
0,cat,3,home,I101
1,dog,5,home,I102
2,sheep,10,farm,I103
3,goat,15,farm,I104
4,cow,10,farm,I105


In [53]:
# all columns all rows
temp_df.loc[:, :]

Unnamed: 0,animal,count,location,id
0,cat,3,home,I101
1,dog,5,home,I102
2,sheep,10,farm,I103
3,goat,15,farm,I104
4,cow,10,farm,I105


In [54]:
# index 2 and 4 rows with all columns
temp_df.loc[[2, 4]]

Unnamed: 0,animal,count,location,id
2,sheep,10,farm,I103
4,cow,10,farm,I105


In [55]:
# index 1 and 3 rows with few columns
temp_df.loc[[1, 3], ['animal', 'id']]

Unnamed: 0,animal,id
1,dog,I102
3,goat,I104


In [56]:
# we could also use slicing as below
temp_df.loc[2:4]

Unnamed: 0,animal,count,location,id
2,sheep,10,farm,I103
3,goat,15,farm,I104
4,cow,10,farm,I105


In [57]:
# could also use iloc
# notice how it does not fetch same number of records.
# .iloc excludes 4 in this case.
temp_df.iloc[2:4]

Unnamed: 0,animal,count,location,id
2,sheep,10,farm,I103
3,goat,15,farm,I104


In [58]:
# important note -- notice the difference between loc and iloc in the scenario below.
# let us sort the dataframe based on animal.
# sort_values returns the sorted frame; re-binding the name replaces inplace=True.
temp_df = temp_df.sort_values(by='animal', ascending=True)
temp_df.head()

Unnamed: 0,animal,count,location,id
0,cat,3,home,I101
4,cow,10,farm,I105
1,dog,5,home,I102
3,goat,15,farm,I104
2,sheep,10,farm,I103


In [59]:
# after sorting, the same label slice no longer returns the same records.
# .loc slices by label: it starts at the row labelled 0 and stops at the row labelled 4
# (inclusive), following the current row order. label 4 now sits right after label 0,
# so the rows labelled 1, 2 and 3, which come after it, are not returned.
temp_df.loc[0:4]

Unnamed: 0,animal,count,location,id
0,cat,3,home,I101
4,cow,10,farm,I105


In [60]:
# label slice again: rows from label 1 through label 2 in the current (sorted) order,
# which also picks up label 3 because that row sits between them.
temp_df.loc[1:2]

Unnamed: 0,animal,count,location,id
1,dog,5,home,I102
3,goat,15,farm,I104
2,sheep,10,farm,I103


In [61]:
# .iloc locates records by their position, irrespective of the index labels.
# notice how the 2nd and 3rd rows (positions 1 and 2) are picked although their labels are 4 and 1.
temp_df.iloc[1:3]

Unnamed: 0,animal,count,location,id
4,cow,10,farm,I105
1,dog,5,home,I102


In [62]:
# set id as index (drop=True removes 'id' from the regular columns).
# set_index returns the re-indexed frame; re-binding the name replaces inplace=True.
temp_df = temp_df.set_index('id', drop=True)
temp_df

Unnamed: 0_level_0,animal,count,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I101,cat,3,home
I105,cow,10,farm
I102,dog,5,home
I104,goat,15,farm
I103,sheep,10,farm


In [64]:
# notice how .iloc still fetches records 2, and 3
temp_df.iloc[1:3]

Unnamed: 0_level_0,animal,count,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I105,cow,10,farm
I102,dog,5,home


In [65]:
# as .loc depends on the index, only labels that are available in the index can be used.
# for ex. the integer slice below fails now that the index holds the id strings.
# The error is caught and printed so that this demonstration does not abort "Restart & Run All".
try:
    temp_df.loc[2:4]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: cannot do slice indexing on Index with these indexers [2] of type int

In [70]:
# slicing with labels that do exist in the index works: both end labels are included,
# together with every row lying between them in the current order (here I105).
temp_df.loc['I101':'I102']

Unnamed: 0_level_0,animal,count,location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
I101,cat,3,home
I105,cow,10,farm
I102,dog,5,home


In [72]:
# we can reset the index so that records are numbered from 0 again.
# drop=True discards the old (random) labels instead of keeping them as a column;
# reset_index returns the new frame, and re-binding the name replaces inplace=True.
ibm_hr = ibm_hr.reset_index(drop=True)
ibm_hr.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


#### Subsetting based on position

In [73]:
# fetch the first 10 records by position (rows 0 through 9)
ibm_hr.iloc[:10]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,...,2,80,1,1,2,3,1,0,0,0
8,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,...,2,80,0,10,2,3,9,7,1,8
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,...,2,80,2,17,3,2,7,7,7,7


In [75]:
ibm_hr.iloc[[1, 5, 7]]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,...,2,80,1,1,2,3,1,0,0,0


In [76]:
# keep in mind that .iloc does not consider index
ibm_hr2.iloc[[1, 5, 7]]

Unnamed: 0_level_0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Life Sciences,49,No,Travel_Frequently,279,Research & Development,8,1,1,2,3,...,4,80,1,10,3,3,10,7,1,7
Life Sciences,32,No,Travel_Frequently,1005,Research & Development,2,2,1,8,4,...,3,80,0,8,2,2,7,7,3,6
Life Sciences,30,No,Travel_Rarely,1358,Research & Development,24,1,1,11,4,...,2,80,1,1,2,3,1,0,0,0


In [77]:
# same goes with selecting columns using .iloc
ibm_hr.iloc[[1,4,5,2],[1,3,5]]

Unnamed: 0,Attrition,DailyRate,DistanceFromHome
1,No,279,8
4,No,591,2
5,No,1005,2
2,Yes,1373,2


In [78]:
ibm_hr2.iloc[[1,4,5,2],[1,3,5]]

Unnamed: 0_level_0,Attrition,DailyRate,DistanceFromHome
EducationField,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Life Sciences,No,279,8
Medical,No,591,2
Life Sciences,No,1005,2
Other,Yes,1373,2


In [32]:
# single value lookup: row label 0, column 'Age'
ibm_hr.loc[0, 'Age']

41.0

In [33]:
# single value lookup: row label 1, column 'Age'
ibm_hr.loc[1, 'Age']

49.0

In [34]:
# single value lookup: row label 3, column 'Age'
ibm_hr.loc[3, 'Age']

nan

In [35]:
# capture null values in dataframe
ibm_hr.isnull().sum()

Age                         1
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

#### Subsetting based on condition (and / or)