# DataFrame and Series data types

In [1]:
# In plain-Python terms a DataFrame can be pictured as a dict: each key is a
# column name and each value is a list holding that column's entries, one per row.
people = dict(
    first=['Shreya', 'Alex', 'Bob'],
    last=['Walia', 'Doe', 'Ex'],
    email=['sw@email.com', 'ad@email.com', 'be@email.com'],
)

In [2]:
# Look up the "email" key of the plain dict.
# The value stored under it is an ordinary Python list.
people["email"]

['sw@email.com', 'ad@email.com', 'be@email.com']

In [3]:
# Confirm that the value is of the built-in list type.
type(people["email"])

list

In [4]:
import pandas as pd

In [5]:
# Convert the dictionary above into a DataFrame.
# A DataFrame resembles a native Python dict of lists but offers far more functionality.
df = pd.DataFrame(people)

In [6]:
# Display the DataFrame: one column per dict key, plus an integer index for the rows.
df

Unnamed: 0,first,last,email
0,Shreya,Walia,sw@email.com
1,Alex,Doe,ad@email.com
2,Bob,Ex,be@email.com


In [7]:
# Access the email column. The syntax mirrors a dict key lookup,
# but the result is a pandas Series rather than a list.
df['email']

0    sw@email.com
1    ad@email.com
2    be@email.com
Name: email, dtype: object

In [8]:
# Attribute (dot) access is another way to get a column, but prefer brackets:
# if a column name matches an existing DataFrame attribute or method, the dot
# form resolves to that attribute instead of the column and can cause errors.
df.email

0    sw@email.com
1    ad@email.com
2    be@email.com
Name: email, dtype: object

In [9]:
# The column's data type is a Series.
# A Series is one-dimensional and list-like, but has much more functionality than a list.
type(df['email'])

pandas.core.series.Series

In [10]:
# So a DataFrame is a container for multiple Series objects (one per column).

In [11]:
# Access multiple columns by passing a list of column names inside the indexing brackets.
# The result is a DataFrame restricted to those columns, not a Series.
# Omitting the inner brackets, i.e. df['last', 'email'], raises a KeyError because
# pandas treats the two strings together as a single column label.
df[['last', 'email']]

Unnamed: 0,last,email
0,Walia,sw@email.com
1,Doe,ad@email.com
2,Ex,be@email.com


In [12]:
# List all column labels.
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [13]:
# Rows are selected with the .loc and .iloc indexers.
# .iloc locates by integer position (hence the "i").
# Indexers are used with square brackets, not parentheses; this gets the first row.
# A single row comes back as a Series whose index is the column names.
df.iloc[0]

first          Shreya
last            Walia
email    sw@email.com
Name: 0, dtype: object

In [14]:
# A single row selected with .iloc is a Series.
type(df.iloc[0])

pandas.core.series.Series

In [15]:
# Select multiple rows by passing a list of integer positions.
# The result is a DataFrame containing only those rows.
df.iloc[[0,1]]

Unnamed: 0,first,last,email
0,Shreya,Walia,sw@email.com
1,Alex,Doe,ad@email.com


In [16]:
# Selecting a list of rows yields a DataFrame, not a Series.
type(df.iloc[[0,1]])

pandas.core.frame.DataFrame

In [17]:
# .loc and .iloc can select rows and columns at the same time:
# the row selector is the first argument and the column selector the second.
# With .iloc the column is also given by integer position; 2 is the third column (email).
df.iloc[[0,1], 2]

0    sw@email.com
1    ad@email.com
Name: email, dtype: object

In [18]:
# .loc finds rows by label rather than by position.
# Here the row labels are the default integers, so the .loc call
# looks the same as its .iloc counterpart.
df.loc[0]

first          Shreya
last            Walia
email    sw@email.com
Name: 0, dtype: object

In [19]:
# A list of row labels returns those rows as a DataFrame.
df.loc[[0,1]]

Unnamed: 0,first,last,email
0,Shreya,Walia,sw@email.com
1,Alex,Doe,ad@email.com


In [20]:
# Columns have names (labels), so with .loc they are addressed by label
# instead of by integer position.
df.loc[[0,1], 'email']

0    sw@email.com
1    ad@email.com
Name: email, dtype: object

In [21]:
# A list of column labels returns a DataFrame with the columns in the order requested.
df.loc[[0,1], ['email', 'last']]

Unnamed: 0,email,last
0,sw@email.com,Walia
1,ad@email.com,Doe


# Setting custom indexes

In [22]:
# By default the rows are identified by an integer index (the unnamed leftmost column).
# An index can instead hold meaningful labels for the rows; labels are usually unique.
# In this example the email column is used as the row-label index.
# A good index gives each row a unique identifier that doubles as its label.
# set_index returns a new DataFrame; df itself is not modified by this call.
df.set_index('email')

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
sw@email.com,Shreya,Walia
ad@email.com,Alex,Doe
be@email.com,Bob,Ex


In [23]:
# df is unchanged: the previous set_index call returned a new DataFrame
# instead of modifying df in place.
df

Unnamed: 0,first,last,email
0,Shreya,Walia,sw@email.com
1,Alex,Doe,ad@email.com
2,Bob,Ex,be@email.com


In [24]:
# Make the email index permanent by rebinding df to the frame that set_index returns.
# df.set_index('email', inplace=True) has the same effect here, but reassignment
# is the generally preferred style because it keeps method chaining possible.
df = df.set_index('email')

In [25]:
# The email index now persists on df.
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
sw@email.com,Shreya,Walia
ad@email.com,Alex,Doe
be@email.com,Bob,Ex


In [26]:
# Inspect the index object and the labels it holds.
df.index

Index(['sw@email.com', 'ad@email.com', 'be@email.com'], dtype='object', name='email')

In [27]:
# With emails as the index, a row can be looked up by its email label.
# .loc returns the row stored under that label.
df.loc['sw@email.com']

first    Shreya
last      Walia
Name: sw@email.com, dtype: object

In [28]:
# A single row label plus a single column label returns just that one value.
df.loc['sw@email.com', 'last']

'Walia'

In [29]:
#df.loc[0]
#would raise a KeyError: the row labels are now email strings, so the label 0 no longer exists

In [30]:
df.iloc[0]
# still works: .iloc selects by integer position, regardless of what the index labels are

first    Shreya
last      Walia
Name: sw@email.com, dtype: object

In [31]:
# Reset the index: email becomes a regular column again and the default
# integer index is restored. Like set_index, this returns a new DataFrame.
df.reset_index()

Unnamed: 0,email,first,last
0,sw@email.com,Shreya,Walia
1,ad@email.com,Alex,Doe
2,be@email.com,Bob,Ex


In [32]:
# df still has the email index: the reset_index call above was not applied in place.
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
sw@email.com,Shreya,Walia
ad@email.com,Alex,Doe
be@email.com,Bob,Ex


In [33]:
# Persist the reset by rebinding df to the frame that reset_index returns.
# df.reset_index(inplace=True) has the same effect here, but reassignment
# is the generally preferred style because it keeps method chaining possible.
df = df.reset_index()

In [34]:
# df is back on the default integer index, with email as a regular column.
df

Unnamed: 0,email,first,last
0,sw@email.com,Shreya,Walia
1,ad@email.com,Alex,Doe
2,be@email.com,Bob,Ex
