In [1]:
import numpy as np
import pandas as pd

Creating DataFrames

In [2]:
# Column-oriented construction: every keyword names a column and supplies
# that column's values, one entry per row.
data = dict(
    Name=['Alice', 'Kumar', 'Mei'],
    Age=[25, 40, 35],
    City=['New York', 'Mumbai', 'Shanghai'],
)
df = pd.DataFrame(data)
print(df)

    Name  Age      City
0  Alice   25  New York
1  Kumar   40    Mumbai
2    Mei   35  Shanghai


In [3]:
# Row-oriented construction: one dict per row, keyed by column name.
data = [
    dict(Name='Alice', Age=25, City='New York'),
    dict(Name='Kumar', Age=40, City='Mumbai'),
    dict(Name='Mei', Age=35, City='Shanghai'),
]
df = pd.DataFrame(data)
print(df)

    Name  Age      City
0  Alice   25  New York
1  Kumar   40    Mumbai
2    Mei   35  Shanghai


In [4]:
# Build a frame from a 2-D NumPy array: the integers 1..9 laid out as a
# 3x3 grid, with explicit column labels. NumPy is imported in the
# notebook's first (imports) cell rather than here.
arr = np.arange(1, 10).reshape(3, 3)
df = pd.DataFrame(arr, columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


Setting the Index

In [5]:
# Same columns as before, but the rows carry the labels 'a', 'b', 'c'
# instead of the default 0..n-1 positions.
data = dict(
    Name=['Alice', 'Kumar', 'Mei'],
    Age=[25, 40, 35],
    City=['New York', 'Mumbai', 'Shanghai'],
)
df = pd.DataFrame(data=data, index=list('abc'))
print(df)

    Name  Age      City
a  Alice   25  New York
b  Kumar   40    Mumbai
c    Mei   35  Shanghai


In [6]:
# Promote the 'Name' column to be the row index; the result is a new frame
# bound to df2, with 'Name' no longer among the regular columns.
df2 = df.set_index(keys='Name')
print(df2)

       Age      City
Name                
Alice   25  New York
Kumar   40    Mumbai
Mei     35  Shanghai


In [7]:
# Undo set_index: the 'Name' index moves back into a regular column
# (drop=False keeps it) and the rows get the default 0..n-1 index again.
df2_reset = df2.reset_index(drop=False)
print(df2_reset)

    Name  Age      City
0  Alice   25  New York
1  Kumar   40    Mumbai
2    Mei   35  Shanghai


Attributes

In [8]:
# Rebuild the labelled example frame used by the attribute demonstrations.
df = pd.DataFrame(
    data={
        'Name': ['Alice', 'Kumar', 'Mei'],
        'Age': [25, 40, 35],
        'City': ['New York', 'Mumbai', 'Shanghai'],
    },
    index=list('abc'),
)

# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(3, 3)

In [9]:
# Memory footprint of the row index and of each column, in bytes.
# deep=True also measures the contents of object (string) columns.
df.memory_usage(deep=True)

Index    174
Name     184
Age       24
City     193
dtype: int64

In [10]:
# Row labels of the frame.
df.index

Index(['a', 'b', 'c'], dtype='object')

In [11]:
# Column labels of the frame.
df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [12]:
# Data type of each column; the string columns are reported as object here.
df.dtypes

Name    object
Age      int64
City    object
dtype: object

In [13]:
# The frame's data as a NumPy array. Because the columns mix strings and
# integers, the array comes back with dtype=object.
df.values

array([['Alice', 25, 'New York'],
       ['Kumar', 40, 'Mumbai'],
       ['Mei', 35, 'Shanghai']], dtype=object)

Data Types

In [14]:
# Convert the 'City' column to the categorical dtype, then list the
# column dtypes to confirm that only 'City' changed.
df['City'] = df['City'].astype('category')
df.dtypes

Name      object
Age        int64
City    category
dtype: object

Indexing and Slicing

Label-based Access

In [15]:
# Start the indexing section from a fresh copy of the example frame.
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Kumar', 'Mei'],
        Age=[25, 40, 35],
        City=['New York', 'Mumbai', 'Shanghai'],
    ),
    index=['a', 'b', 'c'],
)

# Single cell, addressed by row label and column label.
df.loc['a', 'Name']

'Alice'

In [16]:
df.loc['a']  # Entire row with label 'a', returned as a Series keyed by column name

Name       Alice
Age           25
City    New York
Name: a, dtype: object

In [17]:
df.loc[:, 'City']  # Entire 'City' column: ':' selects every row label

a    New York
b      Mumbai
c    Shanghai
Name: City, dtype: object

In [18]:
# Bracket shorthand for selecting one column by its label.
df['City']

a    New York
b      Mumbai
c    Shanghai
Name: City, dtype: object

In [19]:
# Label slices include both endpoints: rows 'a' through 'b' and
# columns 'Name' through 'Age' all appear in the result.
print(df.loc['a':'b', 'Name':'Age'])

    Name  Age
a  Alice   25
b  Kumar   40


Position-based Access

In [20]:
df.iloc[0, 0]  # Element at row 0, column 0 (integer positions, not labels)

'Alice'

In [21]:
df.iloc[1]  # Entire second row (position 1), returned as a Series

Name     Kumar
Age         40
City    Mumbai
Name: b, dtype: object

In [22]:
df.iloc[:, 1]  # Entire second column (Age): every row, column position 1

a    25
b    40
c    35
Name: Age, dtype: int64

In [23]:
print(df.iloc[1:, 1:])  # Sub-DataFrame of bottom-right corner: rows and columns from position 1 onward

   Age      City
b   40    Mumbai
c   35  Shanghai


Sequential Access

In [24]:
df['Age'].iloc[0]  # Select the 'Age' column by label, then take its first element by position

25

Boolean Indexing

In [25]:
# A boolean Series acts as a row filter: keep only rows whose Age exceeds 25.
print(df.loc[df['Age'] > 25])

    Name  Age      City
b  Kumar   40    Mumbai
c    Mei   35  Shanghai


In [26]:
# Combine conditions element-wise with &; each comparison needs its own
# parentheses because & binds more tightly than > and ==.
print(
    df.loc[
        (df['Age'] > 30)
        & (df['City'] == 'Mumbai')
    ]
)

    Name  Age    City
b  Kumar   40  Mumbai


In [27]:
# Keep the rows whose City is one of the listed values.
print(df[df['City'].isin(['New York', 'Shanghai'])])

    Name  Age      City
a  Alice   25  New York
c    Mei   35  Shanghai


Fancy Indexing

In [28]:
# Lists of labels pick out specific rows ('a', 'c') and specific
# columns ('Name', 'City') in a single .loc call.
print(df.loc[['a', 'c'], ['Name', 'City']])

    Name      City
a  Alice  New York
c    Mei  Shanghai


Changing Values

In [29]:
# Overwrite a single cell, addressed by row label and column label.
df.loc['a', 'City'] = 'Boston'
print(df)

    Name  Age      City
a  Alice   25    Boston
b  Kumar   40    Mumbai
c    Mei   35  Shanghai


In [30]:
# Replace the whole 'Age' column with every value increased by one.
# NOTE: this reads and writes the same column, so running the cell a
# second time increments the ages again.
df['Age'] = df['Age'] + 1
print(df)

    Name  Age      City
a  Alice   26    Boston
b  Kumar   41    Mumbai
c    Mei   36  Shanghai


In [31]:
# Assigning a list to a column name that does not exist yet adds the
# column; the list supplies one value per row.
df['Country'] = ['USA', 'India', 'China']
print(df)

    Name  Age      City Country
a  Alice   26    Boston     USA
b  Kumar   41    Mumbai   India
c    Mei   36  Shanghai   China


Vectorized Operations

In [32]:
# Arithmetic on a column is applied to every element at once; no explicit
# loop is needed.
df['DoubleAge'] = df['Age'] * 2
print(df)

    Name  Age      City Country  DoubleAge
a  Alice   26    Boston     USA         52
b  Kumar   41    Mumbai   India         82
c    Mei   36  Shanghai   China         72


In [33]:
# NumPy functions accept a column directly and return a Series of
# element-wise results (here the natural log of each age). NumPy is
# already imported earlier in the notebook, so it is not re-imported here.
df['LogAge'] = np.log(df['Age'])
print(df)

    Name  Age      City Country  DoubleAge    LogAge
a  Alice   26    Boston     USA         52  3.258097
b  Kumar   41    Mumbai   India         82  3.713572
c    Mei   36  Shanghai   China         72  3.583519


String Methods

In [34]:
# Rebuild the example frame so the derived columns below start from the
# original Name/Age/City data.
df = pd.DataFrame({
    'Name': ['Alice', 'Kumar', 'Mei'],
    'Age': [25, 40, 35],
    'City': ['New York', 'Mumbai', 'Shanghai']
}, index=['a', 'b', 'c'])

# The .str accessor applies a string operation to every element of a column.

# Convert to uppercase
df['NameUpper'] = df['Name'].str.upper()

# Extract first character
df['FirstChar'] = df['Name'].str[0]

# Replace characters. regex=False states explicitly that 'a' is a literal
# substring, so the result does not depend on the pandas version's default
# for the `regex` parameter.
df['CityRenamed'] = df['City'].str.replace('a', '@', regex=False)
print(df)

    Name  Age      City NameUpper FirstChar CityRenamed
a  Alice   25  New York     ALICE         A    New York
b  Kumar   40    Mumbai     KUMAR         K      Mumb@i
c    Mei   35  Shanghai       MEI         M    Sh@ngh@i


Deleting Rows and Columns

In [35]:
# Remove the row labelled 'b'; drop returns a new frame, which is bound
# back to df.
df = df.drop(index='b')
print(df)

    Name  Age      City NameUpper FirstChar CityRenamed
a  Alice   25  New York     ALICE         A    New York
c    Mei   35  Shanghai       MEI         M    Sh@ngh@i


In [36]:
# Remove the three derived string columns, leaving Name, Age and City.
df = df.drop(columns=['NameUpper', 'FirstChar', 'CityRenamed'])
print(df)

    Name  Age      City
a  Alice   25  New York
c    Mei   35  Shanghai


Renaming Rows and Columns

In [37]:
# Rename a column through an {old_name: new_name} mapping; the renamed
# frame is bound back to df.
df = df.rename(columns={'City': 'Location'})
print(df)

    Name  Age  Location
a  Alice   25  New York
c    Mei   35  Shanghai


In [38]:
# The same mapping style renames row labels when passed as index=.
df = df.rename(index={'a': 'row1', 'c': 'row2'})
print(df)

       Name  Age  Location
row1  Alice   25  New York
row2    Mei   35  Shanghai


In [39]:
# Replace every column label at once by assigning a list with one name
# per existing column, in column order.
df.columns = ['FullName', 'Years', 'CityName']
print(df)

     FullName  Years  CityName
row1    Alice     25  New York
row2      Mei     35  Shanghai


Sorting

In [40]:
# Fresh copy of the example frame for the sorting demonstrations.
df = pd.DataFrame(
    dict(
        Name=['Alice', 'Kumar', 'Mei'],
        Age=[25, 40, 35],
        City=['New York', 'Mumbai', 'Shanghai'],
    ),
    index=list('abc'),
)

# Rows ordered by ascending Age; only the printed result is sorted,
# df itself is not reassigned here.
print(df.sort_values('Age'))

    Name  Age      City
a  Alice   25  New York
c    Mei   35  Shanghai
b  Kumar   40    Mumbai


In [41]:
# ascending=False reverses the order: largest Age first.
print(df.sort_values(by='Age', ascending=False))

    Name  Age      City
b  Kumar   40    Mumbai
c    Mei   35  Shanghai
a  Alice   25  New York


In [42]:
# Sort by City first; Age only decides the order among rows that share a
# City (there are none in this data).
print(df.sort_values(by=['City', 'Age']))

    Name  Age      City
b  Kumar   40    Mumbai
a  Alice   25  New York
c    Mei   35  Shanghai


In [43]:
# The 2 rows with the largest Age, largest first.
print(df.nlargest(2, 'Age'))

    Name  Age      City
b  Kumar   40    Mumbai
c    Mei   35  Shanghai


In [44]:
# Order the rows by their index labels.
print(df.sort_index())

    Name  Age      City
a  Alice   25  New York
b  Kumar   40    Mumbai
c    Mei   35  Shanghai


In [45]:
# axis=1 sorts the column labels instead of the row labels.
print(df.sort_index(axis=1))

   Age      City   Name
a   25  New York  Alice
b   40    Mumbai  Kumar
c   35  Shanghai    Mei


In [46]:
# Persist the ordering by rebinding df to the sorted result rather than
# passing inplace=True: the printed frame is the same, and the call stays
# chainable like the other sort examples.
df = df.sort_values(by='Age')
print(df)

    Name  Age      City
a  Alice   25  New York
c    Mei   35  Shanghai
b  Kumar   40    Mumbai


Mapping and Applying Functions

In [47]:
def _square(value):
    """Return ``value`` raised to the second power."""
    return value ** 2


# Series.map calls the function once per element and collects the results.
s = pd.Series([1, 2, 3])
s.map(_square)

0    1
1    4
2    9
dtype: int64

In [48]:
# Mapping with a dict is a lookup: each element is replaced by the value
# stored under it, and elements without an entry ('rabbit') become NaN.
s = pd.Series(['cat', 'dog', 'rabbit'])
s.map(dict(cat='meow', dog='woof'))

0    meow
1    woof
2     NaN
dtype: object

In [49]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

print(df.map(lambda x: x ** 2))

   A   B
0  1  16
1  4  25
2  9  36


In [50]:
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))

# axis=0 (the default) hands each column to the function in turn, giving
# one mean per column.
df.apply(np.mean, axis=0)

A    2.0
B    5.0
dtype: float64

In [51]:
# axis=1 hands each row to the function; here it adds the row's 'A' and
# 'B' values, giving one result per row.
df.apply(lambda row: row['A'] + row['B'], axis=1)

0    5
1    7
2    9
dtype: int64