# Introduction to Pandas

In [1]:
# Sample records: one list per column, all lists the same length.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 40, 45],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
)
    
            

In [2]:
data

{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
 'Age': [25, 30, 35, 40, 45],
 'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']}

In [3]:
type(data)

dict

In [4]:
import pandas as pd

In [5]:
df = pd.DataFrame(data)

In [6]:
df.head()

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston
4,Eva,45,Phoenix


In [7]:
type(df)

pandas.core.frame.DataFrame

In [8]:
df.shape

(5, 3)

In [9]:
df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [10]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [11]:
df.dtypes

Name    object
Age      int64
City    object
dtype: object

## Data Series

In [12]:
df.Age

0    25
1    30
2    35
3    40
4    45
Name: Age, dtype: int64

In [13]:
type(df.Age)

pandas.core.series.Series

In [14]:
type(df)

pandas.core.frame.DataFrame

## Data Cleaning

In [15]:
import numpy as np

In [16]:
# Records with deliberate gaps: None / np.nan mark missing entries and the
# 'Unknown' string in Age is a non-numeric value to be cleaned later.
data = dict(
    Name=['Alice', 'Bob', None, 'David', 'Eva'],
    Age=[25, None, 35, 40, 'Unknown'],
    Salary=[50000, 60000, 55000, None, 45000],
    City=['New York', 'Los Angeles', np.nan, 'Houston', 'Phoenix'],
)

In [17]:
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25,50000.0,New York
1,Bob,,60000.0,Los Angeles
2,,35,55000.0,
3,David,40,,Houston
4,Eva,Unknown,45000.0,Phoenix


In [18]:
# Replace missing text fields with a placeholder.
# Assign the result back to the column rather than calling
# fillna(inplace=True) on `df[col]`: the chained in-place form operates on an
# intermediate Series, emits a FutureWarning on pandas 2.x, and no longer
# updates `df` once Copy-on-Write is enabled.
df['Name'] = df['Name'].fillna('Unknown')
df['City'] = df['City'].fillna('Unknown')
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25,50000.0,New York
1,Bob,,60000.0,Los Angeles
2,Unknown,35,55000.0,Unknown
3,David,40,,Houston
4,Eva,Unknown,45000.0,Phoenix


In [19]:
# Convert Age to a numeric column; errors='coerce' turns entries that cannot
# be parsed (such as the 'Unknown' string) into NaN instead of raising.
df = df.assign(Age=pd.to_numeric(df['Age'], errors='coerce'))

In [20]:
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,,60000.0,Los Angeles
2,Unknown,35.0,55000.0,Unknown
3,David,40.0,,Houston
4,Eva,,45000.0,Phoenix


In [21]:
# Impute missing numeric values with the column mean (mean() skips NaN).
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on `df[col]` is a chained in-place operation that warns
# on pandas 2.x and leaves `df` unchanged when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,33.333333,60000.0,Los Angeles
2,Unknown,35.0,55000.0,Unknown
3,David,40.0,52500.0,Houston
4,Eva,33.333333,45000.0,Phoenix


## Handling dtypes

In [23]:
# City -> FIPS code lookup. Codes are kept as strings so that leading zeros
# (e.g. '06037') are preserved.
fips_codes = dict([
    ('New York', '36061'),
    ('Los Angeles', '06037'),
    ('Unknown', '00000'),
    ('Houston', '48201'),
    ('Phoenix', '04013'),
])
fips_codes

{'New York': '36061',
 'Los Angeles': '06037',
 'Unknown': '00000',
 'Houston': '48201',
 'Phoenix': '04013'}

In [24]:
df

Unnamed: 0,Name,Age,Salary,City
0,Alice,25.0,50000.0,New York
1,Bob,33.333333,60000.0,Los Angeles
2,Unknown,35.0,55000.0,Unknown
3,David,40.0,52500.0,Houston
4,Eva,33.333333,45000.0,Phoenix


In [25]:
df['FIPS'] = df['City'].map(fips_codes)
df

Unnamed: 0,Name,Age,Salary,City,FIPS
0,Alice,25.0,50000.0,New York,36061
1,Bob,33.333333,60000.0,Los Angeles,6037
2,Unknown,35.0,55000.0,Unknown,0
3,David,40.0,52500.0,Houston,48201
4,Eva,33.333333,45000.0,Phoenix,4013


In [26]:
# Cast the string codes to integers; note that this drops any leading zeros.
fips_int = df['FIPS'].astype(int)
fips_int
    

0    36061
1     6037
2        0
3    48201
4     4013
Name: FIPS, dtype: int64

In [27]:
df.FIPS

0    36061
1    06037
2    00000
3    48201
4    04013
Name: FIPS, dtype: object

In [28]:
df['fips_int'] = fips_int

In [29]:
df.head()

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int
0,Alice,25.0,50000.0,New York,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0
3,David,40.0,52500.0,Houston,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013


In [30]:
# Rebuild the fixed-width code from the integer column: convert to text, then
# left-pad with zeros to a width of 5 characters.
df['new_fips'] = (
    df['fips_int']
    .astype(str)
    .str.zfill(5)
)
df.head()

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


## Indexing

In [31]:
df

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [32]:
df.loc[3]

Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object

In [33]:
df.iloc[3]

Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object

In [34]:
df

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [36]:
df.sort_values(by='Name')

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013
2,Unknown,35.0,55000.0,Unknown,0,0,0


In [37]:
dfsorted = df.sort_values(by='Name')
dfsorted

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013
2,Unknown,35.0,55000.0,Unknown,0,0,0


In [38]:
dfsorted.loc[3]

Name          David
Age            40.0
Salary      52500.0
City        Houston
FIPS          48201
fips_int      48201
new_fips      48201
Name: 3, dtype: object

In [39]:
dfsorted.iloc[3]

Name              Eva
Age         33.333333
Salary        45000.0
City          Phoenix
FIPS            04013
fips_int         4013
new_fips        04013
Name: 4, dtype: object

In [40]:
dfsorted

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013
2,Unknown,35.0,55000.0,Unknown,0,0,0


In [41]:
# Renumber the rows 0..n-1 in their sorted order; drop=True discards the old
# index instead of keeping it as a column. The returned frame is rebound to
# the same name rather than mutating with inplace=True, so the object an
# earlier cell displayed is not changed underneath the reader.
dfsorted = dfsorted.reset_index(drop=True)
dfsorted

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,David,40.0,52500.0,Houston,48201,48201,48201
3,Eva,33.333333,45000.0,Phoenix,4013,4013,4013
4,Unknown,35.0,55000.0,Unknown,0,0,0


In [42]:
df.iloc[[0,1], [2, 3]]

Unnamed: 0,Salary,City
0,50000.0,New York
1,60000.0,Los Angeles


In [44]:
df.iloc[[0,1], [0, 4]]

Unnamed: 0,Name,FIPS
0,Alice,36061
1,Bob,6037


In [45]:
df.iloc[[0,2]]

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
2,Unknown,35.0,55000.0,Unknown,0,0,0


In [47]:
# Rows at positions 0 and 2, restricted to the City and FIPS columns, in a
# single positional lookup (column labels are translated to positions first).
df.iloc[[0, 2], df.columns.get_indexer(['City', 'FIPS'])]

Unnamed: 0,City,FIPS
0,New York,36061
2,Unknown,0


## Filtering

In [48]:
df

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [49]:
df[df.Salary > 50000]

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201


In [51]:
df.Salary > 50000

0    False
1     True
2     True
3     True
4    False
Name: Salary, dtype: bool

In [52]:
df[df.Age >= 35]

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201


In [53]:
df[ (df['Age'] >= 35) & (df['Salary'] > 53000) ]

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
2,Unknown,35.0,55000.0,Unknown,0,0,0


In [54]:
df[ (df['Age'] >= 35) | (df['Salary'] > 53000) ]

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201


## Dropping

In [55]:
df

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
2,Unknown,35.0,55000.0,Unknown,0,0,0
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [63]:
df[df.Name != 'Unknown']

Unnamed: 0,Name,Age,Salary,City,FIPS,fips_int,new_fips
0,Alice,25.0,50000.0,New York,36061,36061,36061
1,Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
3,David,40.0,52500.0,Houston,48201,48201,48201
4,Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [64]:
# Use the Name column as the row labels.
# The guard makes the cell safe to re-run: once Name has become the index it
# is no longer a column, and a second set_index('Name') would raise KeyError.
# The result is rebound instead of using inplace=True.
if 'Name' in df.columns:
    df = df.set_index('Name')
df

Unnamed: 0_level_0,Age,Salary,City,FIPS,fips_int,new_fips
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alice,25.0,50000.0,New York,36061,36061,36061
Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
Unknown,35.0,55000.0,Unknown,0,0,0
David,40.0,52500.0,Houston,48201,48201,48201
Eva,33.333333,45000.0,Phoenix,4013,4013,4013


In [65]:
df.drop(index='Unknown')

Unnamed: 0_level_0,Age,Salary,City,FIPS,fips_int,new_fips
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alice,25.0,50000.0,New York,36061,36061,36061
Bob,33.333333,60000.0,Los Angeles,6037,6037,6037
David,40.0,52500.0,Houston,48201,48201,48201
Eva,33.333333,45000.0,Phoenix,4013,4013,4013
