# Working With Types
## Casting Types & Missing Values

In [1]:
import pandas as pd
# Load the datasets from the local data/ directory.
houses = pd.read_csv("data/kc_house_data.csv")
titanic = pd.read_csv("data/titanic.csv")
# The Netflix file is pipe-delimited; its first column is used as the row index.
netflix = pd.read_csv("data/netflix_titles.csv", sep="|", index_col=0)
btc = pd.read_csv("data/coin_Bitcoin.csv")
countries = pd.read_csv("data/world-happiness-report-2021.csv")

## Casting With astype()

In [7]:
# Check which dtype pandas inferred for each column after loading.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [11]:
# Turn the '?' placeholder into NaN so the column can later be cast to float.
# The result is assigned back rather than calling replace(..., inplace=True)
# on titanic['age']: that chained in-place call acts on an intermediate
# Series and does not update `titanic` when pandas Copy-on-Write is active
# (and raises a FutureWarning on pandas 2.x).
titanic['age'] = titanic['age'].replace('?', float('nan'))

In [12]:
# Preview the cast to float; the result is not assigned, so `titanic` is unchanged.
titanic['age'].astype("float")

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

In [15]:
# Store the float version of the column back on the DataFrame.
titanic['age'] = titanic['age'].astype("float")

In [16]:
# Re-inspect the frame to confirm the new dtype and non-null count of `age`.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB


In [17]:
# Preview `sex` as a categorical column; nothing is assigned here.
titanic['sex'].astype('category')

0       female
1         male
2       female
3         male
4       female
         ...  
1304    female
1305    female
1306      male
1307      male
1308      male
Name: sex, Length: 1309, dtype: category
Categories (2, object): ['female', 'male']

In [19]:
# Store the categorical version of `sex` back on the DataFrame.
# Bracket assignment is used (matching the other column assignments in this
# notebook): attribute-style assignment only works when the column already
# exists and otherwise silently sets a plain Python attribute.
titanic['sex'] = titanic['sex'].astype('category')

In [20]:
# Re-inspect the frame: `sex` should now be category and memory usage lower.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   object  
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(1), float64(1), int64(4), object(8)
memory usage: 134.5+ KB


In [21]:
# Preview `embarked` as a categorical column; nothing is assigned here.
# NOTE: the '?' placeholder has not been replaced in this column, so it
# shows up as a category of its own.
titanic.embarked.astype('category')

0       S
1       S
2       S
3       S
4       S
       ..
1304    C
1305    C
1306    C
1307    C
1308    S
Name: embarked, Length: 1309, dtype: category
Categories (4, object): ['?', 'C', 'Q', 'S']

In [23]:
# Store the categorical version of `embarked` back on the DataFrame.
titanic['embarked'] = titanic['embarked'].astype('category')

In [24]:
# Re-inspect the frame after converting `embarked` to category.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1309 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   category
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(2), float64(1), int64(4), object(7)
memory usage: 125.7+ KB


In [25]:
# Reload the raw CSV, discarding the conversions made so far, so the
# to_numeric example below starts from the original object-typed columns.
titanic = pd.read_csv('data/titanic.csv')
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [27]:
# Preview a numeric parse of `age`; errors='coerce' turns values that cannot
# be parsed into NaN instead of raising. Nothing is assigned here.
pd.to_numeric(titanic['age'], errors='coerce')

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

In [30]:
# Store the numeric version of `age` back on the DataFrame; unparseable
# values become NaN because of errors='coerce'.
# Bracket assignment is used instead of attribute-style assignment, which
# only works when the column already exists and otherwise silently sets a
# plain Python attribute.
titanic['age'] = pd.to_numeric(titanic['age'], errors='coerce')

In [31]:
# Re-inspect the frame to confirm `age` is now numeric.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1309 non-null   object 
 9   cabin      1309 non-null   object 
 10  embarked   1309 non-null   object 
 11  boat       1309 non-null   object 
 12  body       1309 non-null   object 
 13  home.dest  1309 non-null   object 
dtypes: float64(1), int64(4), object(9)
memory usage: 143.3+ KB


In [32]:
# Load the small game-stats table used for the missing-value examples.
stats = pd.read_csv('data/game_stats.csv')

In [33]:
# The non-null counts show how many values are missing in each column.
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      6 non-null      object 
 1   league    4 non-null      object 
 2   points    5 non-null      float64
 3   assists   2 non-null      float64
 4   rebounds  4 non-null      float64
dtypes: float64(3), object(2)
memory usage: 408.0+ bytes


In [34]:
# Display the whole (small) table, including its missing values.
stats

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,
5,steph,nba,49.0,8.0,10.0
6,,,,,


In [35]:
# Boolean frame of the same shape: True wherever a value is missing.
stats.isna()

Unnamed: 0,name,league,points,assists,rebounds
0,False,False,False,False,False
1,False,True,False,True,False
2,False,False,True,True,True
3,False,False,False,True,False
4,False,True,False,True,True
5,False,False,False,False,False
6,True,True,True,True,True


In [36]:
# Boolean mask marking the rows whose `league` is missing.
stats['league'].isna()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
Name: league, dtype: bool

In [37]:
# Use that mask to select only the rows with no `league` value.
stats[stats['league'].isna()]

Unnamed: 0,name,league,points,assists,rebounds
1,jessie,,10.0,,2.0
4,timothee,,8.0,,
6,,,,,


In [39]:
# Drop the missing entries from a single column; the result is not assigned.
stats['assists'].dropna()

0    5.0
5    8.0
Name: assists, dtype: float64

In [43]:
# With default arguments dropna() removes every row that has at least one
# missing value. The result is kept in a new variable; `stats` is unchanged.
no_na_values = stats.dropna()
no_na_values

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
5,steph,nba,49.0,8.0,10.0


In [50]:
# Remove only the rows in which every column is missing; modifies `stats` in place.
# NOTE(review): this cell's execution count is higher than the next cell's,
# i.e. the cells were run out of order; verify with Restart & Run All.
stats.dropna(how='all', inplace=True)

In [48]:
# Drop rows based on `league` alone; missing values in other columns are kept.
# The result is not assigned.
stats.dropna(subset=['league'])

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
2,stu,euroleague,,,
3,jackson,aba,9.0,,2.0
5,steph,nba,49.0,8.0,10.0


In [59]:
# Replace missing rebound counts with 0 and store the column back.
stats['rebounds'] = stats['rebounds'].fillna(0)

In [60]:
# Display the table after the all-missing row was dropped and rebounds filled.
stats

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,,10.0,,2.0
2,stu,euroleague,,,0.0
3,jackson,aba,9.0,,2.0
4,timothee,,8.0,,0.0
5,steph,nba,49.0,8.0,10.0


In [62]:
# Label players that have no league as 'amateur'.
# The result is assigned back rather than calling fillna(..., inplace=True)
# on stats['league']: that chained in-place call acts on an intermediate
# Series and does not update `stats` when pandas Copy-on-Write is active
# (and raises a FutureWarning on pandas 2.x).
stats['league'] = stats['league'].fillna('amateur')

In [65]:
# Preview filling every remaining missing value with 0; the result is not
# assigned, so `stats` itself is unchanged.
stats.fillna(0)

Unnamed: 0,name,league,points,assists,rebounds
0,bob,nba,22.0,5.0,10.0
1,jessie,amateur,10.0,0.0,2.0
2,stu,euroleague,0.0,0.0,0.0
3,jackson,aba,9.0,0.0,2.0
4,timothee,amateur,8.0,0.0,0.0
5,steph,nba,49.0,8.0,10.0


In [66]:
# Load the sales table used for the fill-from-another-column example.
sales = pd.read_csv('data/sales.csv')

In [67]:
# Display the whole (small) table, including its missing values.
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,,92660.0
5,,,
6,,60007.0,60007.0


In [69]:
# Where the shipping zip is missing, fall back to the billing zip of the same
# row (fillna with a Series aligns on the index).
# The result is assigned back rather than calling fillna(..., inplace=True)
# on sales['shipping_zip']: that chained in-place call acts on an
# intermediate Series and does not update `sales` when pandas Copy-on-Write
# is active (and raises a FutureWarning on pandas 2.x).
sales['shipping_zip'] = sales['shipping_zip'].fillna(sales['billing_zip'])

In [73]:
# Remove only the rows in which every column is missing; modifies `sales` in place.
sales.dropna(how='all', inplace=True)

In [74]:
# Display the table after filling shipping zips and dropping the empty row.
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,81220.0,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,92660.0,92660.0
6,,60007.0,60007.0
