## HANDLING MISSING VALUES AND CASTING DATATYPES

In [1]:
import pandas as pd
import numpy as np

In [21]:
# Toy records for the missing-value demos. Missing entries are deliberately
# encoded three ways: np.nan, None, and the placeholder strings 'NA'/'Missing'.
# Ages are kept as strings on purpose so the dtype-casting cells have work to do.
people = {
    "first": ["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    "last": ["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    "email": [
        "CoreyMSchafer@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        None,
        np.nan,
        "Anonymous@email.com",
        "Missing",
    ],
    "age": ["33", "55", "63", "36", None, None, "Missing"],
}

In [22]:
# Build the demo frame; each key of `people` becomes a column.
df = pd.DataFrame(data=people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,Missing,Missing


In [23]:
# Defaults written out: drop each row that contains at least one missing value.
df.dropna(axis="index", how="any")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,Missing,Missing


In [24]:
# Drop a row only when every one of its values is missing.
df.dropna(axis="index", how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,Missing,Missing


In [26]:
# Drop every column that contains at least one missing value. Each column of
# this frame has one, so only the index survives.
df.dropna(axis="columns", how="any")

0
1
2
3
4
5
6


In [27]:
# Only 'email' is inspected: rows with a missing email are dropped
# (with a single-column subset, how='all' and how='any' agree).
df.dropna(axis="index", how="all", subset=["email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,Missing,Missing


In [28]:
# Drop a row only when both 'last' and 'email' are missing.
df.dropna(axis="index", how="all", subset=["last", "email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,Missing,Missing


In [29]:
# Rebuild the frame from `people` and turn the custom placeholder strings into
# real missing values so that isna()/fillna()/dropna() recognise them.
# One chained replace() with a list of placeholders does the work of two
# separate in-place passes and leaves no half-cleaned intermediate frame.
df = pd.DataFrame(people).replace(["NA", "Missing"], np.nan)

In [30]:
# Display the rebuilt frame: the 'NA' and 'Missing' placeholders now show as missing.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [31]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [32]:
# Return a copy in which every missing value is shown as the text 'MISSING'
# (df itself is not modified).
df.fillna(value="MISSING")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [33]:
# Same idea with a numeric filler: every missing value becomes 0 in the returned copy.
df.fillna(value=0)

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [34]:
# Inspect column dtypes; 'age' was entered as strings, so it is still `object`.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [35]:
# 'age' still holds strings (dtype object), so pandas cannot average it.
# The error is the point of this cell: catch it and print it so that
# "Restart & Run All" keeps going past the demonstration.
try:
    print(df['age'].mean())
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: can only concatenate str (not "int") to str

In [36]:
# np.nan is itself a float, which is why a column containing it is cast to float rather than int.
type(np.nan)

float

In [37]:
# Casting to int fails while the column still contains missing values, because
# they have no integer representation. The failure is the demonstration, so
# catch it and print it instead of letting it abort "Restart & Run All".
# Depending on the pandas version the error is a TypeError or a ValueError.
try:
    df['age'] = df['age'].astype(int)
except (TypeError, ValueError) as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [38]:
# Float can represent NaN, so this cast succeeds where the int cast could not.
df["age"] = df["age"].astype("float64")

In [39]:
# Confirm the cast: 'age' should now be float64 while the other columns stay object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [40]:
# With a numeric dtype the mean works; missing ages are skipped, so this
# averages only the four known values (33, 55, 63, 36).
df['age'].mean()

46.75

In [43]:
# Extra strings that read_csv should treat as missing while parsing the survey file.
na_vals = ["NA", "Missing"]

# Survey answers, indexed by the 'Respondent' column.
df = pd.read_csv(
    "survey_results_public.csv",
    index_col="Respondent",
    na_values=na_vals,
)

# Companion schema file, indexed by its 'Column' column.
schema_df = pd.read_csv("survey_results_schema.csv", index_col="Column")

In [44]:
# Let pandas render up to 85 rows and 85 columns before truncating the display.
pd.set_option("display.max_rows", 85)
pd.set_option("display.max_columns", 85)

In [45]:
# Peek at the first ten answers; the dtype is object, i.e. the values are strings.
df["YearsCode"].head(n=10)

Respondent
1       4
2     NaN
3       3
4       3
5      16
6      13
7       6
8       8
9      12
10     12
Name: YearsCode, dtype: object

In [46]:
# 'YearsCode' is still an object column of strings, so the mean cannot be
# computed yet. The error is the demonstration: catch it and print it so that
# "Restart & Run All" continues to the cleaning cells.
try:
    print(df['YearsCode'].mean())
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: can only concatenate str (not "int") to str

In [47]:
# List the distinct answers to find what blocks a numeric cast: besides NaN,
# the text buckets 'Less than 1 year' and 'More than 50 years'.
df['YearsCode'].unique()

array(['4', nan, '3', '16', '13', '6', '8', '12', '2', '5', '17', '10',
       '14', '35', '7', 'Less than 1 year', '30', '9', '26', '40', '19',
       '15', '20', '28', '25', '1', '22', '11', '33', '50', '41', '18',
       '34', '24', '23', '42', '27', '21', '36', '32', '39', '38', '31',
       '37', 'More than 50 years', '29', '44', '45', '48', '46', '43',
       '47', '49'], dtype=object)

In [49]:
# Map BOTH text buckets onto numbers so the column can be cast to float.
# NOTE(review): the execution counter jumps from 47 to 49, so the cell that
# handled 'Less than 1 year' appears to have been deleted; 0 is the assumed
# stand-in for it - confirm it matches the intended analysis.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['YearsCode']: that chained in-place update is not guaranteed to reach df
# and is a no-op when pandas Copy-on-Write is enabled.
df['YearsCode'] = df['YearsCode'].replace(
    {'Less than 1 year': 0, 'More than 50 years': 51}
)

In [50]:
# Cast to float (not int) so that NaN answers are preserved. This needs every
# remaining non-missing value to be a number or a numeric string.
df["YearsCode"] = df["YearsCode"].astype("float64")

In [51]:
# Average years of coding experience, now that the column is numeric.
df['YearsCode'].mean()

11.662114216834588

In [52]:
# Re-evaluates the same mean as the previous cell (duplicate cell; the result is unchanged).
df['YearsCode'].mean()

11.662114216834588