# W3 Tutorial Missing Data

In [1]:
# Import EDA & visualisation packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Configure default colour scheme for seaborn
sns.set(color_codes=True)

# Display all columns of the pandas df
pd.set_option('display.max_columns', None)

In [2]:
# Suppress warning messages
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read CSV file into Pandas DataFrame (df)
df = pd.read_csv('employees.csv')

In [4]:
# Return the first n rows
df.head()

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Maria,Female,130590.0,11.858,False,Finance
3,Jerry,Male,,9.34,True,Finance
4,Larry,Male,101004.0,1.389,True,Client Services


In python missing data is returned as NaN = "Not a Number" i.e. missing

In [5]:
# Print the dimensionality of the df
df.shape

(1000, 6)

In [6]:
# Print a concise summary of the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   First Name         931 non-null    object
 1   Gender             852 non-null    object
 2   Salary             998 non-null    object
 3   Bonus %            997 non-null    object
 4   Senior Management  932 non-null    object
 5   Team               957 non-null    object
dtypes: object(6)
memory usage: 47.0+ KB


Can see that there are missing values in each of the columns and the datatype is labelled as 'object' which tells us we have misformed data in the dataset

In [7]:
# Generate descriptive statistics for numerical attributes
df.describe()

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
count,931,852,998,997.0,932,957
unique,201,3,993,968.0,4,13
top,Marilyn,Female,91462,8.999,TRUE,Client Services
freq,11,428,2,3.0,467,105


In [8]:
# Total number of rows and columns
df.shape

# Rows containing duplicate data
duplicate_rows_df = df[df.duplicated()]
print('number of duplicate rows: ', duplicate_rows_df.shape)

number of duplicate rows:  (0, 6)


In [9]:
# Finding the null values.
print(df.isnull().sum())

First Name            69
Gender               148
Salary                 2
Bonus %                3
Senior Management     68
Team                  43
dtype: int64


In [10]:
df['Gender'].head(10)

0      Male
1      Male
2    Female
3      Male
4      Male
5      n.a.
6    Female
7    Female
8       NaN
9    Female
Name: Gender, dtype: object

In [11]:
df['Gender'].unique()

array(['Male', 'Female', 'n.a.', nan], dtype=object)

In [12]:
missing_value_formats = ['n.a.', '?', 'NA', 'n/a', 'na', '--']

In [13]:
df = pd.read_csv('employees.csv', na_values = missing_value_formats)

In [14]:
df['Gender'].head(10)

0      Male
1      Male
2    Female
3      Male
4      Male
5       NaN
6    Female
7    Female
8       NaN
9    Female
Name: Gender, dtype: object

In [15]:
# Function tries to convert values to an int, if it can't then it will return 'NaN'
def make_int(i):
    try:
        return int(i)
    except:
        return np.nan

In [16]:
df['Salary'] = df['Salary'].map(make_int)

In [17]:
df['Salary'].head(10)

0     97308.0
1     61933.0
2    130590.0
3         NaN
4    101004.0
5    115163.0
6     65476.0
7     45906.0
8         NaN
9    139852.0
Name: Salary, dtype: float64

In [18]:
# Finding the null values.
print(df.isnull().sum())

First Name            70
Gender               149
Salary                 5
Bonus %                4
Senior Management     71
Team                  48
dtype: int64


In [22]:
# Don't think this code is used, ignore
new_df = df.dropna(axis = 0), how = 'any')

new_df.shape

SyntaxError: cannot assign to function call (2241118156.py, line 1)

In [20]:
df['Gender'] = df['Gender'].fillna('Unknown')

In [21]:
df['Gender'].isnull().sum()

0

In [23]:
df['Salary'].isnull().sum()

5

In [24]:
df['Salary'].fillna(method = 'pad')

0       97308.0
1       61933.0
2      130590.0
3      130590.0
4      101004.0
         ...   
995    132483.0
996     42392.0
997     96914.0
998     60500.0
999    129949.0
Name: Salary, Length: 1000, dtype: float64

In [25]:
df['Salary'].fillna(method = 'bfill')

0       97308.0
1       61933.0
2      130590.0
3      101004.0
4      101004.0
         ...   
995    132483.0
996     42392.0
997     96914.0
998     60500.0
999    129949.0
Name: Salary, Length: 1000, dtype: float64

In [26]:
df['Salary'].fillna(df['Salary'].median())

0       97308.0
1       61933.0
2      130590.0
3       90370.0
4      101004.0
         ...   
995    132483.0
996     42392.0
997     96914.0
998     60500.0
999    129949.0
Name: Salary, Length: 1000, dtype: float64

In [27]:
df['Salary'].fillna(df['Salary'].mean())

0       97308.000000
1       61933.000000
2      130590.000000
3       90522.329648
4      101004.000000
           ...      
995    132483.000000
996     42392.000000
997     96914.000000
998     60500.000000
999    129949.000000
Name: Salary, Length: 1000, dtype: float64

In [28]:
df['Salary'].interpolate(method = 'linear', direction = 'forward')

0       97308.0
1       61933.0
2      130590.0
3      115797.0
4      101004.0
         ...   
995    132483.0
996     42392.0
997     96914.0
998     60500.0
999    129949.0
Name: Salary, Length: 1000, dtype: float64