Transformation Tasks
- Identifying and handling duplicated data
- Identifying and handling missing values

In [1]:
# libraries we need for today

import pandas as pd


In [2]:
# load the users table from its pickle file and preview the first rows
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source

users_data = pd.read_pickle(filepath_or_buffer="../data/users_data_final.pkl")
users_data.head(n=5)

Unnamed: 0,user_id,number_transactions,total_amount_usd,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,term_deposit,date_joined,device
0,9231c446-cb16-4b2b-a7f7-ddfc8b25aaf6,3.0,2143.0,58,management,married,tertiary,False,True,False,,261,1,-1,0,,False,1998-08-23,mobile
1,bb92765a-08de-4963-b432-496524b39157,,,44,technician,single,secondary,False,True,False,,151,1,-1,0,,False,2008-07-15,desktop
2,573de577-49ef-42b9-83da-d3cfb817b5c1,2.0,2.0,33,entrepreneur,married,secondary,False,True,True,,76,1,-1,0,,False,2002-06-04,mobile
3,d6b66b9d-7c8f-4257-a682-e136f640b7e3,,,47,blue-collar,married,,False,True,False,,92,1,-1,0,,False,1995-06-29,tablet
4,fade0b20-7594-4d9a-84cd-c02f79b1b526,1.0,1.0,33,,single,,False,False,False,,198,1,-1,0,,False,1995-08-01,mobile


In [3]:
# summary of the data: index range, non-null count per column and column dtypes

users_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45209 entries, 0 to 45215
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              45209 non-null  object        
 1   number_transactions  35210 non-null  float64       
 2   total_amount_usd     35210 non-null  float64       
 3   age                  45209 non-null  int64         
 4   job                  44921 non-null  object        
 5   marital              45209 non-null  object        
 6   education            43352 non-null  object        
 7   default              45209 non-null  bool          
 8   housing              45209 non-null  bool          
 9   loan                 45209 non-null  bool          
 10  contact              32191 non-null  object        
 11  duration             45209 non-null  int64         
 12  campaign             45209 non-null  int64         
 13  pdays                45209 non-null 

In [4]:
# count the columns whose dtype is float64, int64 or bool (booleans are counted as numerical here)
print("# of numerical variables:",
      users_data.select_dtypes(include=['float64', 'int64', 'bool']).shape[1])


# of numerical variables: 11


In [5]:
# count the columns stored with the object dtype

print("# of variables with data type object:",
      users_data.select_dtypes(include=['object']).shape[1])


# of variables with data type object: 7


### Identifying and Removing Duplicate Rows

In [6]:
# select the rows that repeat an earlier row across all columns

users_data.loc[users_data.duplicated()]

# the result is an empty DataFrame, so there is currently no duplicated data

Unnamed: 0,user_id,number_transactions,total_amount_usd,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,term_deposit,date_joined,device


In [7]:
# count the duplicated rows: summing the boolean mask counts its True values
users_data.duplicated().sum()

0

### Identifying and Handling Missing Data

In [8]:
# count the missing values in every column

users_data.isna().sum()

user_id                    0
number_transactions     9999
total_amount_usd        9999
age                        0
job                      288
marital                    0
education               1857
default                    0
housing                    0
loan                       0
contact                13018
duration                   0
campaign                   0
pdays                      0
previous                   0
poutcome               36957
term_deposit               0
date_joined               30
device                    94
dtype: int64

In [9]:
def identify_missing_data(df):
    """
    This function is used to identify missing data

    @param df pandas DataFrame

    @return a DataFrame with one row for every feature that has at least one missing value,
            sorted from the most to the least missing, with the columns:
              variable        - the feature name
              percent_missing - percentage of rows that are missing, rounded to 2 decimals
              data_type       - the dtype of the feature
            A feature with very few missing values can show a percent_missing of 0.0
            after rounding; it is still listed because it does contain missing data.
    """

    fraction_missing = df.isnull().mean() # share of missing rows per feature, between 0 and 1

    # isnull().mean() and dtypes are both ordered like df.columns, so the three
    # columns can be assembled positionally without a rename or a merge
    missing_value_df = pd.DataFrame({"variable" : fraction_missing.index,
                                     "percent_missing" : fraction_missing.values,
                                     "data_type" : df.dtypes.values})

    missing_value_df = (
        missing_value_df
        .sort_values(by = ['percent_missing'], ascending = False) # most missing first
        # filter on the unrounded fraction: rounding first would turn tiny fractions
        # into 0.0 and silently hide features that do have missing values
        .loc[lambda table: table.percent_missing > 0]
        .reset_index(drop = True) # row labels run 0..k-1 in display order
        .assign(percent_missing = lambda table: table.percent_missing.mul(100).round(2)) # format as a percentage
    )

    return missing_value_df

In [10]:
# build the missing-data report: every variable with missing values, its data type and the percentage missing

missing_df = identify_missing_data(df=users_data)
missing_df

Unnamed: 0,variable,percent_missing,data_type
0,poutcome,81.75,object
1,contact,28.8,object
2,total_amount_usd,22.12,float64
3,number_transactions,22.12,float64
4,education,4.11,object
5,job,0.64,object
6,device,0.21,object
7,date_joined,0.07,datetime64[ns]


In [11]:
# each row of missing_df describes one variable, so the row count is the number of affected variables
print("# of variables with missing values", missing_df.shape[0])

# of variables with missing values 8


In [12]:
# how many variables are missing in more than 70% of the rows

print("number of variables with more than 70% missing data:",
      int((missing_df.percent_missing > 70).sum()))


number of variables with more than 70% missing data: 1


In [13]:
# numerical here means a float64 or int64 data type
print("# of numerical variables with missing values:",
      int(((missing_df.data_type == "float64") |
           (missing_df.data_type == "int64")).sum()))


# of numerical variables with missing values: 2


In [14]:
# non numerical here means an object or datetime64[ns] data type
print("# of non numerical variables with missing values:",
      int(((missing_df.data_type == "object") |
           (missing_df.data_type == "datetime64[ns]")).sum()))


# of non numerical variables with missing values: 6


In [15]:
# drop columns that have more than 70% missing data
# The report is computed from users_data inside this cell instead of being read from
# missing_df, because missing_df is reassigned by a later cell; re-running this cell
# after that reassignment would otherwise find nothing to drop.

col_to_drop = list(
    identify_missing_data(users_data)
    .loc[lambda report: report.percent_missing > 70, "variable"]
)
users_data_cleaned = users_data.drop(columns = col_to_drop)

In [16]:
# rebuild the report on the cleaned data to see which variables still need handling or dropping
missing_df = identify_missing_data(df=users_data_cleaned)
missing_df

Unnamed: 0,variable,percent_missing,data_type
0,contact,28.8,object
1,total_amount_usd,22.12,float64
2,number_transactions,22.12,float64
3,education,4.11,object
4,job,0.64,object
5,device,0.21,object
6,date_joined,0.07,datetime64[ns]
