In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Importing the data
# Each CSV under data/ is read into its own DataFrame. The "_original" suffix marks
# the frames as loaded from disk; work is done on copies (e.g. df_users) instead.
df_users_original = pd.read_csv('data/users_table.csv')
df_country_original = pd.read_csv('data/country_table.csv')
df_events_original = pd.read_csv('data/events_table.csv')
df_testing_original = pd.read_csv('data/testing_table.csv')

FUNCTIONS FOR DATA EXPLORATION

In [33]:
def table_overlook(table):
    '''Print a quick overview of a table: its first rows, its shape and its column info.

    Parameters
    ----------
    table : pandas.DataFrame
        The table to summarise.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print("\n\nTHE FIRST 5 ROWS:\n")
    print(table.head())

    print("\n\nTABLE SHAPE:\n")
    print(table.shape)

    print("\n\nTABLE INFO:\n")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line after the report.
    table.info()

In [41]:
def duplicate_check(table):
    '''Print every row of the table that is an exact repeat of an earlier row.

    An empty DataFrame is printed when no row is repeated.
    '''
    repeated_rows_mask = table.duplicated()
    repeated_rows = table[repeated_rows_mask]
    print(repeated_rows)

In [46]:
def placeholder_checker(table, placeholders=('unknown', 'na', 'n/a', 'select', 'none', '?')):
    '''Print, per column, how many cells hold a placeholder that indicates missing information.

    Matching ignores letter case and surrounding whitespace, so 'Unknown',
    'N/A' or ' ? ' are counted too. Only string cells are compared; numbers
    and real missing values (NaN/None) are never counted.

    Parameters
    ----------
    table : pandas.DataFrame
        The table to scan.
    placeholders : iterable of str, optional
        Placeholder strings to look for. Defaults to the common markers
        'unknown', 'na', 'n/a', 'select', 'none' and '?'.

    Returns
    -------
    None
        The per-column counts are printed as a pandas Series.
    '''
    tokens = {str(token).strip().lower() for token in placeholders}

    def _count_placeholders(column):
        # Numeric columns cannot contain text placeholders; skip the per-cell scan.
        if pd.api.types.is_numeric_dtype(column):
            return 0
        # Cast to object first so categorical/extension columns map cell by cell.
        hits = column.astype(object).map(
            lambda cell: isinstance(cell, str) and cell.strip().lower() in tokens
        )
        return int(hits.sum())

    placeholder_checks = pd.Series(
        [_count_placeholders(column) for _, column in table.items()],
        index=table.columns,
        dtype='int64',
    )
    print(placeholder_checks)

USERS DATAFRAME - DATA EXPLORATION

In [6]:
# Keeping an original copy: df_users is the working frame, while
# df_users_original stays as it was loaded from the CSV.
df_users = df_users_original.copy()

In [34]:
table_overlook(df_users)



THE FIRST 5 ROWS:

   user_id gender  country_id        age
0        1      F           2  38.511265
1        2      F           2  45.765914
2        3    NaN           3  49.706807
3        4    NaN           1  25.539470
4        5      F           2  39.200308


TABLE SHAPE:

(987976, 4)


TABLE INFO:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987976 entries, 0 to 987975
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   user_id     987976 non-null  int64  
 1   gender      741692 non-null  object 
 2   country_id  987976 non-null  int64  
 3   age         987976 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 30.2+ MB
None


We can already see null values in the Gender column. The data types, though, match the expected format.

In [42]:
duplicate_check(df_users)

Empty DataFrame
Columns: [user_id, gender, country_id, age]
Index: []


No duplicate rows in this table.

In [47]:
# Checking for placeholder values that indicate missing information
placeholder_checker(df_users)

user_id       0
gender        0
country_id    0
age           0
dtype: int64


Everything seems alright, except for the NaN values in the Gender column. As a last check, let's look for invalid entries in the Age column, i.e. values below 0 or above 100.

In [48]:
# Rows whose age falls outside the 0-100 range (the bounds themselves count as valid)
invalid_age_entries = df_users.query('age < 0 or age > 100')
invalid_age_entries

Unnamed: 0,user_id,gender,country_id,age


We are done exploring the users dataframe, so we can move on to data cleaning.

USERS DATAFRAME - DATA CLEANING

The only thing to do here is to handle the NaN values in the Gender column.

In [60]:
# Quantifying the missing gender values, as a count and as a share of all rows
gender_null_count = df_users['gender'].isna().sum()
gender_null_pct = round((gender_null_count / len(df_users)) * 100, 2)

print("Null values:", gender_null_count)
print("Percentage:", gender_null_pct, "%\n")

Null values: 246284
Percentage: 24.93 %

