In [1]:
import pandas as pd

In [4]:
# extracting the data

users_data = pd.read_pickle("../data/users_data_final.pkl")


In [5]:
type(users_data)


pandas.core.frame.DataFrame

In [7]:
# preview the data
users_data.head()

Unnamed: 0,user_id,number_transactions,total_amount_usd,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,term_deposit,date_joined,device
0,9231c446-cb16-4b2b-a7f7-ddfc8b25aaf6,3.0,2143.0,58,management,married,tertiary,False,True,False,,261,1,-1,0,,False,1998-08-23,mobile
1,bb92765a-08de-4963-b432-496524b39157,,,44,technician,single,secondary,False,True,False,,151,1,-1,0,,False,2008-07-15,desktop
2,573de577-49ef-42b9-83da-d3cfb817b5c1,2.0,2.0,33,entrepreneur,married,secondary,False,True,True,,76,1,-1,0,,False,2002-06-04,mobile
3,d6b66b9d-7c8f-4257-a682-e136f640b7e3,,,47,blue-collar,married,,False,True,False,,92,1,-1,0,,False,1995-06-29,tablet
4,fade0b20-7594-4d9a-84cd-c02f79b1b526,1.0,1.0,33,,single,,False,False,False,,198,1,-1,0,,False,1995-08-01,mobile


In [8]:
# shape of the data
users_data.shape

(45209, 19)

In [9]:
# structure of the data
users_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45209 entries, 0 to 45215
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   user_id              45209 non-null  object        
 1   number_transactions  35210 non-null  float64       
 2   total_amount_usd     35210 non-null  float64       
 3   age                  45209 non-null  int64         
 4   job                  44921 non-null  object        
 5   marital              45209 non-null  object        
 6   education            43352 non-null  object        
 7   default              45209 non-null  bool          
 8   housing              45209 non-null  bool          
 9   loan                 45209 non-null  bool          
 10  contact              32191 non-null  object        
 11  duration             45209 non-null  int64         
 12  campaign             45209 non-null  int64         
 13  pdays                45209 non-null 

In [10]:
# if I want to look at the age of the data frame
# option 1
users_data.age

0        58
1        44
2        33
3        47
4        33
         ..
45211    51
45212    71
45213    72
45214    57
45215    37
Name: age, Length: 45209, dtype: int64

In [11]:
# option 2
users_data["age"]

0        58
1        44
2        33
3        47
4        33
         ..
45211    51
45212    71
45213    72
45214    57
45215    37
Name: age, Length: 45209, dtype: int64

In [14]:
# question: what is the average age of the users
# option 1 - use describe

users_data.age.describe()

count    45209.000000
mean        40.935853
std         10.618653
min         18.000000
25%         33.000000
50%         39.000000
75%         48.000000
max         95.000000
Name: age, dtype: float64

In [18]:
# the average age 

avg_age = round(users_data.age.mean())
avg_age

41

In [21]:
# let's subset the data frame based on the condition - where users are older than the average age
ud_above_avg_age = users_data[users_data.age > avg_age]
ud_above_avg_age.head()

Unnamed: 0,user_id,number_transactions,total_amount_usd,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,poutcome,term_deposit,date_joined,device
0,9231c446-cb16-4b2b-a7f7-ddfc8b25aaf6,3.0,2143.0,58,management,married,tertiary,False,True,False,,261,1,-1,0,,False,1998-08-23,mobile
1,bb92765a-08de-4963-b432-496524b39157,,,44,technician,single,secondary,False,True,False,,151,1,-1,0,,False,2008-07-15,desktop
3,d6b66b9d-7c8f-4257-a682-e136f640b7e3,,,47,blue-collar,married,,False,True,False,,92,1,-1,0,,False,1995-06-29,tablet
7,d20059f3-84b7-4ec5-b933-8bca3a3414af,1.0,2.0,42,entrepreneur,divorced,tertiary,True,True,False,,380,1,-1,0,,False,1991-12-09,mobile
8,0cedabc3-6141-43c6-988c-9ec2f14a8eb9,,,58,retired,married,primary,False,True,False,,50,1,-1,0,,False,1999-08-20,desktop


In [23]:
# how many customers are above the average age

ud_above_avg_age.shape[0]

19202

In [25]:
# check if there are any duplicated rows of data

users_data.duplicated().sum()

0

In [26]:
# investigate missing data
# sum() gives the absolute values
users_data.isnull().sum()

user_id                    0
number_transactions     9999
total_amount_usd        9999
age                        0
job                      288
marital                    0
education               1857
default                    0
housing                    0
loan                       0
contact                13018
duration                   0
campaign                   0
pdays                      0
previous                   0
poutcome               36957
term_deposit               0
date_joined               30
device                    94
dtype: int64

In [27]:
# mean() gives the % of missing values

users_data.isnull().mean()

user_id                0.000000
number_transactions    0.221173
total_amount_usd       0.221173
age                    0.000000
job                    0.006370
marital                0.000000
education              0.041076
default                0.000000
housing                0.000000
loan                   0.000000
contact                0.287952
duration               0.000000
campaign               0.000000
pdays                  0.000000
previous               0.000000
poutcome               0.817470
term_deposit           0.000000
date_joined            0.000664
device                 0.002079
dtype: float64

In [28]:
def identify_missing_data(df):
    """
    This function is used to identify missing data
    
    @param df pandas DataFrame
    
    @return a DataFrame with the percentage of missing data for every feature and the data types
    """
    
    percent_missing = df.isnull().mean()
    
    missing_value_df = pd.DataFrame(percent_missing).reset_index() # convert to DataFrame
    missing_value_df = missing_value_df.rename(columns = {"index" : "variable",
                                                                0 : "percent_missing"}) # rename columns

    missing_value_df = missing_value_df.sort_values(by = ['percent_missing'], ascending = False) # sort the values
    
    data_types_df = pd.DataFrame(df.dtypes).reset_index().rename(columns = {"index" : "variable",
                                                                0 : "data_type"}) # rename columns
    
    missing_value_df = missing_value_df.merge(data_types_df, on = "variable") # join the dataframe with datatype
    
    missing_value_df.percent_missing = round(missing_value_df.percent_missing*100, 2) # format the percent_missing
    
    return missing_value_df[missing_value_df.percent_missing > 0]

In [29]:
identify_missing_data(users_data)

Unnamed: 0,variable,percent_missing,data_type
0,poutcome,81.75,object
1,contact,28.8,object
2,total_amount_usd,22.12,float64
3,number_transactions,22.12,float64
4,education,4.11,object
5,job,0.64,object
6,device,0.21,object
7,date_joined,0.07,datetime64[ns]


In [31]:
type(users_data)

pandas.core.frame.DataFrame

In [32]:
fruit = ["apple", "banana"]
type(fruit)

list

In [55]:
# removing columns that had more than 70% missing data
users_data_cleaned = users_data.drop(['poutcome'], axis=1)

In [46]:
print(users_data.shape)
print(users_data_cleaned.shape)

(45209, 19)
(45209, 18)


In [47]:
identify_missing_data(users_data_cleaned)

Unnamed: 0,variable,percent_missing,data_type
0,contact,28.8,object
1,total_amount_usd,22.12,float64
2,number_transactions,22.12,float64
3,education,4.11,object
4,job,0.64,object
5,device,0.21,object
6,date_joined,0.07,datetime64[ns]


In [53]:
# removing rows of data where customers did not have a date joined 

# option 1: subset the data frame where date joined is not null
users_data_cleaned = users_data_cleaned[~users_data_cleaned.date_joined.isnull()]

In [54]:
print(users_data.shape)
print(users_data_cleaned.shape)

(45209, 19)
(45179, 18)


In [57]:
# option 2: use the drop na function to identify rows where date joined is not null
users_data_cleaned.dropna(subset=['date_joined'], inplace=True)

In [58]:
print(users_data.shape)
print(users_data_cleaned.shape)

(45209, 19)
(45179, 18)


In [69]:
# loading the data
import boto3
from io import StringIO 

# make a connection to s3

def connect_to_s3(aws_access_key_id, aws_secret_access_key):
    """Methods that connects to s3"""

    s3_client = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key
    )

    print("Connection to s3 made")

    return s3_client

In [67]:
# creating variables that contain connection info

aws_access_key_id=""
aws_secret_access_key=""
aws_s3_bucket="sep-bootcamp"


In [71]:
def load_df_to_s3(df, aws_s3_bucket, key, aws_access_key_id, aws_secret_access_key):
    """Function that writes a data frame as a .csv file to a s3 bucket"""
    
    s3_client = connect_to_s3(aws_access_key_id, aws_secret_access_key)
    
    csv_buffer = StringIO() # create buffer to temporarily store the Data Frame
    df.to_csv(csv_buffer, index=False) # code to write the data frame as csv file
    response = s3_client.put_object(
            Bucket=aws_s3_bucket, Key=key, Body=csv_buffer.getvalue()
        ) # this code writes the temp stored csv file and writes to s3
    print("This data has been loaded")

In [76]:
key = "transformations_monday/sh_users_data_cleaned.csv"

load_df_to_s3(users_data_cleaned, aws_s3_bucket, key, aws_access_key_id, aws_secret_access_key)

Connection to s3 made
This data has been loaded


In [77]:
# read a file from s3

# specify the key
key = "transformations_monday/sh_users_data_cleaned.csv" 

# make a connection to s3 and get the file from the bucket and key you specify
s3_client = connect_to_s3(aws_access_key_id, aws_secret_access_key)
response = s3_client.get_object(Bucket=aws_s3_bucket, Key=key)
my_file = pd.read_csv(response.get("Body"))

Connection to s3 made


In [78]:
my_file.head()

Unnamed: 0,user_id,number_transactions,total_amount_usd,age,job,marital,education,default,housing,loan,contact,duration,campaign,pdays,previous,term_deposit,date_joined,device
0,9231c446-cb16-4b2b-a7f7-ddfc8b25aaf6,3.0,2143.0,58,management,married,tertiary,False,True,False,,261,1,-1,0,False,1998-08-23,mobile
1,bb92765a-08de-4963-b432-496524b39157,,,44,technician,single,secondary,False,True,False,,151,1,-1,0,False,2008-07-15,desktop
2,573de577-49ef-42b9-83da-d3cfb817b5c1,2.0,2.0,33,entrepreneur,married,secondary,False,True,True,,76,1,-1,0,False,2002-06-04,mobile
3,d6b66b9d-7c8f-4257-a682-e136f640b7e3,,,47,blue-collar,married,,False,True,False,,92,1,-1,0,False,1995-06-29,tablet
4,fade0b20-7594-4d9a-84cd-c02f79b1b526,1.0,1.0,33,,single,,False,False,False,,198,1,-1,0,False,1995-08-01,mobile
