# Source Analysis
### In this notebook I use pandas to explore the source data and apply some transformations to it.



# Imports

In [None]:
import pandas as pd
import json
from numpy import nan as Nan
import hashlib
import matplotlib.pyplot as plt

# Read url responses

In [None]:
# Load the raw users endpoint into a DataFrame with pandas' JSON reader.
url_users='https://619ca0ea68ebaa001753c9b0.mockapi.io/evaluation/dataengineer/jr/v1/users'
df_users = pd.read_json(url_users)
# Bare expression: as the last statement of a notebook cell it displays the frame.
df_users

# I decided to create a separate table from the "subscription" column, which holds arrays of objects. For the "profile" column I decided to expand the nested object into columns and add them to the main "users" table.

In [None]:
# Print the column names, dtypes and non-null counts of the raw users frame.
df_users.info()

In [None]:
# Load the raw messages endpoint into a DataFrame with pandas' JSON reader.
url_messages='https://619ca0ea68ebaa001753c9b0.mockapi.io/evaluation/dataengineer/jr/v1/messages'    
df_messages=pd.read_json(url_messages)
# Bare expression: as the last statement of a notebook cell it displays the frame.
df_messages

In [None]:
# Print the column names, dtypes and non-null counts of the raw messages frame.
df_messages.info()

# Functions that read the JSON data from a URL and load it into a DataFrame

In [None]:
def users():
    """Fetch the users endpoint and return a flattened, PII-masked table.

    The nested ``profile`` column is expanded into top-level columns, a fixed
    set of columns is kept, ``id`` is renamed to ``user_id``, the name,
    address and birth-date columns are replaced by their SHA-256 hex digests,
    and ``email`` is reduced to the part after the ``@``.

    Returns:
        pandas.DataFrame: one row per user with the columns listed in
        ``kept_columns`` below (``id`` exposed as ``user_id``).
    """
    endpoint = 'https://619ca0ea68ebaa001753c9b0.mockapi.io/evaluation/dataengineer/jr/v1/users'
    raw = pd.read_json(endpoint)

    # Turn the per-user profile objects into their own columns and place them
    # next to the raw columns (both frames share the default row index).
    profile_frame = pd.DataFrame(list(raw['profile']))
    combined = pd.concat([raw, profile_frame], axis=1)

    kept_columns = [
        'id', 'createdAt', 'updatedAt', 'firstName', 'lastName', 'address',
        'city', 'country', 'zipCode', 'email', 'birthDate', 'gender',
        'isSmoking', 'profession', 'income',
    ]
    result = combined[kept_columns].rename(columns={"id": "user_id"})

    def _sha256_hex(text):
        # One-way digest so the original value is not stored in the table.
        return hashlib.sha256(text.encode()).hexdigest()

    # Hide PII: stringify each sensitive column, then hash every value.
    for pii_column in ('firstName', 'lastName', 'address', 'birthDate'):
        result[pii_column] = result[pii_column].astype(str).apply(_sha256_hex)

    # Keep only the e-mail domain (everything following the "@").
    result['email'] = result['email'].str.extract('((?<=@).*)')

    return result

In [None]:
# Run the users pipeline; as a cell's last expression the returned frame is displayed.
users()

In [None]:
# Pie chart of total income per country.
df = users()
df['income'] = df['income'].astype(float)

# Aggregate only the income column. Summing every column would also try to
# "sum" the hashed text and date columns, which is meaningless and can raise
# on recent pandas versions.
income_by_country = df.groupby(['country'])[['income']].sum()

# Create the axes explicitly and hand them to pandas so the 16x8 size applies
# to this chart; DataFrame.plot without `ax` opens a figure of its own and
# would leave a separately created figure empty.
fig, ax = plt.subplots(figsize=(16, 8))
income_by_country.plot(kind='pie', y='income', title='Income by country', ax=ax)


In [None]:
def messages():
    """Fetch the messages endpoint and return it with message bodies hashed.

    Returns:
        pandas.DataFrame: the endpoint's rows, with the ``message`` column
        replaced by the SHA-256 hex digest of its string form.
    """
    endpoint = 'https://619ca0ea68ebaa001753c9b0.mockapi.io/evaluation/dataengineer/jr/v1/messages'
    frame = pd.read_json(endpoint)

    def _sha256_hex(text):
        # One-way digest so the message text itself is not kept.
        return hashlib.sha256(text.encode()).hexdigest()

    # Stringify first so every value can be encoded, then hash each one.
    frame['message'] = frame['message'].astype(str).apply(_sha256_hex)

    return frame

In [None]:
# Run the messages pipeline; as a cell's last expression the returned frame is displayed.
messages()

In [None]:
def subscriptions(df_users=None):
    """Build a one-row-per-subscription table keyed by user.

    Each user's ``subscription`` list is exploded so every subscription
    object becomes its own row, tagged with the owning user's id. Users with
    no subscription still get one row whose subscription fields are missing.

    Args:
        df_users: optional DataFrame with at least an ``id`` column and a
            ``subscription`` column holding a list of dicts per user. When
            omitted, the users endpoint is fetched (the original behaviour).

    Returns:
        pandas.DataFrame: columns ``createdAt``, ``startDate``, ``endDate``,
        ``status``, ``amount``, ``user_id``, followed by any additional keys
        found in the subscription objects. The index is a fresh 0..n-1 range.
    """
    if df_users is None:
        url_users = 'https://619ca0ea68ebaa001753c9b0.mockapi.io/evaluation/dataengineer/jr/v1/users'
        df_users = pd.read_json(url_users)

    base_columns = ['createdAt', 'startDate', 'endDate', 'status', 'amount', 'id']
    exploded = df_users[['id', 'subscription']].explode('subscription')

    # Collect plain records and build the frame once: DataFrame.append no
    # longer exists in pandas 2.x, and appending row by row was quadratic.
    records = []
    for user_id, entry in zip(exploded['id'], exploded['subscription']):
        if isinstance(entry, dict):
            # Copy instead of mutating the dict that lives in the source frame.
            records.append({**entry, 'id': user_id})
        else:
            # Empty list / missing value: keep the user with blank fields.
            records.append({'id': user_id})

    frame = pd.DataFrame(records)
    # Guarantee the expected columns exist, in a stable order, and keep any
    # unexpected keys after them rather than dropping data.
    extra_columns = [col for col in frame.columns if col not in base_columns]
    frame = frame.reindex(columns=base_columns + extra_columns)

    return frame.rename(columns={"id": "user_id"})

In [None]:
# Run the subscriptions pipeline; as a cell's last expression the returned frame is displayed.
subscriptions()