In [28]:
%pip install pandas 

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


### Imports

In [29]:
import pandas as pd
import os

### Data Loading

In [30]:
users = pd.read_csv('../DataGeneration/users.csv')
trainings = pd.read_csv('../DataGeneration/trainings.csv')
responses = pd.read_csv('../DataGeneration/responses.csv')
organisation_reviews = pd.read_csv('../DataGeneration/organisationReviews.csv')

In [31]:
os.makedirs('Raw', exist_ok=True)

In [32]:
users.to_csv('Raw/usersRaw.csv', index=False)
trainings.to_csv('Raw/trainingsRaw.csv', index=False)
responses.to_csv('Raw/responsesRaw.csv', index=False)
organisation_reviews.to_csv('Raw/organisationReviewsRaw.csv', index=False)

### Raw Layer

Data is being read from the Raw layer and will be prepared for the stagging layer 

##### Users

In [33]:
users_raw_df=pd.DataFrame(pd.read_csv('Raw/usersRaw.csv'))
users_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   userId       300 non-null    int64 
 1   userName     300 non-null    object
 2   email        300 non-null    object
 3   password     300 non-null    object
 4   role         300 non-null    object
 5   designation  300 non-null    object
 6   gender       300 non-null    object
 7   joinDate     300 non-null    object
dtypes: int64(1), object(7)
memory usage: 18.9+ KB


In [34]:
users_raw_df.drop('password',axis=1,inplace=True)
users_raw_df.drop('gender',axis=1,inplace=True)
users_raw_df.drop('email',axis=1,inplace=True)
users_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   userId       300 non-null    int64 
 1   userName     300 non-null    object
 2   role         300 non-null    object
 3   designation  300 non-null    object
 4   joinDate     300 non-null    object
dtypes: int64(1), object(4)
memory usage: 11.8+ KB


In [35]:
print(users_raw_df['joinDate'].dtype)
users_raw_df['joinDate']=pd.to_datetime(users_raw_df['joinDate'])
print(users_raw_df['joinDate'].dtype)
users_raw_df.info()

object
datetime64[ns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   userId       300 non-null    int64         
 1   userName     300 non-null    object        
 2   role         300 non-null    object        
 3   designation  300 non-null    object        
 4   joinDate     300 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 11.8+ KB


##### Trainings

In [45]:
trainings_raw_df=pd.DataFrame(pd.read_csv('Raw/trainingsRaw.csv'))
trainings_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   trainingId    100 non-null    int64 
 1   trainingName  100 non-null    object
 2   description   100 non-null    object
 3   domainName    100 non-null    object
 4   domainId      100 non-null    int64 
 5   duration      100 non-null    int64 
 6   startDate     100 non-null    object
dtypes: int64(3), object(4)
memory usage: 5.6+ KB


In [46]:
def assignDomainId(domainName):
    if domainName == 'DataEngineering':
        return 1
    elif domainName == 'FullStack':
        return 2
    elif domainName == 'DevOps':
        return 3
    elif domainName == 'DataScience':
        return 4
    elif domainName =='SoftSkills':
        return 5


trainings_raw_df['domainId'] = trainings_raw_df['domainName'].apply(assignDomainId)

In [48]:
trainings_raw_df.info()
trainings_raw_df.drop('description',axis=1,inplace=True)
trainings_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   trainingId    100 non-null    int64 
 1   trainingName  100 non-null    object
 2   description   100 non-null    object
 3   domainName    100 non-null    object
 4   domainId      100 non-null    int64 
 5   duration      100 non-null    int64 
 6   startDate     100 non-null    object
dtypes: int64(3), object(4)
memory usage: 5.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   trainingId    100 non-null    int64 
 1   trainingName  100 non-null    object
 2   domainName    100 non-null    object
 3   domainId      100 non-null    int64 
 4   duration      100 non-null    int64 
 5   startDate     100 non-null    object
dtypes: int64(3), object(3)
me

In [50]:
trainings_raw_df['startDate']=pd.to_datetime(trainings_raw_df['startDate'])
trainings_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   trainingId    100 non-null    int64         
 1   trainingName  100 non-null    object        
 2   domainName    100 non-null    object        
 3   domainId      100 non-null    int64         
 4   duration      100 non-null    int64         
 5   startDate     100 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 4.8+ KB


##### Responses 

In [61]:
responses_raw_df=pd.DataFrame(pd.read_csv('Raw/responsesRaw.csv'))
responses_raw_df['userId'].nunique()

300

In [72]:
pd.to_datetime(responses_raw_df['responseDate'])
print(responses_raw_df['responseDate'].dtypes)

object
