In this notebook, we aim to generate more data using `faker` (`pip install faker`) and use it for testing our model. 

we use the final dataset `final_wo_errors`

 **Everytime we run this notebook , a new csv file with new data will be saved and override the old data. This means team members will generate different data when working separately**

In [1]:
import numpy as np
import pandas as pd
from faker import Faker
from faker.providers import BaseProvider
import random

## 00. Preparing the data frame for faker 

In [2]:
df_original = pd.read_pickle('../data/pickle/final_wo_errors.pkl')
df_original.shape
df_original.columns

Index(['project_name', 'description_aims', 'rating_comment', 'start_date',
       'end_date', 'schedule_comment', 'yearly_budget', 'yearly_forecast',
       'wlc_baseline_incl_NCG', 'variance_comment', 'budget_comment', 'year',
       'report_category', 'project_number', 'total_benefits',
       'benefits_comment', 'department_CO', 'department_CPS', 'department_DBT',
       'department_DCMS', 'department_DEFRA', 'department_DEFRA & DFT',
       'department_DESNZ', 'department_DFE', 'department_DFID',
       'department_DFT', 'department_DHSC', 'department_DLUHC',
       'department_DSIT', 'department_DWP', 'department_FCDO',
       'department_HMLR', 'department_HMRC', 'department_HMT', 'department_HO',
       'department_MOD', 'department_MOJ', 'department_NCA', 'department_NS&I',
       'department_ONS', 'department_VOA', 'colour_rating_amber/green',
       'colour_rating_amber/red', 'colour_rating_green', 'colour_rating_red',
       'colour_rating_reset', 'year_duration', 'start_yea

In [3]:
# Load the data
df = pd.read_pickle('../data/pickle/final_wo_errors.pkl')

In [4]:
print(df.shape)
df.columns

(1728, 59)


Index(['project_name', 'description_aims', 'rating_comment', 'start_date',
       'end_date', 'schedule_comment', 'yearly_budget', 'yearly_forecast',
       'wlc_baseline_incl_NCG', 'variance_comment', 'budget_comment', 'year',
       'report_category', 'project_number', 'total_benefits',
       'benefits_comment', 'department_CO', 'department_CPS', 'department_DBT',
       'department_DCMS', 'department_DEFRA', 'department_DEFRA & DFT',
       'department_DESNZ', 'department_DFE', 'department_DFID',
       'department_DFT', 'department_DHSC', 'department_DLUHC',
       'department_DSIT', 'department_DWP', 'department_FCDO',
       'department_HMLR', 'department_HMRC', 'department_HMT', 'department_HO',
       'department_MOD', 'department_MOJ', 'department_NCA', 'department_NS&I',
       'department_ONS', 'department_VOA', 'colour_rating_amber/green',
       'colour_rating_amber/red', 'colour_rating_green', 'colour_rating_red',
       'colour_rating_reset', 'year_duration', 'start_yea

### Back to the basic data frame

Remove Dummies

 Note that our dataset contains categorical features (dummies features) for the department and color rating features. To avoid any mismatch with faker generated data, or ending up with extra columns when merging. It is safer and cleaner to generate the raw categorical data first with Faker, then apply get_dummies afterward. This way, you ensure that the one-hot encoding step is consistent across your entire dataset, including both real and fake data.

 

In [5]:
# Remove get dummies for the department  columns 

# Identify the prefix of the dummy columns (in this case, 'department_')
prefix = 'department_' 

# Select dummy columns
dummy_columns = [col for col in df.columns if col.startswith(prefix)]

# Reconstruct the original categorical column
df['department'] = df[dummy_columns].idxmax(axis=1).str.replace(prefix, '')

# Drop the dummy columns
df= df.drop(columns=dummy_columns)

# Display the reconstructed DataFrame
#print(df.columns)

In [6]:
# Remove get dummies for the colour rating columns 

# Identify the prefix of the dummy columns (in this case, 'department_')
prefix_02 = 'colour_rating_' 

# Select dummy columns
dummy_columns_02 = [col for col in df.columns if col.startswith(prefix_02)]

# Reconstruct the original categorical column
df['colour_rating_'] = df[dummy_columns_02].idxmax(axis=1).str.replace(prefix_02, '')

# Drop the dummy columns
df = df.drop(columns=dummy_columns_02)

# Display the reconstructed DataFrame
#print(df.columns)
#print(df.shape)

Remove the macro data 

In [7]:
df_no_macro = df.drop(columns=['GDP', 'annual_earning_ft', 'unemployment_rate', 'youth_unemployment_rate', 'inflation_rate', 'population', 'gov_debt', 'tax_revenue', 'revenue_excl_grants', 'grants_and_other_revenue'])
#print(df_no_macro.shape)
#print(df_no_macro.columns)

In [8]:
df_macro = df[['GDP', 'annual_earning_ft', 'unemployment_rate', 'youth_unemployment_rate', 'inflation_rate', 'population', 'gov_debt', 'tax_revenue', 'revenue_excl_grants', 'grants_and_other_revenue','year']]

In [9]:
df_float_feat = df_no_macro[['start_date','end_date','yearly_budget', 'yearly_forecast', 'wlc_baseline_incl_NCG', 'year','year_duration','total_benefits', 'start_year', 'end_year']]
print(df_float_feat.shape)
df_float_feat.describe()

(1728, 10)


Unnamed: 0,start_date,end_date,yearly_budget,yearly_forecast,wlc_baseline_incl_NCG,year,year_duration,total_benefits,start_year,end_year
count,1581,1617,1590.0,1592.0,1522.0,1727.0,1501.0,343.0,1581.0,1617.0
mean,2013-06-27 19:25:50.664136704,2023-02-05 22:43:24.823747584,187.029507,173.421805,2431.478031,2018.756225,9.454024,7535.583294,2013.065149,2022.572665
min,1987-11-06 00:00:00,2011-10-18 00:00:00,0.0,0.0,0.0,2014.0,-0.08,5.0,1987.0,2011.0
25%,2011-02-28 00:00:00,2018-03-31 00:00:00,12.5125,11.0425,105.5375,2016.0,4.36,402.0,2011.0,2018.0
50%,2013-07-07 00:00:00,2022-05-31 00:00:00,45.135,39.572938,379.835,2019.0,6.93,2202.0,2013.0,2022.0
75%,2016-09-01 00:00:00,2025-04-30 00:00:00,152.525,132.2475,1499.615,2022.0,12.07,6418.0,2016.0,2025.0
max,2023-02-20 00:00:00,2075-12-31 00:00:00,12445.0,11988.15,55700.0,2023.0,67.5,70638.0,2023.0,2075.0
std,,,551.113927,524.737498,6000.761737,3.115428,7.842324,14184.648917,4.965469,7.037018


In [10]:
df_comment_feat = df_no_macro[['project_name', 'description_aims', 'rating_comment','schedule_comment','variance_comment', 'budget_comment',  'report_category', 'project_number','benefits_comment']]
#df_comment_feat.head()
df_comment_feat.shape # 9 features 

(1728, 9)

## 1. Faker: defining customized fake features  


Float/Money features 

In [11]:
class CustomFinancialProvider(BaseProvider):
    def yearly_budget(self):
        # Generate a random yearly budget in millions
        return round(random.uniform(1, 50), 2)  # Between 1 and 50 million

    def yearly_forecast(self):
        # Generate a random yearly forecast, similar to yearly budget but with slight variance
        return round(random.uniform(0.8, 1.2) * self.yearly_budget(), 2)

    def whole_life_cost(self, yearly_budget, duration):
        # Whole life cost is roughly yearly budget * duration (years)
        return round(yearly_budget * duration, 2)

    def total_benefits(self):
        # Either returns zero or a random benefit amount
        if random.random() < 0.3:  # 30% chance of no monetized benefits
            return 0
        else:
            return round(random.uniform(0.5, 100), 2)  # Random benefit between 0.5 and 100 million


Categorical features 

In [12]:
# Define a custom provider class
class CustomProjectProvider(BaseProvider):
    def department(self):
        # Define custom project categories
        return self.random_element(['CO', 'DBT', 'DCMS', 'DEFRA', 'DESNZ', 'DFE', 'DFT', 'DHSC',
       'DLUHC', 'DSIT', 'DWP', 'FCDO', 'HMLR', 'HMRC', 'HMT', 'HO', 'MOD',
       'MOJ', 'NCA', 'ONS', 'VOA', 'DEFRA & DFT', 'DFID', 'NS&I', 'CPS'])
    

In [13]:
class CustomRatingProvider(BaseProvider):
    def colour_rating_(self, yearly_budget, yearly_forecast):
        # Assign rating based on the comparison between budget and forecast
        if yearly_budget <= yearly_forecast:
            return 'green'
        elif yearly_budget <= 1.1 * yearly_forecast:
            return 'amber/green'
        elif yearly_budget <= 1.3 * yearly_forecast:
            return 'amber/red'
        else:
            return 'red'


Dates features


In [14]:
class CustomDateProvider(BaseProvider):
    def start_date(self):
        # Generates a start date within a 10-year range
        return fake.date_between(start_date='-10y', end_date='-1y')

    def end_date(self, start_date):
        # Generates an end date that's after the start date
        return fake.date_between(start_date=start_date, end_date='+10y')

    def year_from_date(self, date):
        # Extracts the year from a date
        return date.year

    def duration_years(self, start_date, end_date):
        # Calculates the duration in years between two dates
        return end_date.year - start_date.year

Text features :

In [15]:
class CustomTextProvider(BaseProvider):
    def project_name(self):
        # Generates a comment as a sentence
        return fake.sentence()
    
    def description_aims(self):
        # Generates a project description as a short paragraph
        return fake.paragraph(nb_sentences=3)
      
    def rating_comment(self):
        # Generates a comment as a sentence
        return fake.sentence()
    
    def schedule_comment(self):
        # Generates a comment as a sentence
        return fake.sentence()

    def variance_comment(self):
        # Generates a comment as a sentence
        return fake.sentence()
    
    def budget_comment(self):
        # Generates a comment as a sentence
        return fake.sentence()
 
    def report_category(self):
        # Generates a project description as a short paragraph
        return fake.paragraph(nb_sentences=3)

    def project_number(self):
        # Generates a comment as a sentence
        return fake.sentence()
    
    def benefits_comment(self):
        # Generates a comment as a sentence
        return fake.sentence()

## 2. Custom provider 



custom provider to the Faker instance to start using it:

In [16]:
# Initialize Faker
fake = Faker()

# Add the custom provider
fake.add_provider(CustomProjectProvider)
fake.add_provider(CustomFinancialProvider)
fake.add_provider(CustomDateProvider)
fake.add_provider(CustomTextProvider)
fake.add_provider(CustomRatingProvider)


#fake.add_provider(CustomEconomicProvider) this could be useful for the gpd or unemployment 

## 3. Generating the customized fake features  
 

In [17]:
# Number of rows to generate
num_rows = 1728  

data = []
for _ in range(num_rows):
    start_date = fake.start_date()
    end_date = fake.end_date(start_date=start_date)
    duration = fake.duration_years(start_date, end_date)
    yearly_budget = fake.yearly_budget()
    yearly_forecast = fake.yearly_forecast()
    whole_life_cost = fake.whole_life_cost(yearly_budget, duration)
    total_benefits = fake.total_benefits()
    colour_rating_ = fake. colour_rating_(yearly_budget, yearly_forecast)
    
    # Append the data
    data.append({
        'start_date': start_date,
        'end_date': end_date,
        'start_year': fake.year_from_date(start_date),
        'end_year': fake.year_from_date(end_date),
        'year_duration': duration, #5
        
        'yearly_budget': yearly_budget,
        'yearly_forecast': yearly_forecast,
        'wlc_baseline_incl_NCG': whole_life_cost,
        'total_benefits': total_benefits, #4

        'colour_rating': colour_rating_, 
        'department': fake.department(),# 2
        
        'project_name': fake.project_name(),
        'description_aims': fake.description_aims(),
        'rating_comment': fake.rating_comment(),
        'schedule_comment': fake.schedule_comment(),
        'variance_comment': fake.variance_comment(),
        'budget_comment': fake.budget_comment(),
        'report_category': fake.report_category(),
        'project_number': fake.project_number(),
        'benefits_comment': fake.benefits_comment()    #9  
    
        
    })

# Convert to DataFrame
df_with_faker = pd.DataFrame(data)



In [18]:
#df_with_faker
#print(df_with_faker.shape)
#print(df_with_faker.columns)
#df_with_faker.dtypes

# 4. Cleaning of df_with_faker 

1. start date , end date  : should be date time 


In [19]:
# Convert the 'start_date' and 'end_date' columns to datetime
df_with_faker['start_date'] = pd.to_datetime(df_with_faker['start_date'])
df_with_faker['end_date'] = pd.to_datetime(df_with_faker['end_date'])



2. colour rating  and department : should be categorical , then create dummies for that

In [20]:
# List of columns you know are categorical but were not detected
categorical_col = ['department', 'colour_rating'] 

# Convert these columns to category type
for col in categorical_col:
    df_with_faker[col] = df_with_faker[col].astype('category')

# to check the type after changes 
#df_with_faker.dtypes

dumm = pd.get_dummies(df_with_faker[['department','colour_rating']], drop_first=False)

# Concatenate the original dataframe with the dummy variables
df_faker_dumm = pd.concat([df_with_faker, dumm], axis=1)

# Drop original categorical columns
df_faker_dumm_01= df_faker_dumm.drop(['colour_rating', 'department'], axis=1)

In [21]:
#print(df_faker_dumm_01.shape)
#df_faker_dumm_01.columns

3. text column :  better  that they all contain `fake data` str type. in case of concat wit original we can differentiate the real from the fake 



In [22]:
# List of columns you know are categorical but were not detected
text_col = ['project_name','description_aims','rating_comment','schedule_comment','variance_comment','budget_comment','report_category','benefits_comment']  
# Convert these columns to category type
for col in text_col:
    df_faker_dumm_01[col] = "fake-generated data"

4. project_number :shouldn't be radom text.it should be unique

In [23]:
# Generate unique alphanumeric project IDs
df_faker_dumm_01['project_number'] = [f"Fake_PRJ{str(fake.unique.random_int(min=1000, max=9999))}" for _ in range(len(df_faker_dumm_01))]

In [24]:
#df_faker_dumm_01.head(5)

5. Organizing the table along the `year` feature

In [25]:
df_faker_dumm_01['start_year'].value_counts()

start_year
2020    208
2019    206
2016    204
2017    198
2015    196
2018    186
2022    179
2021    176
2023    122
2014     53
Name: count, dtype: int64

In [26]:
df_faker_dumm_01['year'] = df_faker_dumm_01['start_date'].dt.year

In [27]:
# Sort both DataFrames by 'year'
df_faker_dumm_01 = df_faker_dumm_01.sort_values(by='year').reset_index(drop=True)

**!!!** I saved this set of data generated with faker in a csv file . because everytime i run the notebook we have a new set of data. the data is randomly distributed throughout the years and the department etc ... 

In [28]:
df_faker_dumm_01.to_csv('fake_data_wo_macro.csv', index=False)

In [29]:
faked = pd.read_csv('fake_data_wo_macro.csv')

In [30]:
print(faked.shape)
print(faked.columns)



(1728, 48)
Index(['start_date', 'end_date', 'start_year', 'end_year', 'year_duration',
       'yearly_budget', 'yearly_forecast', 'wlc_baseline_incl_NCG',
       'total_benefits', 'project_name', 'description_aims', 'rating_comment',
       'schedule_comment', 'variance_comment', 'budget_comment',
       'report_category', 'project_number', 'benefits_comment',
       'department_CO', 'department_CPS', 'department_DBT', 'department_DCMS',
       'department_DEFRA', 'department_DEFRA & DFT', 'department_DESNZ',
       'department_DFE', 'department_DFID', 'department_DFT',
       'department_DHSC', 'department_DLUHC', 'department_DSIT',
       'department_DWP', 'department_FCDO', 'department_HMLR',
       'department_HMRC', 'department_HMT', 'department_HO', 'department_MOD',
       'department_MOJ', 'department_NCA', 'department_NS&I', 'department_ONS',
       'department_VOA', 'colour_rating_amber/green',
       'colour_rating_amber/red', 'colour_rating_green', 'colour_rating_red',
    

In [31]:
faked['project_number'].duplicated().value_counts()

project_number
False    1728
Name: count, dtype: int64

In [32]:
#faked.dtypes

`year` feature in `faked` is int, in macro and other real data is float . To unify the data frame, I changed it to int in faked data 

In [33]:
faked['year'] = faked['year'].astype(float)
#faked.dtypes

# 5. Merging faked with macro


In [34]:
df_macro = df_macro.sort_values(by='year').reset_index(drop=True)
#print(df_macro.shape)
#print(df_macro.columns)
#print(df_macro.dtypes)

In [35]:
df_faked_macro_merged = faked.merge(df_macro, on='year', how='left')

In [36]:
df_faked_macro_merged.shape

(287234, 58)

In [37]:
df_faked_macro_merged['project_number'].duplicated().value_counts()

project_number
True     285506
False      1728
Name: count, dtype: int64

In [38]:
# Keep only the unique entries based on 'project_number'
df_faked_macro_merged_unique = df_faked_macro_merged.drop_duplicates(subset=['project_number'], keep='first')


In [39]:
df_faked_macro_merged_unique.columns

Index(['start_date', 'end_date', 'start_year', 'end_year', 'year_duration',
       'yearly_budget', 'yearly_forecast', 'wlc_baseline_incl_NCG',
       'total_benefits', 'project_name', 'description_aims', 'rating_comment',
       'schedule_comment', 'variance_comment', 'budget_comment',
       'report_category', 'project_number', 'benefits_comment',
       'department_CO', 'department_CPS', 'department_DBT', 'department_DCMS',
       'department_DEFRA', 'department_DEFRA & DFT', 'department_DESNZ',
       'department_DFE', 'department_DFID', 'department_DFT',
       'department_DHSC', 'department_DLUHC', 'department_DSIT',
       'department_DWP', 'department_FCDO', 'department_HMLR',
       'department_HMRC', 'department_HMT', 'department_HO', 'department_MOD',
       'department_MOJ', 'department_NCA', 'department_NS&I', 'department_ONS',
       'department_VOA', 'colour_rating_amber/green',
       'colour_rating_amber/red', 'colour_rating_green', 'colour_rating_red',
       'year', 

In [40]:
df_fake_final = df_faked_macro_merged_unique

In [41]:
df_fake_final.to_csv('fake_final.csv', index=False)

In [42]:
df_fake_final.shape

(1728, 58)

**!!!**  Original has one more feature! it's `'colour_rating_reset' `categorical features. 


