In [71]:
import pandas as pd
import numpy as np

In [72]:
# Load the three yearly exports; all of them share the same separator and encoding.
# NOTE(review): later cells strip a literal 'Ł' from the money columns, which
# suggests the files are really Latin-1/cp1252 and '£' is being mis-decoded by
# iso-8859-2 — confirm before changing, since a column name depends on it.
csv_options = {'encoding': 'iso-8859-2', 'delimiter': ';'}
df_2023 = pd.read_csv('../data/raw_data/uk_2023.csv', **csv_options)
df_2022 = pd.read_csv('../data/raw_data/uk_2022.csv', **csv_options)
df_2021 = pd.read_csv('../data/raw_data/uk_2021.csv', **csv_options)

In [73]:
# Stack the yearly frames row-wise (2023 first) and renumber the rows from 0.
yearly_frames = [df_2023, df_2022, df_2021]
result = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [74]:
# Merge the two rating columns: take 'colour_rating' where present and fall
# back to 'colour_rating.1' where it is missing.
primary_rating = result['colour_rating']
fallback_rating = result['colour_rating.1']
result['colour_rating_new'] = primary_rating.combine_first(fallback_rating)


In [75]:
# The two source rating columns are now redundant with 'colour_rating_new'.
result = result.drop(['colour_rating', 'colour_rating.1'], axis='columns')

# Cleaning text from columns

In [76]:
# Remove the literal 'Ł' character from every monetary text column.
# NOTE(review): 'Ł' is presumably a '£' mis-decoded by the iso-8859-2 read — confirm.
currency_columns = [
    'wlc_baseline_incl_NCG',
    'total_baseline',
    'forecast_incl_NGC',
    'yearly_forecast',
    'TOTAL Baseline Benefits (Łm)',
]
for column in currency_columns:
    # regex=False: treat the symbol as plain text, not as a pattern.
    result[column] = result[column].str.replace('Ł', '', regex=False)

In [77]:
# Helper that runs the whole clean-up for a single cell
def process_value(value):
    """Return a value that parses as a number, or NaN if none can be obtained.

    The input is returned unchanged when ``float(value)`` succeeds. Otherwise
    every ',' is removed (presumably thousands separators, e.g. ``'1,234.5'``)
    and the comma-free string is returned if it then parses. Everything else,
    including ``None`` and other non-string objects, yields ``np.nan``.

    Parameters
    ----------
    value : object
        Raw cell content, typically a string or NaN.

    Returns
    -------
    object
        ``value`` itself, the comma-free string, or ``np.nan``.
    """
    # Step 1: keep anything that already converts to a number.
    try:
        float(value)
        return value
    except (ValueError, TypeError):
        # TypeError covers None / pd.NA and similar objects; without it a
        # single such cell aborts the whole applymap call.
        pass

    # Step 2: strip the commas and try the conversion again.
    try:
        stripped = value.replace(',', '')
        float(stripped)
        return stripped
    except (ValueError, TypeError, AttributeError):
        return np.nan

# Apply the cleaner to several columns in one go
columns_to_process = [
    'total_baseline',
    'forecast_incl_NGC',
    'wlc_baseline_incl_NCG',
    'TOTAL Baseline Benefits (Łm)',
    'yearly_forecast',
]
cleaned_columns = result[columns_to_process].applymap(process_value)
result[columns_to_process] = cleaned_columns

# Show the result
print(result)


     year    project_number                               project_name   
0    2023   CO_0024_2021-Q2         Civil Service Pensions 2015 Remedy  \
1    2023   CO_0020_1718-Q4  Commercial Capability Expansion Programme   
2    2023   CO_0176_2223-Q3   Falcon IT Platform Refresh and Migration   
3    2023   CO_0027_2021-Q4                   Future Service Programme   
4    2023   CO_0033_2122-Q1                           GOV.UK One Login   
..    ...               ...                                        ...   
658  2021   HO_0042_2021-Q3                                   Cerberus   
659  2021   HO_0043_2021-Q4                  Future Suppliers Services   
660  2021   HO_0044_2021-Q4                        HMPO Transformation   
661  2021  BIS_0015_1516-Q1         Local Land Charges (LLC) Programme   
662  2021  HMT_0004_2021-Q2                               NS&I Rainbow   

    department                                 report_category   
0           CO  Government Transformation and

In [78]:
# Sanity check: list the distinct entries that are present but still cannot be
# parsed as numbers after cleaning (an empty array means nothing is left).
# NOTE(review): this cell used to call an `is_text` helper that is not defined
# in the visible notebook, so it failed with NameError on a fresh kernel. The
# replacement assumes the helper flagged non-numeric strings — confirm.
wlc_values = result['wlc_baseline_incl_NCG']
not_numeric = wlc_values.notna() & pd.to_numeric(wlc_values, errors='coerce').isna()
unique_texts = wlc_values[not_numeric].unique()

print(unique_texts)

[]


# Further cleaning

In [79]:
# Parse the start dates; entries that cannot be parsed become NaT instead of raising.
# NOTE(review): the recorded output suggests pandas warned on this line
# (format inference?) — consider an explicit `format=`/`dayfirst=` once the
# raw date layout is confirmed.
parsed_start = pd.to_datetime(result['start_date'], errors='coerce')
result['start_date'] = parsed_start

  result['start_date'] = pd.to_datetime(result['start_date'], errors='coerce')


In [80]:
# Parse the end dates; entries that cannot be parsed become NaT instead of raising.
# NOTE(review): the recorded output suggests pandas warned on this line
# (format inference?) — consider an explicit `format=`/`dayfirst=` once the
# raw date layout is confirmed.
parsed_end = pd.to_datetime(result['end_date'], errors='coerce')
result['end_date'] = parsed_end

  result['end_date'] = pd.to_datetime(result['end_date'], errors='coerce')


In [81]:
# Convert the cleaned monetary columns to a numeric dtype; anything that still
# fails to parse becomes NaN instead of raising.
numeric_columns = [
    'forecast_incl_NGC',
    'wlc_baseline_incl_NCG',
    'yearly_forecast',
    'TOTAL Baseline Benefits (Łm)',
    'total_baseline',
]
for column in numeric_columns:
    result[column] = pd.to_numeric(result[column], errors='coerce')

In [82]:
# Keep only the recognised rating labels; every other entry (including
# missing values) ends up as NaN.
# NOTE(review): matching is exact and case-sensitive, so a spelling such as
# 'Amber/Red' would be discarded while 'Amber/red' is kept — confirm against
# the raw data.
allowed_values = ['Amber', 'Green', 'Red', 'Amber/red', 'Amber/Green']
ratings = result['colour_rating_new']
# Vectorised membership test instead of a per-row Python lambda.
result['colour_rating_new'] = ratings.where(ratings.isin(allowed_values), np.nan)

In [83]:
# Inspect the distinct rating values that remain after filtering.
result['colour_rating_new'].unique()

array(['Amber', 'Green', 'Red', nan, 'Amber/red', 'Amber/Green'],
      dtype=object)

In [84]:
# Give the merged rating column its final name.
result = result.rename({'colour_rating_new': 'colour_rating'}, axis='columns')

In [85]:
# Add an all-NaN 'yearly_budget' column at the end of the frame.
# NOTE(review): this placeholder is dropped again further down, before
# 'total_baseline' is renamed to 'yearly_budget' — it looks removable, but
# only together with that later drop.
result = result.assign(yearly_budget=np.nan)

In [86]:
# Build one forecast column: use 'yearly_forecast' where available and fill
# its gaps from 'forecast_incl_NGC'.
primary_forecast = result['yearly_forecast']
fallback_forecast = result['forecast_incl_NGC']
result['yearly_forecast_new'] = primary_forecast.combine_first(fallback_forecast)

In [87]:
# Check the column set after adding the combined forecast column.
result.columns

Index(['year', 'project_number', 'project_name', 'department',
       'report_category', 'description_aims', 'rating_comment', 'start_date',
       'end_date', 'schedule_comment', 'total_baseline', 'forecast_incl_NGC',
       'variance_comment', 'wlc_baseline_incl_NCG', 'budget_comment',
       'TOTAL Baseline Benefits (Łm)', 'benefits_comment', 'yearly_forecast',
       'colour_rating', 'yearly_budget', 'yearly_forecast_new'],
      dtype='object')

In [88]:
# Both source forecast columns are superseded by 'yearly_forecast_new'.
result = result.drop(['yearly_forecast', 'forecast_incl_NGC'], axis='columns')

In [89]:
# Give the combined forecast column its final name.
result = result.rename({'yearly_forecast_new': 'yearly_forecast'}, axis='columns')

In [90]:
# Check the column set after consolidating the forecast columns.
result.columns

Index(['year', 'project_number', 'project_name', 'department',
       'report_category', 'description_aims', 'rating_comment', 'start_date',
       'end_date', 'schedule_comment', 'total_baseline', 'variance_comment',
       'wlc_baseline_incl_NCG', 'budget_comment',
       'TOTAL Baseline Benefits (Łm)', 'benefits_comment', 'colour_rating',
       'yearly_budget', 'yearly_forecast'],
      dtype='object')

In [91]:
# Remove the all-NaN placeholder so the name is free for the renamed
# 'total_baseline' column.
result = result.drop(['yearly_budget'], axis='columns')

In [96]:
# Final names for the budget and benefits columns (both renames in one call).
final_names = {
    'total_baseline': 'yearly_budget',
    'TOTAL Baseline Benefits (Łm)': 'total_benefits',
}
result = result.rename(columns=final_names)

In [99]:
# Column layout of the exported dataset.
column_order = [
    'project_name',
    'department',
    'colour_rating',
    'description_aims',
    'rating_comment',
    'start_date',
    'end_date',
    'schedule_comment',
    'yearly_budget',
    'yearly_forecast',
    'wlc_baseline_incl_NCG',
    'variance_comment',
    'budget_comment',
    'year',
    'report_category',
    'project_number',
    'total_benefits',
    'benefits_comment',
]

In [100]:
# Reorder the columns; a name missing from the frame raises KeyError.
result = result.loc[:, column_order]

In [101]:
# Confirm the final column order.
result.columns

Index(['project_name', 'department', 'colour_rating', 'description_aims',
       'rating_comment', 'start_date', 'end_date', 'schedule_comment',
       'yearly_budget', 'yearly_forecast', 'wlc_baseline_incl_NCG',
       'variance_comment', 'budget_comment', 'year', 'report_category',
       'project_number', 'total_benefits', 'benefits_comment'],
      dtype='object')

In [102]:
# Persist the cleaned 2021-2023 data.
# NOTE(review): the row index is written too, so it comes back as an
# 'Unnamed: 0' column on read; later cells drop that column, so switching to
# index=False has to be coordinated with them.
result.to_csv(path_or_buf='../data/raw_data/2021_2023.csv')

In [None]:
# Load the two partial datasets that are merged in the following cells.
# NOTE(review): the 2014-2017 file is read from the working directory here but
# from '../data/raw_data/' in a later cell — confirm which location is current.
rauschgold_path = '../data/raw_data/2021_2023.csv'
emna_path = 'df_2014_2017.csv'
df_rauschgold = pd.read_csv(rauschgold_path)
df_emna = pd.read_csv(emna_path)

In [None]:
# Stack both partial datasets row-wise and renumber the rows from 0.
partial_frames = [df_rauschgold, df_emna]
result = pd.concat(partial_frames, axis=0, ignore_index=True)

In [None]:
# Inspect the columns of the merged frame.
result.columns

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index an earlier to_csv
# wrote into the file). errors='ignore' keeps the cell re-runnable: without
# it a second execution, or inputs saved without an index, raise KeyError.
result = result.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Write the merged frame to the working directory.
# NOTE(review): the row index is written as well and will reappear as
# 'Unnamed: 0' when the file is read back — confirm whether consumers rely on it.
result.to_csv(path_or_buf='emna_franzi.csv')

In [None]:
# Load the three period files (2021-2023, 2014-2017, 2018-2020) from the raw data folder.
path_2021_2023 = '../data/raw_data/2021_2023.csv'
path_2014_2017 = '../data/raw_data/df_2014_2017.csv'
path_2018_2020 = '../data/raw_data/uk_2018-2020.csv'
df1 = pd.read_csv(path_2021_2023)
df2 = pd.read_csv(path_2014_2017)
df3 = pd.read_csv(path_2018_2020)

In [None]:
# Stack all three period frames row-wise and renumber the rows from 0.
period_frames = [df1, df2, df3]
result = pd.concat(period_frames, axis=0, ignore_index=True)

In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index an earlier to_csv
# wrote into the files). errors='ignore' keeps the cell re-runnable: without
# it a second execution, or inputs saved without an index, raise KeyError.
result = result.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Write the combined dataset to the working directory.
# NOTE(review): the row index is written as well and will reappear as
# 'Unnamed: 0' when the file is read back — confirm whether consumers rely on it.
result.to_csv(path_or_buf='dataset_EDA.csv')

In [None]:
df