- Importing necessary libraries
- Utility functions for Excel formatting (e.g., adjusting column width)

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np


# Excel formatting libraries
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import PatternFill, Font, Border, Side, Alignment
from openpyxl.utils import get_column_letter

In [4]:
def adjust_column_width(worksheet):
    """Resize every column of ``worksheet`` to fit its longest cell value.

    Each column's width is set to the length of its longest rendered value
    plus two characters of padding. Values are measured through ``str()`` so
    that numbers and other non-string cells count as well; empty (``None``)
    cells are ignored. A column with no values ends up with a width of 2.

    Args:
        worksheet: An openpyxl worksheet; it is modified in place through
            ``worksheet.column_dimensions``.
    """
    for column in worksheet.columns:
        # column_dimensions is keyed by letter, while cells expose a numeric index.
        column_letter = get_column_letter(column[0].column)

        max_length = max(
            (len(str(cell.value)) for cell in column if cell.value is not None),
            default=0,
        )

        # Two extra characters so the text does not touch the cell border.
        worksheet.column_dimensions[column_letter].width = max_length + 2

In [5]:
def extract_text_from_html(html):
    """Return the text content of ``html`` as extracted by BeautifulSoup's 'html.parser'."""
    return BeautifulSoup(html, 'html.parser').get_text()

In [6]:
# Read both seasons. df_copy / df_copy2 keep the frames exactly as loaded,
# while df (FIFA 23) and df2 (FIFA 22) are the copies the cells below work on.
df_copy, df_copy2 = (
    pd.read_csv(csv_path)
    for csv_path in ('FIFA23_official_data.csv', 'FIFA22_official_data.csv')
)
df, df2 = df_copy.copy(), df_copy2.copy()

### Data Cleaning

- Standardizing data formats (e.g., converting values to numeric)
- Sorting and ordering based on position
- Handling missing values and dropping unnecessary columns

In [8]:
# Replace the HTML markup stored at column position 20 with its plain text,
# for every row that has a 'Position'.
# NOTE(review): column 20 is presumably the 'Position' column itself - confirm
# against the CSV header.
# The mask is computed once up front instead of re-running isna() over the
# whole column on every iteration.
fifa22_has_position = df2['Position'].notna().to_numpy()
for row in np.flatnonzero(fifa22_has_position):
    df2.iloc[row, 20] = extract_text_from_html(df2.iloc[row, 20])

In [9]:
# Replace the HTML markup stored at column position 20 with its plain text,
# for every row that has a 'Position'.
# NOTE(review): column 20 is presumably the 'Position' column itself - confirm
# against the CSV header.
# The mask is computed once up front instead of re-running isna() over the
# whole column on every iteration.
fifa23_has_position = df['Position'].notna().to_numpy()
for row in np.flatnonzero(fifa23_has_position):
    df.iloc[row, 20] = extract_text_from_html(df.iloc[row, 20])

In [10]:
# IDs are identifiers, not quantities: store them as text.
df['ID'] = df['ID'].astype(str)

# Keep only digits and the decimal point, e.g. '€110.5M' -> '110.5'.
value_digits = df['Value'].str.replace(r'[^\d.]', '', regex = True)

# A trailing 'M' / 'K' scales the figure; any other ending is taken at face value.
value_scale = df['Value'].str[-1].map({'M': 1000000, 'K': 1000}).fillna(1)

# Round before casting: float products such as 8.2 * 1000000 evaluate to
# 8199999.999999999, so plain truncation would lose one unit.
# A missing 'Value' stays NaN and makes the integer cast raise, rather than
# being silently turned into a number.
df['Value_No'] = (value_digits.astype(float) * value_scale).round().astype(int)

In [11]:
# IDs are identifiers, not quantities: store them as text.
df2['ID'] = df2['ID'].astype(str)

# Keep only digits and the decimal point, e.g. '€110.5M' -> '110.5'.
value_digits2 = df2['Value'].str.replace(r'[^\d.]', '', regex = True)

# A trailing 'M' / 'K' scales the figure; any other ending is taken at face value.
value_scale2 = df2['Value'].str[-1].map({'M': 1000000, 'K': 1000}).fillna(1)

# Round before casting: float products such as 8.2 * 1000000 evaluate to
# 8199999.999999999, so plain truncation would lose one unit.
# A missing 'Value' stays NaN and makes the integer cast raise, rather than
# being silently turned into a number.
df2['Value_No'] = (value_digits2.astype(float) * value_scale2).round().astype(int)

In [14]:
# Keep only the reporting columns and discard players without a position.
df_new = df[['ID', 'Name', 'Club', 'Position','Value', 'Overall', 'Value_No']].dropna(subset = ['Position'])

In [15]:
# Keep only the reporting columns and discard players without a position.
df_new2 = df2[['ID', 'Name', 'Club', 'Position','Value', 'Overall', 'Value_No']].dropna(subset = ['Position'])

In [16]:
position_order = ['GK', 'CB', 'LCB', 'RCB', 'LB', 'RB', 'LWB', 'RWB', 'CDM', 'LDM', 'RDM', 'CM', 'LCM', 'RCM', 'LM', 'RM', 'LAM', 'RAM', 'CAM', 'LW', 'RW', 'LF', 'RF', 'LS', 'RS', 'CF', 'ST', 'SUB', 'RES']

# Rank positions by their place in position_order; labels missing from the
# list become NaN.
position_rank = pd.Categorical(df_new['Position'], categories = position_order, ordered = True)

# assign() replaces the column outright. `df.loc[:, col] = categorical` writes
# into the existing object column on pandas >= 2.0, which leaves the dtype
# unchanged and makes the sort alphabetical instead of following position_order.
# The categorical is only needed as a sort key, so plain labels are stored
# again afterwards; this keeps the column dtype independent of the pandas
# version for the later merge / groupby steps.
df_new = (
    df_new
    .assign(Position = position_rank)
    .sort_values(by = ['Position', 'Value_No'], ascending = [True, False])
    .astype({'Position': object})
    .reset_index(drop = True)
)

In [17]:
position_order2 = ['GK', 'CB', 'LCB', 'RCB', 'LB', 'RB', 'LWB', 'RWB', 'CDM', 'LDM', 'RDM', 'CM', 'LCM', 'RCM', 'LM', 'RM', 'LAM', 'RAM', 'CAM', 'LW', 'RW', 'LF', 'RF', 'LS', 'RS', 'CF', 'ST', 'SUB', 'RES']

# Rank positions by their place in position_order2; labels missing from the
# list become NaN.
position_rank2 = pd.Categorical(df_new2['Position'], categories = position_order2, ordered = True)

# assign() replaces the column outright. `df.loc[:, col] = categorical` writes
# into the existing object column on pandas >= 2.0, which leaves the dtype
# unchanged and makes the sort alphabetical instead of following position_order2.
# The categorical is only needed as a sort key, so plain labels are stored
# again afterwards; this keeps the column dtype independent of the pandas
# version for the later merge / groupby steps.
df_new2 = (
    df_new2
    .assign(Position = position_rank2)
    .sort_values(by = ['Position', 'Value_No'], ascending = [True, False])
    .astype({'Position': object})
    .reset_index(drop = True)
)

- Merging two datasets for comparison
- Computing percentage differences between datasets

In [19]:
# Pair each player's 2023 row with their 2022 row. A player is kept only when
# both ID and Position match in the two seasons.
df_merge = pd.merge(
    df_new,
    df_new2,
    how = 'inner',
    on = ['ID', 'Position'],
    suffixes = ('_2023', '_2022'),
)

In [20]:
# The 2023 name and club become the plain 'Name' / 'Club' columns.
df_merge = df_merge.rename(columns = {'Name_2023': 'Name', 'Club_2023': 'Club'})

In [21]:
# Discard the 2022 name and club columns.
df_merge = df_merge.drop(columns = ['Name_2022', 'Club_2022'])

In [22]:
# Positions that actually occur in the merged data.
position_list = df_merge['Position'].unique()

# One subtotal row per position: mean rating and summed market value for each season.
# observed=True restricts the groups to positions that occur in df_merge. It has
# no effect on a plain string key, but prevents empty phantom groups when
# 'Position' is categorical.
df_merge_new = (
    df_merge
    .groupby('Position', observed = True)
    .agg({'Overall_2023':'mean', 'Overall_2022':'mean','Value_No_2023':'sum','Value_No_2022':'sum'})
    .reset_index()
)
df_merge_new['Position'] = df_merge_new['Position'].astype(str) + ' Subtotal'

# Interleave a '<position> Subtotal' label right after every position so the
# subtotal row sorts directly below its players. Labels already ending in
# ' Subtotal' are skipped, so re-running this cell does not duplicate them.
# Slice assignment keeps updating the same list object.
position_order[:] = [
    label
    for position in position_order
    if not position.endswith(' Subtotal')
    for label in (position, f'{position} Subtotal')
]

In [23]:
# Stack the player rows and the subtotal rows; columns missing on either side become blanks.
df_new_new = pd.concat([df_merge, df_merge_new]).fillna('')

is_subtotal = df_new_new['Position'].str.endswith(' Subtotal')

# For each season, label the subtotal rows with their summed value in millions, e.g. '€12.3M'.
for season in ('2023', '2022'):
    label_column = f'Value_{season}'
    millions = round(df_new_new.loc[is_subtotal, f'Value_No_{season}']/1000000,1)

    df_new_new.loc[is_subtotal, label_column] = millions
    df_new_new[label_column] = df_new_new[label_column].astype(str)
    df_new_new.loc[is_subtotal, label_column] = '€' + df_new_new.loc[is_subtotal, label_column] + 'M'

In [24]:
# Percentage change from 2022 to 2023, rounded to two decimals, for market value and overall rating.
for pct_column, metric in (('Value_No Pct Dif', 'Value_No'), ('Overall Pct Dif', 'Overall')):
    current = df_new_new[f'{metric}_2023']
    previous = df_new_new[f'{metric}_2022']
    df_new_new[pct_column] = round(((current - previous) / previous) * 100, 2)

In [25]:
pct_columns = ['Value_No Pct Dif', 'Overall Pct Dif']
overall_columns = ['Overall_2023', 'Overall_2022']

# Percentages keep two decimals; ratings are rounded to whole numbers and
# stored with the nullable integer dtype.
df_new_new[pct_columns] = df_new_new[pct_columns].round(2)
df_new_new[overall_columns] = df_new_new[overall_columns].round(0).astype('Int64')

df_new_new['Position'] = df_new_new['Position'].astype(str)

In [26]:
# Missing values become blanks in these columns and zeros in the others.
blank_when_missing = ['Value_No Pct Dif', 'Overall Pct Dif', 'Value_2022']
zero_when_missing = ['Overall_2023', 'Overall_2022', 'Value_2023']

df_new_new[blank_when_missing] = df_new_new[blank_when_missing].fillna('')
df_new_new[zero_when_missing] = df_new_new[zero_when_missing].fillna(0)

In [27]:
# Keep position_order's ordering but only the labels that occur in the report.
# The labels present are collected once into a set, instead of recomputing
# unique() for every element of position_order.
positions_in_report = set(df_new_new['Position'].unique())
merge_position_order = [element for element in position_order if element in positions_in_report]

In [28]:
# Order rows by position (following merge_position_order), then by 2023 value
# from highest to lowest, and renumber the rows from 0.
df_new_new = (
    df_new_new
    .assign(Position = pd.Categorical(df_new_new['Position'], categories=merge_position_order, ordered=True))
    .sort_values(by=['Position', 'Value_No_2023'], ascending=[True, False])
    .reset_index(drop = True)
)

In [29]:
# Final column layout of the report, in the order it is written to the worksheet.
report_columns = [
    'ID', 'Name', 'Club', 'Position',
    'Value_2023', 'Value_2022', 'Value_No Pct Dif',
    'Overall_2023', 'Overall_2022', 'Overall Pct Dif',
]
df_new_new = df_new_new[report_columns]

In [30]:
# Store the rating columns and the position as text.
text_columns = ['Overall_2023', 'Overall_2022', 'Overall Pct Dif', 'Position']
df_new_new = df_new_new.astype({column: str for column in text_columns})

In [31]:
# Blank value-change cells are written as 0.
blank_value_pct = df_new_new['Value_No Pct Dif'] == ''
df_new_new.loc[blank_value_pct, 'Value_No Pct Dif'] = 0

In [32]:
# Number of data rows in the report (the header is not counted).
num_of_rows = df_new_new.shape[0]

In [33]:
# Solid light-grey fill
light_greyFill = PatternFill(fill_type='solid',
                             start_color='FFCCCCCC',
                             end_color='FFCCCCCC')

# Border presets: the same line style and colour on all four edges of a cell
_BORDER_EDGES = ('left', 'right', 'top', 'bottom')

thin_border = Border(**{edge: Side(style='thin', color = '00000000') for edge in _BORDER_EDGES})

thick_border = Border(**{edge: Side(style='thick', color = '00000000') for edge in _BORDER_EDGES})

### Transformation in Excel
- Summarizing value changes
- Formatting subtotal rows

In [35]:
wb = Workbook()
ws = wb.active

# Header first, then one worksheet row per DataFrame row.
# index=False: per the original note, index=True would add one row between
# the header and the values.
sheet_rows = dataframe_to_rows(df_new_new, index=False, header=True)
for sheet_row in sheet_rows:
    ws.append(sheet_row)

In [36]:
# Collapse the player rows of each position into a hidden outline group.
for name in position_list:
    member_rows = df_new_new.index[(df_new_new['Position'] == name).to_numpy()]

    # +2 turns a 0-based DataFrame index into a worksheet row number:
    # worksheet rows start at 1 and row 1 holds the header.
    start, end = member_rows[0] + 2, member_rows[-1] + 2
    ws.row_dimensions.group(start, end, hidden=True)


In [37]:
# Map each column name to the Excel column letter at the same position in the header row.
# NOTE(review): the names come from df_merge, but the worksheet was written from
# df_new_new, whose columns are named and ordered differently. The letters are
# therefore positional only and a name may not match the sheet column it points
# to - confirm this is intended.
list_of_columns_name = df_merge.columns
header_row = ws[1]

dict_col = {
    column_name: header_row[position].column_letter
    for position, column_name in enumerate(list_of_columns_name)
}


In [38]:
# Split df_merge's columns into the first four and the last six.
merged_columns = df_merge.columns
list_strings, list_values = merged_columns[:4], merged_columns[-6:]

In [39]:
# Display the first four column names of df_merge.
list_strings

Index(['ID', 'Name', 'Club', 'Position'], dtype='object')

In [40]:
# Display the last six column names of df_merge.
list_values

Index(['Value_2023', 'Overall_2023', 'Value_No_2023', 'Value_2022',
       'Overall_2022', 'Value_No_2022'],
      dtype='object')

In [41]:
# Thin border around every cell, header row included (rows 1 .. num_of_rows + 1).
for column_letter in dict_col.values():
    for sheet_row in range(1, num_of_rows + 2):
        ws[f'{column_letter}{sheet_row}'].border = thin_border

In [42]:
# Columns in list_strings: left-aligned, and bold on subtotal rows.
for col in list_strings:
    column_letter = dict_col[col]
    for sheet_row in range(1, num_of_rows + 2):
        cell = ws[f'{column_letter}{sheet_row}']
        cell.alignment = Alignment(horizontal='left')

        # A row is a subtotal row when its Position cell ends with 'Subtotal'.
        position_label = ws[f"{dict_col['Position']}{sheet_row}"].value
        if position_label.endswith('Subtotal'):
            cell.font = Font(bold = True)

In [43]:
# Columns in list_values: right-aligned, and bold on subtotal rows.
for col in list_values:
    column_letter = dict_col[col]
    for sheet_row in range(1, num_of_rows + 2):
        cell = ws[f'{column_letter}{sheet_row}']
        cell.alignment = Alignment(horizontal='right')

        # A row is a subtotal row when its Position cell ends with 'Subtotal'.
        position_label = ws[f"{dict_col['Position']}{sheet_row}"].value
        if position_label.endswith('Subtotal'):
            cell.font = Font(bold = True)

In [44]:
# Header row: bold, centred and wrapped text on a grey fill, framed with a
# thick border (this replaces the thin border set on row 1 earlier).
for column_letter in dict_col.values():
    header_cell = ws[f'{column_letter}1']
    header_cell.font = Font(bold = True)
    header_cell.alignment = Alignment(horizontal='center', vertical = 'center', wrapText = True)
    header_cell.border = thick_border
    header_cell.fill = light_greyFill

In [45]:
# Fit every column's width to its content.
adjust_column_width(ws)

In [46]:
# Write the formatted workbook to disk.
wb.save( "ManipulationsOfTwoDataframes.xlsx")