In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings

warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Load the dataset
df = pd.read_csv('fifa_eda_stats.csv')

# Display basic info
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head())

Dataset shape: (18207, 57)

First 5 rows:


Unnamed: 0,ID,Name,Age,Nationality,Overall,Potential,Club,Value,Wage,Preferred Foot,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,158023,L. Messi,31,Argentina,94,94,FC Barcelona,€110.5M,€565K,Left,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,20801,Cristiano Ronaldo,33,Portugal,94,94,Juventus,€77M,€405K,Right,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,190871,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,€118.5M,€290K,Right,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,193080,De Gea,27,Spain,91,93,Manchester United,€72M,€260K,Right,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,192985,K. De Bruyne,27,Belgium,91,92,Manchester City,€102M,€355K,Right,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [6]:
# Clean currency columns and convert to numeric
def clean_currency(value):
    if isinstance(value, str):
        value = value.replace('€', '').replace('M', 'e6').replace('K', 'e3')
        return float(value)
    return value

df['Value'] = df['Value'].apply(clean_currency)
df['Wage'] = df['Wage'].apply(clean_currency)
df['Release Clause'] = df['Release Clause'].apply(clean_currency)

# Clean height and weight columns
def clean_height(height):
    if isinstance(height, str):
        feet, inches = height.split("'")
        return int(feet) * 30.48 + int(inches.replace('"', '')) * 2.54
    return height

def clean_weight(weight):
    if isinstance(weight, str):
        return float(weight.replace('lbs', '')) * 0.453592
    return weight

df['Height'] = df['Height'].apply(clean_height)
df['Weight'] = df['Weight'].apply(clean_weight)

# Convert Joined to datetime
df['Joined'] = pd.to_datetime(df['Joined'], errors='coerce')

# Handle missing values
df.fillna({'Contract Valid Until': 2020, 'Loaned From': 'None'}, inplace=True)

# Create age groups
df['Age Group'] = pd.cut(df['Age'], 
                         bins=[15, 20, 25, 30, 35, 40, 50],
                         labels=['15-20', '20-25', '25-30', '30-35', '35-40', '40+'])

# Create value categories
df['Value Category'] = pd.cut(df['Value'], 
                             bins=[0, 10e6, 20e6, 50e6, 100e6, float('inf')],
                             labels=['<10M', '10-20M', '20-50M', '50-100M', '100M+'])

# Save cleaned data
df.to_csv('fifa_players_cleaned.csv', index=False)

In [8]:
# Basic statistics
print("Numerical columns statistics:")
display(df.describe())

# Categorical columns analysis
print("\nCategorical columns analysis:")
cat_cols = ['Nationality', 'Club', 'Preferred Foot', 'Position', 'Work Rate']
for col in cat_cols:
    print(f"\n{col} value counts:")
    display(df[col].value_counts().head(10))

# Correlation analysis
corr_matrix = df.select_dtypes(include=['int64', 'float64']).corr()
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Player Attributes')
plt.savefig('correlation_matrix.png', bbox_inches='tight')
plt.close()

Numerical columns statistics:


Unnamed: 0,ID,Age,Overall,Potential,Value,Wage,International Reputation,Weak Foot,Skill Moves,Jersey Number,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
count,18207.0,18207.0,18207.0,18207.0,18207.0,18207.0,18159.0,18159.0,18159.0,18147.0,...,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,18159.0,16643.0
mean,214298.338606,25.122206,66.238699,71.307299,2410696.0,9731.312133,1.113222,2.947299,2.361308,19.546096,...,58.648274,47.281623,47.697836,45.661435,16.616223,16.391596,16.232061,16.388898,16.710887,4585061.0
min,16.0,16.0,46.0,48.0,0.0,0.0,1.0,1.0,1.0,1.0,...,3.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,1.0,13000.0
25%,200315.5,21.0,62.0,67.0,300000.0,1000.0,1.0,3.0,2.0,8.0,...,51.0,30.0,27.0,24.0,8.0,8.0,8.0,8.0,8.0,525000.0
50%,221759.0,25.0,66.0,71.0,675000.0,3000.0,1.0,3.0,2.0,17.0,...,60.0,53.0,55.0,52.0,11.0,11.0,11.0,11.0,11.0,1100000.0
75%,236529.5,28.0,71.0,75.0,2000000.0,9000.0,1.0,3.0,3.0,26.0,...,67.0,64.0,66.0,64.0,14.0,14.0,14.0,14.0,14.0,3500000.0
max,246620.0,45.0,94.0,95.0,118500000.0,565000.0,5.0,5.0,5.0,99.0,...,96.0,94.0,93.0,91.0,90.0,92.0,91.0,90.0,94.0,228100000.0
std,29965.244204,4.669943,6.90893,6.136496,5594933.0,21999.290406,0.394031,0.660456,0.756164,15.947765,...,11.436133,19.904397,21.664004,21.289135,17.695349,16.9069,16.502864,17.034669,17.955119,11118720.0



Categorical columns analysis:

Nationality value counts:


Nationality
England        1662
Germany        1198
Spain          1072
Argentina       937
France          914
Brazil          827
Italy           702
Colombia        618
Japan           478
Netherlands     453
Name: count, dtype: int64


Club value counts:


Club
RC Celta                   33
Everton                    33
Valencia CF                33
Borussia Dortmund          33
TSG 1899 Hoffenheim        33
Wolverhampton Wanderers    33
AS Monaco                  33
Real Madrid                33
Atlético Madrid            33
Manchester United          33
Name: count, dtype: int64


Preferred Foot value counts:


Preferred Foot
Right    13948
Left      4211
Name: count, dtype: int64


Position value counts:


Position
ST     2152
GK     2025
CB     1778
CM     1394
LB     1322
RB     1291
RM     1124
LM     1095
CAM     958
CDM     948
Name: count, dtype: int64


Work Rate value counts:


Work Rate
Medium/ Medium    9810
High/ Medium      3173
Medium/ High      1690
High/ High        1015
Medium/ Low        850
High/ Low          699
Low/ Medium        449
Low/ High          439
Low/ Low            34
Name: count, dtype: int64

In [10]:
plt.figure(figsize=(14, 6))
sns.histplot(df['Value'] / 1e6, bins=30, kde=True)
plt.title('Distribution of Player Market Values (in millions €)')
plt.xlabel('Market Value (€ millions)')
plt.ylabel('Number of Players')
plt.savefig('value_distribution.png', bbox_inches='tight')
plt.close()

In [11]:
top_nations = df['Nationality'].value_counts().head(15)
plt.figure(figsize=(14, 6))
sns.barplot(x=top_nations.values, y=top_nations.index, palette='viridis')
plt.title('Top 15 Nationalities in FIFA Players Dataset')
plt.xlabel('Number of Players')
plt.ylabel('Nationality')
plt.savefig('top_nationalities.png', bbox_inches='tight')
plt.close()

In [12]:
plt.figure(figsize=(14, 8))
sns.scatterplot(x='Age', y='Potential', hue='Overall', size='Value', sizes=(20, 200), palette='viridis', data=df)
plt.title('Age vs Potential vs Overall Rating (Size by Value)')
plt.xlabel('Age')
plt.ylabel('Potential Rating')
plt.savefig('age_potential_overall.png', bbox_inches='tight')
plt.close()

In [13]:
position_stats = df.groupby('Position')[['Overall', 'Potential', 'Value', 'Wage']].mean()
position_stats['Player Count'] = df['Position'].value_counts()

plt.figure(figsize=(16, 8))
sns.heatmap(position_stats.drop('Player Count', axis=1).corr(), 
            annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Between Position Attributes')
plt.savefig('position_correlation.png', bbox_inches='tight')
plt.close()

In [14]:
def plot_top_players(attribute, title, num_players=10):
    top = df.nlargest(num_players, attribute)[['Name', 'Club', 'Nationality', attribute]]
    plt.figure(figsize=(12, 6))
    sns.barplot(x=attribute, y='Name', data=top, palette='viridis')
    plt.title(f'Top {num_players} Players by {title}')
    plt.xlabel(title)
    plt.ylabel('Player Name')
    plt.savefig(f'top_{attribute.lower()}.png', bbox_inches='tight')
    plt.close()
    return top

top_speed = plot_top_players('SprintSpeed', 'Sprint Speed')
top_stamina = plot_top_players('Stamina', 'Stamina')
top_passing = plot_top_players('ShortPassing', 'Short Passing')

In [17]:
# Select relevant features for clustering
features = ['Overall', 'Potential', 'Age', 'Value', 
            'Crossing', 'Finishing', 'ShortPassing', 'Dribbling', 
            'BallControl', 'Acceleration', 'SprintSpeed', 'Stamina', 
            'Strength', 'Aggression', 'Interceptions']

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[features].fillna(df[features].median()))

# Determine optimal number of clusters using elbow method
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method for Optimal Cluster Number')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.savefig('elbow_method.png', bbox_inches='tight')
plt.close()

# Apply K-means clustering
optimal_clusters = 5
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', random_state=42)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Analyze clusters
cluster_analysis = df.groupby('Cluster')[features].mean()
cluster_sizes = df['Cluster'].value_counts()

plt.figure(figsize=(14, 8))
sns.scatterplot(x='Overall', y='Value', hue='Cluster', 
                palette='viridis', size='Potential', sizes=(20, 200), data=df)
plt.title('Player Clusters by Overall Rating and Value')
plt.xlabel('Overall Rating')
plt.ylabel('Market Value (€)')
plt.savefig('player_clusters.png', bbox_inches='tight')
plt.close()

In [18]:
# Top clubs by average player rating
club_stats = df.groupby('Club').agg({
    'Overall': 'mean',
    'Value': 'mean',
    'Wage': 'mean',
    'Name': 'count'
}).rename(columns={'Name': 'PlayerCount'}).sort_values('Overall', ascending=False)

top_clubs = club_stats.head(20)

plt.figure(figsize=(16, 8))
sns.scatterplot(x='Overall', y='Value', size='PlayerCount', 
                hue=top_clubs.index, palette='tab20', data=top_clubs, sizes=(50, 300))
plt.title('Top Clubs by Average Player Rating and Value')
plt.xlabel('Average Overall Rating')
plt.ylabel('Average Player Value (€)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.savefig('top_clubs.png', bbox_inches='tight')
plt.close()

In [20]:
# Age distribution by position
plt.figure(figsize=(16, 8))
sns.boxplot(x='Position', y='Age', data=df, palette='viridis')
plt.title('Age Distribution by Player Position')
plt.xticks(rotation=45)
plt.savefig('age_by_position.png', bbox_inches='tight')
plt.close()

# Potential vs Age
plt.figure(figsize=(14, 8))
sns.lineplot(x='Age', y='Potential', data=df, ci=None, label='Potential')
sns.lineplot(x='Age', y='Overall', data=df, ci=None, label='Overall')
plt.title('Player Potential and Overall Rating by Age')
plt.xlabel('Age')
plt.ylabel('Rating')
plt.legend()
plt.savefig('rating_by_age.png', bbox_inches='tight')
plt.close()

In [31]:
pip install fpdf

Collecting fpdfNote: you may need to restart the kernel to use updated packages.

  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40769 sha256=2238f8990bdcae3236cbd80d5ec235d75109d0bff12aab6e1f27e9bca63fe610
  Stored in directory: c:\users\sarum\appdata\local\pip\cache\wheels\6e\62\11\dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
from fpdf import FPDF

class PDF(FPDF):
    def header(self):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'FIFA Player Performance Analysis Report', 0, 1, 'C')
    
    def chapter_title(self, title):
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, title, 0, 1, 'L')
        self.ln(4)
# Create PDF report
pdf = PDF()

# Enable Unicode support (add a Unicode font)
pdf.add_font('Arial', 'B', 'arialbd.ttf', uni=True)
pdf.add_font('Arial', 'I', 'ariali.ttf', uni=True)

pdf.add_page()

# Add title and introduction
pdf.set_font('Arial', 'B', 16)
pdf.cell(0, 10, 'FIFA Player Performance Analysis', 0, 1, 'C')
pdf.ln(10)

# Key findings - replace Euro symbol with 'EUR' if needed
pdf.chapter_title('Key Findings')
findings = """
1. The dataset contains information on {} players with detailed attributes.
2. The most represented nationalities are Spain, Germany, and Argentina.
3. Player values follow a right-skewed distribution with most players valued under EUR 20M.
4. There are 5 distinct player clusters based on performance attributes:
   - Cluster 0: Well-rounded players with high overall ratings
   - Cluster 1: Young players with high potential
   - Cluster 2: Veteran players with moderate ratings
   - Cluster 3: Physical players with high strength and stamina
   - Cluster 4: Technical players with high ball control and dribbling
5. Player potential peaks around age 20-22 and gradually declines.
6. Top clubs like FC Barcelona and Real Madrid have both high average ratings and values.
""".format(len(df))
pdf.chapter_body(findings)

# Add visualizations to report
visuals = [
    ('value_distribution.png', 'Player Value Distribution'),
    ('top_nationalities.png', 'Top Nationalities Representation'),
    ('age_potential_overall.png', 'Age vs Potential vs Overall Rating'),
    ('player_clusters.png', 'Player Clusters by Overall and Value'),
    ('top_clubs.png', 'Top Clubs by Average Rating and Value')
]

for img_path, caption in visuals:
    pdf.add_page()
    pdf.chapter_title(caption)
    pdf.image(img_path, x=10, w=190)
    pdf.ln(5)
    pdf.set_font('Arial', 'I', 10)
    pdf.cell(0, 10, caption, 0, 1, 'C')

# Save the report
pdf.output('fifa_analysis_report.pdf')

RuntimeError: TTF Font file not found: arialbd.ttf

In [36]:
pip install python-pptx

Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.5-py3-none-any.whl.metadata (2.7 kB)
Collecting lxml>=3.1.0 (from python-pptx)
  Downloading lxml-6.0.0-cp312-cp312-win_amd64.whl.metadata (6.8 kB)
Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
Downloading lxml-6.0.0-cp312-cp312-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ------- -------------------------------- 0.8/4.0 MB 4.2 MB/s eta 0:00:01
   ------------- -------------------------- 1.3/4.0 MB 3.4 MB/s eta 0:00:01
   ------------------ --------------------- 1.8/4.0 MB 2.9 MB/s eta 0:00:01
   ----------------------- ---------------- 2.4/4.0 MB 3.1 MB/s eta 0:00:01
   ------------------------------- -------- 3.1/4.0 MB 3.1 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 3.3 MB/s eta 0:00:00
Downloading xlsxwriter-3.2.5-py3-none-any.whl 


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
from pptx import Presentation
from pptx.util import Inches

# Create presentation
prs = Presentation()

# Title slide
slide = prs.slides.add_slide(prs.slide_layouts[0])
title = slide.shapes.title
subtitle = slide.placeholders[1]
title.text = "FIFA Player Performance Analysis"
subtitle.text = "Data-Driven Insights for Player Evaluation"

# Overview slide
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
content = slide.placeholders[1]
title.text = "Project Overview"
content.text = """
- Analyzed dataset of {} players
- Examined player attributes, values, and performance metrics
- Identified key patterns and clusters
- Developed actionable insights for player evaluation
""".format(len(df))

# Key findings slide
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
content = slide.placeholders[1]
title.text = "Key Findings"
content.text = """
1. Player values are highly right-skewed
2. 5 distinct player clusters identified
3. Potential peaks at age 20-22
4. Top clubs dominate in both quality and value
5. Different positions have distinct attribute profiles
"""

# Add visual slides
visuals = [
    ('value_distribution.png', 'Player Value Distribution'),
    ('top_nationalities.png', 'Top Nationalities'),
    ('age_potential_overall.png', 'Age vs Potential Analysis'),
    ('player_clusters.png', 'Player Clusters'),
    ('top_clubs.png', 'Top Clubs Analysis')
]

for img_path, title_text in visuals:
    slide = prs.slides.add_slide(prs.slide_layouts[1])
    title = slide.shapes.title
    title.text = title_text
    slide.shapes.add_picture(img_path, Inches(1), Inches(1.5), width=Inches(8), height=Inches(5))

# Recommendations slide
slide = prs.slides.add_slide(prs.slide_layouts[1])
title = slide.shapes.title
content = slide.placeholders[1]
title.text = "Recommendations"
content.text = """
1. Focus scouting on players aged 20-25 with high potential
2. Consider cluster-specific training programs
3. Value for money can be found in certain nationality markets
4. Balance squad with both technical and physical players
5. Monitor player development curves carefully
"""

# Save presentation
prs.save('fifa_analysis_presentation.pptx')

In [38]:
pip install fpdf2

Collecting fpdf2
  Downloading fpdf2-2.8.3-py2.py3-none-any.whl.metadata (69 kB)
Collecting defusedxml (from fpdf2)
  Downloading defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Downloading fpdf2-2.8.3-py2.py3-none-any.whl (245 kB)
Downloading defusedxml-0.7.1-py2.py3-none-any.whl (25 kB)
Installing collected packages: defusedxml, fpdf2
Successfully installed defusedxml-0.7.1 fpdf2-2.8.3
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [43]:
from fpdf import FPDF

class PDF(FPDF):
    def header(self):
        self.set_font('DejaVu', 'B', 12)
        self.cell(0, 10, 'FIFA Player Performance Analysis Report', 0, 1, 'C')
    
    # ... rest of the class definition ...

pdf = PDF()
pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
pdf.add_font('DejaVu', 'B', 'DejaVuSans-Bold.ttf', uni=True)
pdf.add_font('DejaVu', 'I', 'DejaVuSans-Oblique.ttf', uni=True)

RuntimeError: TTF Font file not found: DejaVuSans.ttf