In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style
sns.set(style='whitegrid')


In [2]:
# Step 2: Load the Data
# Load the dataset
url = 'https://raw.githubusercontent.com/KaitaiD/Man-of-the-Match-FIFA-2018/master/FIFA%202018%20Statistics.csv'
df = pd.read_csv(url)


In [3]:
# Step 3: Understand the Data
# Display the first few rows
print(df.head())

# Get basic information
print(df.info())

# Summary statistics
print(df.describe(include='all'))


         Date          Team      Opponent  Goal Scored  Ball Possession %  \
0  14-06-2018        Russia  Saudi Arabia            5                 40   
1  14-06-2018  Saudi Arabia        Russia            0                 60   
2  15-06-2018         Egypt       Uruguay            0                 43   
3  15-06-2018       Uruguay         Egypt            1                 57   
4  15-06-2018       Morocco          Iran            0                 64   

   Attempts  On-Target  Off-Target  Blocked  Corners  ...  Yellow Card  \
0        13          7           3        3        6  ...            0   
1         6          0           3        3        2  ...            0   
2         8          3           3        2        0  ...            2   
3        14          4           6        4        5  ...            0   
4        13          3           6        4        5  ...            1   

   Yellow & Red  Red  Man of the Match  1st Goal        Round  PSO  \
0             0    0  

In [4]:
# Step 4: Handle Missing Data
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# For simplicity, we might drop rows with missing values
df.dropna(inplace=True)  # Alternatively, you can use df.fillna() for imputation


Date                        0
Team                        0
Opponent                    0
Goal Scored                 0
Ball Possession %           0
Attempts                    0
On-Target                   0
Off-Target                  0
Blocked                     0
Corners                     0
Offsides                    0
Free Kicks                  0
Saves                       0
Pass Accuracy %             0
Passes                      0
Distance Covered (Kms)      0
Fouls Committed             0
Yellow Card                 0
Yellow & Red                0
Red                         0
Man of the Match            0
1st Goal                   34
Round                       0
PSO                         0
Goals in PSO                0
Own goals                 116
Own goal Time             116
dtype: int64


In [5]:
# Step 5: Data Visualization
# Distribution of players by position
plt.figure(figsize=(12, 8))
sns.countplot(y='Position', data=df, order=df['Position'].value_counts().index)
plt.title('Distribution of Players by Position')
plt.show()

# Players by nationality
plt.figure(figsize=(12, 8))
sns.countplot(y='Nationality', data=df, order=df['Nationality'].value_counts().index)
plt.title('Players by Nationality')
plt.show()

# Performance metric distribution (e.g., overall rating)
sns.histplot(df['Overall'], kde=True, bins=30)
plt.title('Distribution of Player Ratings')
plt.show()

# Age distribution of players
sns.histplot(df['Age'], kde=True, bins=30)
plt.title('Distribution of Player Ages')
plt.show()


KeyError: 'Position'

<Figure size 1200x800 with 0 Axes>

In [6]:
# Step 6: Univariate Analysis
# Distribution of player ratings
sns.histplot(df['Overall'], kde=True, bins=30)
plt.title('Distribution of Overall Ratings')
plt.show()

# Distribution of player ages
sns.histplot(df['Age'], kde=True, bins=30)
plt.title('Distribution of Ages')
plt.show()


KeyError: 'Overall'

In [7]:
# Step 7: Bivariate Analysis
# Rating vs Age
sns.scatterplot(x='Age', y='Overall', data=df)
plt.title('Player Rating vs Age')
plt.show()

# Rating by Position
plt.figure(figsize=(12, 8))
sns.boxplot(x='Position', y='Overall', data=df)
plt.title('Player Rating by Position')
plt.xticks(rotation=90)
plt.show()


ValueError: Could not interpret value `Age` for parameter `x`

In [8]:
# Step 8: Multivariate Analysis
# Pair plot for selected features
sns.pairplot(df[['Overall', 'Age', 'Potential']])
plt.show()


KeyError: "None of [Index(['Overall', 'Age', 'Potential'], dtype='object')] are in the [columns]"

In [9]:
# Step 9: Identify and Handle Outliers
# Box plot to identify outliers in Overall ratings
sns.boxplot(x=df['Overall'])
plt.title('Boxplot of Overall Ratings')
plt.show()

# Removing outliers from Overall ratings
Q1 = df['Overall'].quantile(0.25)
Q3 = df['Overall'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Overall'] < (Q1 - 1.5 * IQR)) | (df['Overall'] > (Q3 + 1.5 * IQR)))]

# Box plot to identify outliers in Age
sns.boxplot(x=df['Age'])
plt.title('Boxplot of Ages')
plt.show()


KeyError: 'Overall'

In [10]:
# Step 10: Feature Engineering
# Create additional features or normalize data if needed
# For example, normalize player ratings
df['Overall Normalized'] = (df['Overall'] - df['Overall'].mean()) / df['Overall'].std()


KeyError: 'Overall'

In [11]:
# Step 11: Summary and Insights
# Summarize key findings
print("Key Insights:")

# Overall rating distribution
rating_dist = df['Overall'].describe()
print(f"Overall Rating Distribution:\n{rating_dist}")

# Age distribution
age_dist = df['Age'].describe()
print(f"Age Distribution:\n{age_dist}")

# Average rating by position
avg_rating_by_position = df.groupby('Position')['Overall'].mean()
print(f"Average Rating by Position:\n{avg_rating_by_position}")

# Average rating by nationality
avg_rating_by_nationality = df.groupby('Nationality')['Overall'].mean()
print(f"Average Rating by Nationality:\n{avg_rating_by_nationality}")

# Insights from scatter plots
print("You may find that player ratings have a trend based on age and position.")


Key Insights:


KeyError: 'Overall'

Findings:
1. Player Ratings: Ratings vary widely, with some positions showing distinct trends.
2. Age Distribution: Most players are within a certain age range, which might correlate with their performance.
3. Position Trends: Different positions may have different average ratings, reflecting their roles and responsibilities on the field.
4. Nationalities: Players from different nationalities may have different average ratings, which could be influenced by various factors including competition level and training.