# **Mounting Google Drive**


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cells in this notebook
# read their input files from paths under /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


# **Preprocessing for Association Rule Mining (ARM)**

In [None]:
import pandas as pd
import numpy as np

# Load the raw player and team attribute workbooks from the mounted Drive
player_data = pd.read_excel("/content/drive/MyDrive/soccer_dataset/Player_Atrributes.xlsx", engine='openpyxl')
team_data = pd.read_excel("/content/drive/MyDrive/soccer_dataset/Team_Atrributes.xlsx", engine='openpyxl')

# Data Cleaning: remove irrelevant identifier columns.
# errors='ignore' means a column that is not present is skipped instead of raising.
player_data.drop(columns=['player_fifa_api_id', 'player_api_id'], inplace=True, errors='ignore')
team_data.drop(columns=['team_fifa_api_id', 'team_api_id'], inplace=True, errors='ignore')

# Handle missing values: numeric gaps are filled with the column mean first,
# then any row that still contains a missing value is dropped.
player_data.fillna(player_data.mean(numeric_only=True), inplace=True)
team_data.fillna(team_data.mean(numeric_only=True), inplace=True)
player_data.dropna(inplace=True)
team_data.dropna(inplace=True)

# Standardize date formats. With errors='coerce' an unparseable date becomes NaT;
# this runs after dropna(), so such rows are kept.
player_data['date'] = pd.to_datetime(player_data['date'], errors='coerce')
team_data['date'] = pd.to_datetime(team_data['date'], errors='coerce')

# Filter Key Metrics.
# Take explicit copies: the frames below get a new column and are re-sorted, and
# doing that on a bare column slice of player_data/team_data is the
# chained-assignment pattern that triggers pandas' SettingWithCopyWarning.
player_filtered = player_data[['date', 'overall_rating', 'potential', 'crossing',
                               'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']].copy()
team_filtered = team_data[['date', 'buildUpPlaySpeed', 'buildUpPlayDribbling',
                           'buildUpPlayPassing', 'chanceCreationPassing']].copy()

# Create Summaries: mean overall rating per preferred foot, and mean build-up
# metrics per date.
player_summary = player_filtered.pivot_table(index='preferred_foot',
                                             values='overall_rating',
                                             aggfunc='mean').reset_index()

team_summary = team_filtered.pivot_table(index='date',
                                         values=['buildUpPlaySpeed', 'buildUpPlayDribbling', 'buildUpPlayPassing'],
                                         aggfunc='mean').reset_index()

# Aggregate Data: ratio of current rating to potential for every row
player_filtered['rating_to_potential_ratio'] = player_filtered['overall_rating'] / player_filtered['potential']

# Sort data for visualization (rebinding instead of inplace=True keeps the
# operation free of hidden mutation on a derived frame)
player_filtered = player_filtered.sort_values('date')
team_filtered = team_filtered.sort_values('date')

# Save cleaned data to new files for visualization
player_filtered.to_csv("Cleaned_Player_Data.csv", index=False)
team_filtered.to_csv("Cleaned_Team_Data.csv", index=False)

print("Data preprocessing complete. Cleaned data saved to CSV files.")


In [None]:
import pandas as pd
import numpy as np

# Read both raw workbooks from the mounted Drive
player_data = pd.read_excel("/content/drive/MyDrive/soccer_dataset/Player_Atrributes.xlsx", engine='openpyxl')
team_data = pd.read_excel("/content/drive/MyDrive/soccer_dataset/Team_Atrributes.xlsx", engine='openpyxl')

# Identifier columns that are discarded before analysis
irrelevant_cols_player = ['player_fifa_api_id', 'player_api_id']
irrelevant_cols_team = ['team_fifa_api_id', 'team_api_id']


def _clean_attributes(raw, id_columns):
    """Return a cleaned version of a raw attributes frame.

    Steps, in order: drop the given identifier columns (absent ones are
    skipped), fill numeric gaps with the column mean, drop rows that still
    contain a missing value, and parse the 'date' column to datetimes
    (unparseable values are coerced to NaT).
    """
    trimmed = raw.drop(columns=id_columns, errors='ignore')
    imputed = trimmed.fillna(trimmed.mean(numeric_only=True))
    complete = imputed.dropna()
    return complete.assign(date=pd.to_datetime(complete['date'], errors='coerce'))


# Step 1: clean both datasets with the same pipeline
player_data = _clean_attributes(player_data, irrelevant_cols_player)
team_data = _clean_attributes(team_data, irrelevant_cols_team)

# Step 2: keep only the key metrics, as independent copies
player_metrics = ['date', 'overall_rating', 'potential', 'crossing',
                  'preferred_foot', 'attacking_work_rate', 'defensive_work_rate']
team_metrics = ['date', 'buildUpPlaySpeed', 'buildUpPlayDribbling',
                'buildUpPlayPassing', 'chanceCreationPassing']

player_filtered = player_data.loc[:, player_metrics].copy()
team_filtered = team_data.loc[:, team_metrics].copy()

# Step 3: summaries - mean overall rating per preferred foot, and mean
# build-up metrics per date
player_summary = pd.pivot_table(
    player_filtered,
    index='preferred_foot',
    values='overall_rating',
    aggfunc='mean',
).reset_index()

team_summary = pd.pivot_table(
    team_filtered,
    index='date',
    values=['buildUpPlaySpeed', 'buildUpPlayDribbling', 'buildUpPlayPassing'],
    aggfunc='mean',
).reset_index()

# Steps 4 and 5: add the rating/potential ratio, then order both frames by date
player_filtered = player_filtered.assign(
    rating_to_potential_ratio=player_filtered['overall_rating'] / player_filtered['potential']
).sort_values(by='date')
team_filtered = team_filtered.sort_values(by='date')

# Persist the cleaned frames for the visualization step
player_filtered.to_csv("Cleaned_Player_Data.csv", index=False)
team_filtered.to_csv("Cleaned_Team_Data.csv", index=False)

print("Data preprocessing complete. Cleaned data saved to CSV files.")


  and should_run_async(code)


Data preprocessing complete. Cleaned data saved to CSV files.


# **Association Rule Mining for Soccer Player Attributes Using the Apriori Algorithm**



In [None]:
# Import necessary libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Specify the file path
file_path = '/content/drive/MyDrive/soccer_dataset/preprocessed/Player_Atrributes final data.xlsx'

# Choose the reader from the file extension instead of trying CSV first.
# Feeding a binary workbook to read_csv(..., on_bad_lines='skip') only reaches
# the Excel reader if the CSV parser happens to raise; if it does not, the
# cell would continue with garbage rows.
try:
    if file_path.lower().endswith(('.xlsx', '.xls')):
        players = pd.read_excel(file_path)
    else:
        players = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
except Exception as load_error:
    print(f"File load failed: {load_error}")
    raise ValueError("Failed to load the file. Please check the file format and path.") from load_error

# Display available columns to verify attributes
print("Available columns:", players.columns)

# Select relevant columns for analysis (only those actually present)
attributes = ['overall_rating', 'potential', 'stamina', 'strength', 'agility']
available_attributes = [attr for attr in attributes if attr in players.columns]
if not available_attributes:
    raise ValueError(f"None of the expected attributes {attributes} are present in the data.")
df = players[available_attributes].dropna().copy()

# Discretize continuous attributes into categorical bins.
# Bin edges per attribute; all attributes share the same three labels.
level_labels = ['Low', 'Medium', 'High']
attribute_bins = {
    'overall_rating': [0, 60, 80, 100],
    'potential': [0, 60, 80, 100],
    'stamina': [0, 50, 70, 100],
    'strength': [0, 50, 70, 100],
    'agility': [0, 50, 70, 100],
}
for attr in available_attributes:
    # include_lowest=True makes the first bin closed on the left, so a value of
    # exactly 0 is labelled 'Low' instead of becoming NaN.
    df[attr] = pd.cut(df[attr], bins=attribute_bins[attr], labels=level_labels,
                      include_lowest=True)

# Convert categorical data to binary (one-hot encoding).
# dtype=bool gives the boolean item matrix mlxtend expects regardless of the
# pandas version's default dummy dtype.
df_encoded = pd.get_dummies(df, dtype=bool)

# Apply the Apriori algorithm to find frequent itemsets (support >= 15% of rows)
frequent_itemsets = apriori(df_encoded, min_support=0.15, use_colnames=True)

# Generate association rules with a minimum confidence of 0.6
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Keep only rules with lift above 1.0. No extra confidence filter is needed:
# every rule returned above already has confidence >= 0.6.
effective_combinations = rules[rules['lift'] > 1.0]

# Display effective player attribute combinations
print("Effective Player Combinations:")
print(effective_combinations[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

  and should_run_async(code)


CSV load failed: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
. Trying to load as Excel.
Available columns: Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')
Effective Player Combinations:
                                          antecedents  \
0                             (overall_rating_Medi

# **Association Rule Mining for Soccer Player Attributes Using the FP Growth Algorithm**



In [None]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Specify the file path
file_path = '/content/drive/MyDrive/soccer_dataset/preprocessed/Player_Atrributes final data.xlsx'

# Choose the reader from the file extension instead of trying CSV first.
# Feeding a binary workbook to read_csv(..., on_bad_lines='skip') only reaches
# the Excel reader if the CSV parser happens to raise; if it does not, the
# cell would continue with garbage rows.
try:
    if file_path.lower().endswith(('.xlsx', '.xls')):
        players = pd.read_excel(file_path)
    else:
        players = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='skip')
except Exception as load_error:
    print(f"File load failed: {load_error}")
    raise ValueError("Failed to load the file. Please check the file format and path.") from load_error

# Display available columns to verify attributes
print("Available columns:", players.columns)

# Select only available columns relevant for the analysis
attributes = ['potential', 'stamina', 'strength', 'agility']
available_attributes = [attr for attr in attributes if attr in players.columns]
if not available_attributes:
    raise ValueError(f"None of the expected attributes {attributes} are present in the data.")
df = players[available_attributes].dropna().copy()

# Discretize continuous attributes into categorical bins.
# Bin edges per attribute; all attributes share the same three labels.
level_labels = ['Low', 'Medium', 'High']
attribute_bins = {
    'potential': [0, 60, 80, 100],
    'stamina': [0, 50, 70, 100],
    'strength': [0, 50, 70, 100],
    'agility': [0, 50, 70, 100],
}
for attr in available_attributes:
    # include_lowest=True makes the first bin closed on the left, so a value of
    # exactly 0 is labelled 'Low' instead of becoming NaN.
    df[attr] = pd.cut(df[attr], bins=attribute_bins[attr], labels=level_labels,
                      include_lowest=True)

# Convert categorical data to binary (one-hot encoding).
# dtype=bool gives the boolean item matrix mlxtend expects regardless of the
# pandas version's default dummy dtype.
df_encoded = pd.get_dummies(df, dtype=bool)

# Apply FP-Growth; min_support=0.05 keeps itemsets present in at least 5% of rows
frequent_itemsets = fpgrowth(df_encoded, min_support=0.05, use_colnames=True)

# Generate association rules with a minimum confidence of 0.6
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# Display all generated rules for inspection
print("All Generated Rules:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

# Keep only rules with lift above 1.0. No extra confidence filter is needed:
# every rule returned above already has confidence >= 0.6.
effective_combinations = rules[rules['lift'] > 1.0]
print("\nEffective Player Combinations with Lower Thresholds:")
print(effective_combinations[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


CSV load failed: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
. Trying to load as Excel.
Available columns: Index(['id', 'player_fifa_api_id', 'player_api_id', 'date', 'overall_rating',
       'potential', 'preferred_foot', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes'],
      dtype='object')
All Generated Rules:
                                          antecedents         consequents  \
0                                    (ag