### 🔹 Cell 1


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# === End of Refactored Cell ===

### 🔹 Cell 2


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Notebook-wide display defaults: show every DataFrame column when printing
# and use seaborn's "whitegrid" theme for all plots.
pd.set_option("display.max_columns", None)
sns.set_style(style="whitegrid")
print("--- PROJECT START ---")
# === End of Refactored Cell ===

### 🔹 Cell 3


###### 1. Data Merging and Initial Inspection



### 🔹 Cell 4


###### 1. Load Datasets



### 🔹 Cell 5


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
import pandas as pd

# Read each source CSV from the working directory into its own DataFrame.
# A missing file is reported to the user and the original error is re-raised,
# so the notebook stops instead of continuing with partial inputs.
try:
    # --- Player tables ---
    injuries_df = pd.read_csv('player_injuries.csv')
    latest_market_value_df = pd.read_csv('player_latest_market_value.csv')
    market_value_df = pd.read_csv('player_market_value.csv')
    national_perf_df = pd.read_csv('player_national_performances.csv')
    performances_df = pd.read_csv('player_performances.csv')
    # low_memory=False: read the file in one pass so columns holding mixed
    # types are typed consistently.
    profiles_df = pd.read_csv('player_profiles.csv', low_memory=False)
    teammates_df = pd.read_csv('player_teammates_played_with.csv')

    # --- Team tables ---
    team_children_df = pd.read_csv('team_children.csv')
    team_comp_season_df = pd.read_csv('team_competitions_seasons.csv')
    team_details_df = pd.read_csv('team_details.csv')

    # --- Transfers ---
    transfer_history_df = pd.read_csv('transfer_history.csv')

    # --- Tweets / sentiment (file is decoded as latin1) ---
    tweets_df = pd.read_csv('tweets_premier_league_footballers.csv', encoding='latin1')

except FileNotFoundError as missing_file:
    print(f"Error loading file: {missing_file}. Please ensure all files are uploaded correctly.")
    raise
# === End of Refactored Cell ===

### 🔹 Cell 6


##### 2. Merge Datasets

> 

### 🔹 Cell 7


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Join every player-level source onto the profiles table, keyed on 'player_id',
# attach per-player average tweet sentiment, and save the result to CSV.

# Give 'player_id' the same nullable-integer dtype in every table that has the
# column; values that cannot be parsed as numbers become <NA>.
# (Despite its name, df_name is the DataFrame itself.)
for df_name in [profiles_df, latest_market_value_df, market_value_df, performances_df, national_perf_df, injuries_df, tweets_df]:
    if 'player_id' in df_name.columns:
        df_name['player_id'] = pd.to_numeric(df_name['player_id'], errors='coerce').astype('Int64')

# Base table: player profiles. Rows without a usable id are removed because
# pandas matches null join keys to each other, so they would otherwise be
# paired with unrelated null-id rows from the other tables in the merges below.
merged_df = profiles_df.dropna(subset=['player_id']).copy()
print(f"Initial profiles_df shape: {merged_df.shape}")

# Upper-cased player name with everything from the first '(' onwards removed.
# It is used only to look up ids for the tweets table; it is added after the
# copy above, so it does not become a column of merged_df.
profiles_df['clean_player_name'] = profiles_df['player_name'].str.split('(', n=1).str[0].str.strip().str.upper()

# Inner-merge the remaining player tables one after another, printing the
# running shape after each step. The merge order is kept unchanged because it
# determines which overlapping columns receive pandas' '_x'/'_y' suffixes;
# later cells read 'value_x' and 'goals_x', which presumably arise this way.
# NOTE(review): if any of these tables holds several rows per player (e.g. a
# value history, per-season performances, individual injuries), each inner
# merge multiplies the rows for that player -- confirm this is intended or
# aggregate to one row per player first.
for source_label, source_df in [
    ('latest_market_value_df', latest_market_value_df.drop(columns=['player_name'], errors='ignore')),
    ('market_value_df', market_value_df.drop(columns=['player_name'], errors='ignore')),
    ('performances_df', performances_df),
    ('national_perf_df', national_perf_df),
    ('injuries_df', injuries_df),
]:
    merged_df = merged_df.merge(source_df, on='player_id', how='inner')
    print(f"After {source_label} merge: {merged_df.shape}")

# --- Tweets: the table is keyed by player name, so map names to ids first ---

# 1. Name -> id lookup. Pairs with a missing name or id are dropped so that no
#    null id can reach the final merge.
# NOTE(review): a cleaned name shared by several ids maps to all of them, so
# each of those players receives the same sentiment values -- verify.
player_name_to_id = (
    profiles_df[['clean_player_name', 'player_id']]
    .dropna()
    .drop_duplicates()
)

# 2. Average every numeric tweet column per upper-cased player name. The text
#    columns are dropped first when present.
tweets_df['player_name_upper'] = tweets_df['player_name'].str.upper()
tweets_agg = tweets_df.drop(columns=['tweet_text', 'player_name'], errors='ignore').groupby('player_name_upper').mean(numeric_only=True).reset_index()

# 3. Attach the id. An inner join keeps only names found in the profiles;
#    unmatched names would carry a null id and could never be merged correctly.
tweets_with_id = tweets_agg.merge(player_name_to_id, left_on='player_name_upper', right_on='clean_player_name', how='inner')

# The name columns have served their purpose as join keys.
tweets_with_id = tweets_with_id.drop(columns=['player_name_upper', 'clean_player_name'], errors='ignore')

# Merge the aggregated sentiment onto the player table.
merged_df = merged_df.merge(
    tweets_with_id,
    on='player_id',
    how='inner'
)
print(f"After tweets_with_id merge: {merged_df.shape}")

# OPTIONAL: merge teammates played with (if needed)
# merged_df = merged_df.merge(teammates_df, on='player_id', how='left')

# Save merged dataset
merged_df.to_csv("merged_data_initial.csv", index=False)

print("✅ Merged Data saved to 'merged_data_initial.csv'.")
print(f"Merged Data Shape: {merged_df.shape}")
# === End of Refactored Cell ===

### 🔹 Cell 8


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
from google.colab import files

# Offer the merged dataset as a browser download (Colab only).
# The file name must match the one written by merged_df.to_csv(...) earlier in
# this notebook, 'merged_data_initial.csv'; the previous name
# 'merged_final_data.csv' is not written anywhere in the visible notebook.
# Failures are reported rather than raised so the rest of the notebook can
# still run when the download is unavailable.
try:
    files.download('merged_data_initial.csv')
    print("File download initiated.")
except Exception as e:
    print(f"Error downloading file: {e}")
# === End of Refactored Cell ===

### 🔹 Cell 9


#### 2. Feature Engineering


### 🔹 Cell 10


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
print("\n[STEP 2/5] WEEK 2: Feature Engineering...")
# Work on a copy so merged_df stays as it was saved.
dataset_df = merged_df.copy()

# Injury duration in whole days: end_date minus from_date.
dataset_df['from_date'] = pd.to_datetime(dataset_df['from_date'])
dataset_df['end_date'] = pd.to_datetime(dataset_df['end_date'])
dataset_df['days_out'] = (dataset_df['end_date'] - dataset_df['from_date']).dt.days

# Age in completed years as of the moment this cell runs.
dataset_df['date_of_birth'] = pd.to_datetime(dataset_df['date_of_birth'])
# Take one snapshot of "today" so the year, month and day used below always
# come from the same instant (previously it was re-evaluated for every term).
reference_date = pd.to_datetime('today')
birth_month = dataset_df['date_of_birth'].dt.month
birth_day = dataset_df['date_of_birth'].dt.day
# True where this year's birthday has not been reached yet; subtracting the
# boolean removes one year for exactly those rows.
birthday_pending = (reference_date.month < birth_month) | \
    ((reference_date.month == birth_month) & (reference_date.day < birth_day))
dataset_df['age'] = (reference_date.year - dataset_df['date_of_birth'].dt.year) - birthday_pending

# The raw date columns are not used after this point.
dataset_df = dataset_df.drop(columns=['from_date', 'end_date', 'date_of_birth'], errors='ignore')
# === End of Refactored Cell ===

### 🔹 Cell 11


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Protect the ratio features computed in the following cells from dividing by
# zero: a count of 0 in either denominator column is replaced with 1.
epsilon = 1e-6  # small constant added to denominators that may still be zero
for denominator_col in ('matches', 'minutes_played'):
    dataset_df[denominator_col] = dataset_df[denominator_col].replace(0, 1)
# === End of Refactored Cell ===

### 🔹 Cell 12


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Output rates (per 90 minutes and per match) and a min-max scaled sentiment.
dataset_df['goals_per_90_min'] = dataset_df['goals_x'].mul(90).div(dataset_df['minutes_played'])
dataset_df['assists_per_90_min'] = dataset_df['assists'].mul(90).div(dataset_df['minutes_played'])
dataset_df['G_A_per_match'] = dataset_df['goals_x'].add(dataset_df['assists']).div(dataset_df['matches'])
# (value - min) / (max - min) over the 'vader_polarity' column.
dataset_df['normalized_sentiment'] = (
    dataset_df['vader_polarity']
    .sub(dataset_df['vader_polarity'].min())
    .div(dataset_df['vader_polarity'].max() - dataset_df['vader_polarity'].min())
)
# === End of Refactored Cell ===

### 🔹 Cell 13


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# log(1 + days_out): compresses long absences before they enter the index.
dataset_df['log_days_out'] = dataset_df['days_out'].pipe(np.log1p)
# Injury_Impact_Index = log_days_out * value_x / matches
dataset_df['Injury_Impact_Index'] = (
    dataset_df['log_days_out']
    .mul(dataset_df['value_x'])
    .div(dataset_df['matches'])
)
# Value_Efficiency_Ratio = value_x / (G_A_per_match + epsilon); epsilon keeps
# the denominator non-zero for players with no goals or assists.
dataset_df['Value_Efficiency_Ratio'] = dataset_df['value_x'].div(
    dataset_df['G_A_per_match'].add(epsilon)
)
# === End of Refactored Cell ===

### Cell 14


##### 3. Data Preprocessing Pipeline (Scikit-learn)



### 🔹 Cell 15


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Progress banner for the preprocessing stage.
print('\n[STEP 3/5] Data Preprocessing Pipeline...')
# === End of Refactored Cell ===

### 🔹 Cell 16


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Column groups consumed by the preprocessing pipeline.

# Numeric inputs: raw counts/values plus the engineered ratios.
# NOTE(review): other cells in this notebook refer to 'value_x' as the market
# value / target variable; if it is the modelling target, it and the two
# features derived from it ('Injury_Impact_Index', 'Value_Efficiency_Ratio')
# should not be model inputs -- confirm.
numerical_features = ['age', 'value_x', 'matches', 'goals_x', 'assists',
                      'minutes_played', 'days_out',
                      'vader_polarity', 'tb_polarity',
                      'goals_per_90_min', 'assists_per_90_min', 'G_A_per_match',
                      'normalized_sentiment', 'Injury_Impact_Index', 'Value_Efficiency_Ratio']
# Categorical inputs, one-hot encoded by the pipeline.
categorical_features = ['citizenship', 'position', 'current_club_name', 'injury_reason']
# Identifier / helper columns that must not be used as features.
drop_features = ['player_id', 'player_name', 'log_days_out']
# errors='ignore' matches every other drop in this notebook: a name that is
# absent (for example because a merge renamed it with a suffix) is skipped
# instead of raising KeyError.
X = dataset_df.drop(columns=drop_features, errors='ignore')
# === End of Refactored Cell ===

### 🔹 Cell 17


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Preprocessing recipes, one per column type.

# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the literal label 'missing', then
# one-hot encode into a dense array, ignoring categories not seen during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(fill_value="missing", strategy="constant")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])
# === End of Refactored Cell ===

### 🔹 Cell 18


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Route each column group to its recipe; columns in neither list are discarded.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)
# === End of Refactored Cell ===

### 🔹 Cell 19


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Fit the preprocessor on X and transform it in one step.
X_processed = preprocessor.fit_transform(X)
# Output columns follow the transformer order: the numeric features first,
# then the one-hot columns reported by the fitted encoder.
feature_names = [
    *numerical_features,
    *preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features),
]
processed_df = pd.DataFrame(data=X_processed, columns=feature_names)
# === End of Refactored Cell ===

### 🔹 Cell 20


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Save the final processed (scaled and one-hot encoded) data and preview the
# first ten columns. The check-mark in the message was stored as mojibake
# ("âœ…") and is restored to the intended character.
processed_df.to_csv("cleaned_processed_data.csv", index=False)
print("✅ Final Processed Data saved to 'cleaned_processed_data.csv'.")
print(f"\nProcessed Data Head (Scaled, first 10 columns):\n{processed_df.iloc[:, :10].head()}")
# === End of Refactored Cell ===

### 🔹 Cell 21


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
import os

# Print the entries of the current working directory so the saved CSV files
# can be confirmed by eye.
print(os.listdir(path='.'))
# === End of Refactored Cell ===

### 🔹 Cell 22


If you see `cleaned_processed_data.csv` in the list above, you can use the following code to download it directly:



### 🔹 Cell 23


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
from google.colab import files

# Offer the processed dataset as a browser download (Colab only). Any failure
# is reported on stdout instead of being raised.
try:
    files.download("cleaned_processed_data.csv")
    print("File download initiated.")
except Exception as download_error:
    print(f"Error downloading file: {download_error}")
# === End of Refactored Cell ===

### 🔹 Cell 24

##### 4. Data Analysis & Visualization



### 🔹 Cell 25


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# ------------------------------------------------------------------------------
# EDA setup: take a copy of the engineered table and compute the pairwise
# correlations of its numeric features.
print("\n[STEP 4/5] WEEK 2: Exploratory Data Analysis (EDA) and Visualization...")
analysis_df = dataset_df.copy()

# Only a warning is printed when the market-value column is absent; the cell
# itself carries on.
if 'value_x' not in analysis_df:
    print("Warning: 'value_x' (market value) not found in analysis_df. Please ensure correct merging and column naming.")

# Numeric columns of interest, restated here so this cell does not depend on
# the preprocessing cell having been run.
current_numerical_features = [
    'age', 'value_x', 'matches', 'goals_x', 'assists',
    'minutes_played', 'days_out',
    'vader_polarity', 'tb_polarity',
    'goals_per_90_min', 'assists_per_90_min', 'G_A_per_match',
    'normalized_sentiment', 'Injury_Impact_Index', 'Value_Efficiency_Ratio',
]

# Keep only the names actually present so the selection cannot raise KeyError.
existing_numerical_features = [
    name for name in current_numerical_features if name in analysis_df.columns
]

corr_matrix = analysis_df.loc[:, existing_numerical_features].corr()
# === End of Refactored Cell ===

### 🔹 Cell 26


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# A. Annotated heatmap of the correlation matrix computed in the previous cell.
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr_matrix,
    cmap="coolwarm",
    annot=True,      # write each coefficient inside its cell
    fmt=".2f",       # two decimal places
    linewidths=0.5,
)
plt.title("Correlation Matrix of Key Features", fontsize=16)
plt.tight_layout()
plt.show()
# === End of Refactored Cell ===

### 🔹 Cell 27


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# B. Mean 'value_x' per playing position, highest first, as a bar chart.
position_value = (
    analysis_df.groupby('position')['value_x']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(10, 6))
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- check against the installed version.
sns.barplot(x=position_value.index, y=position_value.values, palette="viridis")
plt.title("Average Market Value by Position", fontsize=16)
plt.xlabel("Position", fontsize=12)
plt.ylabel("Average Market Value (Millions)", fontsize=12)
plt.xticks(rotation=45, ha="right")  # slant the labels so long names do not overlap
plt.tight_layout()
plt.show()
# === End of Refactored Cell ===

### 🔹 Cell 28


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# C. Market value ('value_x') against goals + assists per match, coloured by
# position.
plt.figure(figsize=(10, 6))
# The DataFrame must be passed as `data=`; `raw_data` is not a seaborn
# parameter, so the column names given for x, y and hue could not be resolved.
sns.scatterplot(x='G_A_per_match', y='value_x', data=analysis_df, hue='position', palette='tab10')
plt.title('Market Value vs. Goals + Assists Per Match', fontsize=16)
plt.xlabel('Goals + Assists Per Match', fontsize=12)
plt.ylabel('Market Value (Millions)', fontsize=12)
plt.legend(title='Position', loc='upper right')
plt.tight_layout()
plt.show()
# === End of Refactored Cell ===

### 🔹 Cell 29


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Market value ('value_x') against age with a fitted regression line, one
# panel per position.
# The DataFrame must be passed as `data=`; `raw_data` is not an lmplot
# parameter and made the call fail.
sns.lmplot(
    data=analysis_df,
    x='age',
    y='value_x',
    hue='position',    # Differentiate colors by position
    col='position',    # Create separate plots (columns) for better clarity
    height=4,
    aspect=1.2,
    scatter_kws={'alpha': 0.6},
    line_kws={'lw': 2},
    facet_kws={'sharex': False, 'sharey': False} # Axis sharing is configured through facet_kws
)

# y=1.05 places the overall title above the row of panels.
plt.suptitle('Market Value vs. Age (Career Curve) Segmented by Position', y=1.05, fontsize=16)
plt.tight_layout()
plt.show()
# === End of Refactored Cell ===

### 🔹 Cell 30


##### 5. Final Insight Summary

>

### 🔹 Cell 31


In [None]:
# === Refactored Code Cell ===
# Purpose: Auto-modified for new notebook version
# Closing summary, printed one entry per line.
# NOTE(review): the findings below are fixed text (including the quoted
# correlation of approx. 0.58); nothing here is computed from the current
# data, so re-check them whenever the inputs change.
print(
    "\n[STEP 5/5] WEEK 2: Final Insight Summary",
    "----------------------------------------------------------------------",
    "Key Findings for Model Development (Week 1 & 2):",
    "1. Performance Dominance: The G+A per Match metric shows a high positive correlation. On-field output is the primary driver of market value.",
    "2. Positional Value: Forwards and Midfielders command the highest average market values, confirming positional scarcity/demand patterns.",
    "3. Injury Complexity: The engineered 'Injury_Impact_Index' has a surprisingly strong positive correlation (approx. 0.58). This suggests that players who are highly valued and spend time injured are likely elite assets whose value holds, or whose absences are strategically managed.",
    "4. Sentiment Weakness: Simple Polarity Index shows a very weak negative correlation. Raw sentiment scores are currently poor predictors of market value compared to performance metrics.",
    "\nDataset Preparation Complete. 'cleaned_processed_data.csv' is ready for model training.",
    "----------------------------------------------------------------------",
    sep="\n",
)
# === End of Refactored Cell ===