In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score,GridSearchCV, KFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
import pickle as pkl

In [2]:
#QUESTION 1

In [3]:
def preprocess_data(file_path, columns_to_drop):
    # Step 1: Read the CSV file
    data = pd.read_csv(file_path, low_memory = False)
    
    # Step 2: Drop specified columns
    data = data.drop(columns=columns_to_drop, axis=1)
    
    # Step 3: Drop columns with 30% or more null values
    biased_threshold = 0.30 * len(data)
    data = data.loc[:, data.isna().sum() < biased_threshold]
    
    # Step 4: Separate numeric data from non-numeric data
    numeric_data = data.select_dtypes(include=np.number)
    non_numeric_data = data.select_dtypes(include=['object'])
    
    # Step 5: Impute missing values for numeric data with mean
    numeric_data = numeric_data.apply(lambda col: col.fillna(col.mean()))
    #imp = IterativeImputer(max_iter=10, random_state=0)
    #numeric_data_imputed = pd.DataFrame(np.round(imp.fit_transform(numeric_data)), columns=numeric_data.columns)

    
    # Step 6: Impute missing values for non-numeric data with mode
    for column in non_numeric_data.columns:
        mode = non_numeric_data[column].mode()[0]
        non_numeric_data[column].fillna(mode, inplace=True)
    
    # Step 7: encode categorical columns
    # Assuming non_numeric_data is a DataFrame containing non-numeric columns
    label_encoded_data = non_numeric_data.copy()

    # Initialize LabelEncoder
    le = LabelEncoder()

    # Iterate through each column and apply LabelEncoder
    for column in label_encoded_data.columns:
        label_encoded_data[column] = le.fit_transform(label_encoded_data[column])

    
    # Combine numeric data and one-hot encoded data
    processed_data = pd.concat([numeric_data, label_encoded_data], axis=1)

    processed_data = pd.DataFrame(processed_data)
    
    return processed_data

# Example usage:
file_path = r"C:\Users\Dell Inspiron\Documents\School_2024_2\Intro to AI\male_players (legacy).csv"
columns_to_drop = [
    'player_id', 'dob', 'player_tags', 'club_contract_valid_until_year', 'player_url', 'club_jersey_number','fifa_update',
    'long_name', 'short_name', 'league_id', 'player_face_url', 'nationality_id','preferred_foot','club_contract_valid_until_year',
    'fifa_update_date', 'club_position', 'league_name', 'club_team_id', 'nation_team_id', 'player_traits','club_joined_date','league_level',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'ldm', 'cdm', 'rdm',  'lb', 'cb','nationality_name','real_face',
    'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lwb', 'rwb', 'lcb', 'rcb', 'rb', 'gk', 'player_face_url', 'ls', 'st','body_type','fifa_version'
]
processed_data = preprocess_data(file_path, columns_to_drop)
processed_data.head()


Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,weak_foot,skill_moves,international_reputation,...,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,player_positions,club_name,work_rate
0,93,95,100500000.0,550000.0,27,169,67,3,4,5,...,21,20,6,11,15,14,8,412,478,7
1,92,92,79000000.0,375000.0,29,185,80,4,5,5,...,31,23,7,11,15,14,11,1128,1013,1
2,90,90,54500000.0,275000.0,30,180,80,2,4,5,...,26,26,10,8,11,5,15,1541,481,1
3,90,90,52500000.0,275000.0,32,195,95,4,4,5,...,41,27,13,15,10,9,12,1838,947,7
4,90,90,63500000.0,300000.0,28,193,92,4,1,5,...,25,25,87,85,92,90,86,719,481,8


In [4]:
processed_data.isnull().sum()

overall                        0
potential                      0
value_eur                      0
wage_eur                       0
age                            0
height_cm                      0
weight_kg                      0
weak_foot                      0
skill_moves                    0
international_reputation       0
pace                           0
shooting                       0
passing                        0
dribbling                      0
defending                      0
physic                         0
attacking_crossing             0
attacking_finishing            0
attacking_heading_accuracy     0
attacking_short_passing        0
attacking_volleys              0
skill_dribbling                0
skill_curve                    0
skill_fk_accuracy              0
skill_long_passing             0
skill_ball_control             0
movement_acceleration          0
movement_sprint_speed          0
movement_agility               0
movement_reactions             0
movement_b

#QUESTION 2

In [5]:
#QUESTION 2: Getting the right feauture subsets
#Feature Importance using RandomForest.
#Picking the strongly correlated variables from the lot
# Split the data into training and testing sets
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(processed_data.drop('overall', axis=1), processed_data['overall'], test_size=0.2, random_state=0)


In [6]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor with fewer estimators
forest = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)

# Fit the model on the training data
forest.fit(X_train, y_train)

# Extract feature names
feature_names = X_train.columns

# Print feature importances
#for name, score in zip(feature_names, forest.feature_importances_):
 #   print(name, score)


In [7]:
# Extract feature importances and sort them
feature_importances = forest.feature_importances_
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Select top 10 features 
top_N = 13  # You can change this number based on your requirement
top_features = importance_df['Feature'].head(top_N).tolist()
top_values = importance_df['Importance'].head(top_N).tolist()

print(f"Top features selected: {top_features} and their respective values are {top_values}.")

#Creating new dataframe with just strongly correlated features
new_x = processed_data[top_features]

#Scaling the independent variables
X = scaler.fit_transform(new_x)

Top features selected: ['value_eur', 'age', 'potential', 'movement_reactions', 'wage_eur', 'mentality_composure', 'defending', 'dribbling', 'international_reputation', 'skill_ball_control', 'physic', 'attacking_crossing', 'power_stamina'] and their respective values are [0.8061502371747016, 0.07961740791396689, 0.04524483849027903, 0.020722940979943403, 0.014831904000098216, 0.006322719343543349, 0.004455332214775699, 0.002807391169087853, 0.0016895696636347424, 0.0010794540575282648, 0.0009996708447830157, 0.0008725619748060436, 0.0008194219717347221].


#QUESTION 3

In [8]:
## Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, processed_data['overall'], test_size=0.2, random_state=0)

In [9]:
#Using Random Forest Regressor to train and predict the values
model_rf=RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
model_rf.fit(X_train, y_train)
y_pred_rf =model_rf.predict(X_test)
#print(y_pred_rf)

#Testing the accuracy of the Random Forest Prediction
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print("The mean absolute error without cross validation is : ", mae_rf)

The mean absolute error without cross validation is :  0.4581941393074853


In [10]:
#Cross-validation for the RandomTreeRegressor method to improve the data predictions
kf = KFold(n_splits = 5,shuffle=True, random_state=42)
scores = cross_val_score(model_rf, X_train, y_train, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
scores = abs(scores)
#print("Cross-validation Scores (MAE):", scores)
print("The average error with cross-validation is", scores.mean())

The average error with cross-validation is 0.4669714990494769


In [11]:
#Using exGradientBooster(xgbooster) to train datasets
model = xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, objective='reg:squarederror')
model.fit(X_train, y_train)
y_pred_xg = model.predict(X_test)

mae_xg = mean_absolute_error(y_test, y_pred_xg)
print("The mean absolute error without cross validation with an xgbooster is :",mae_xg)

The mean absolute error without cross validation with an xgbooster is : 0.8827498132286824


In [12]:
#Cross-validation for the xgboost method to improve the data predictions
kf = KFold(n_splits = 5,shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
scores = abs(scores)
#print("Cross-validation Scores (MAE):", scores)
print('The mean absolute error cross validation with an xgbooster is :', scores.mean())

The mean absolute error cross validation with an xgbooster is : 0.8787067953670997


In [13]:
#Using GradientBooster to train datasets
model_g = GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=0.1, loss='squared_error')
model_g.fit(X_train, y_train)
y_pred_g = model_g.predict(X_test)

mae_g = mean_absolute_error(y_test, y_pred_g)
print("The mean absolute error without cross validation with a gradient booster is :",mae_g)


The mean absolute error without cross validation with a gradient booster is : 0.8821158377140134


In [14]:
#Cross-validation for the xgboost method to improve the data predictions
kf = KFold(n_splits = 5,shuffle=True, random_state=42)
scores = cross_val_score(model_g, X_train, y_train, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
scores = abs(scores)
#print("Cross-validation Scores (MAE):", scores)
print('The mean absolute error cross validation with a gradient booster is :', scores.mean())

The mean absolute error cross validation with a gradient booster is : 0.8776682384856389


#QUESTION 4

In [15]:
#Fine tuning models to get a lower MAE value
#RandomForestRegressorModel since it showed the lowest MAE
cv = KFold(n_splits = 3)
grid_hp = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}
grid_search = GridSearchCV( estimator = model_rf, param_grid = grid_hp, scoring = 'neg_mean_absolute_error', cv = cv, n_jobs = -1, verbose = 0)

#Fitting grid_search onto the data
grid_search.fit(X_train, y_train)



In [16]:
# Best hyperparameters found
print("Best Hyperparameters:", grid_search.best_params_)

# Best model from grid search
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
mae_best = mean_absolute_error(y_test, y_pred_best)
print("Mean Absolute Error (MAE) with the best model:", mae_best)

Best Hyperparameters: {'max_depth': 30, 'n_estimators': 300}
Mean Absolute Error (MAE) with the best model: 0.455822732357714


In [17]:
#Retraining and retesting the model with cross validation
kf = KFold(n_splits = 5,shuffle=True, random_state=42)
scores = cross_val_score(best_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=kf, n_jobs=-1)
scores = abs(scores)
#print("Cross-validation Scores (MAE):", scores)
print('The mean absolute error cross validation with the best model is :', scores.mean())

The mean absolute error cross validation with the best model is : 0.46480569168201014


In [18]:
#Retraing and retesting the model without cross validation
best_model.fit(X_train, y_train)
y_pred_b = best_model.predict(X_test)
mae_b = mean_absolute_error(y_test, y_pred_b)
print("The mean absolute error without cross validation is :",mae_b)

The mean absolute error without cross validation is : 0.455822732357714


#QUESTION 5

In [19]:
#Testing model with new dataset
#Processing testing data
file_path = r"C:\Users\Dell Inspiron\Documents\School_2024_2\Intro to AI\players_22-1.csv"
columns_to_drop = [
    'sofifa_id', 'dob', 'player_tags', 'club_contract_valid_until', 'player_url', 'club_jersey_number',
    'long_name', 'short_name', 'player_face_url', 'nationality_id','preferred_foot',
     'club_position', 'league_name', 'club_team_id', 'nation_team_id', 'player_traits','club_joined','league_level',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'ldm', 'cdm', 'rdm',  'lb', 'cb','nationality_name','real_face',
    'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lwb', 'rwb', 'lcb', 'rcb', 'rb', 'gk', 'player_face_url', 'ls', 'st','body_type','nation_jersey_number',
    'nation_position', 'club_logo_url','club_flag_url', 'nation_logo_url','nation_flag_url'
]

testing_data = preprocess_data(file_path, columns_to_drop)
testing_data.head()

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,weak_foot,skill_moves,international_reputation,...,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,player_positions,club_name,work_rate
0,93,93,78000000.0,320000.0,34,170,72,4,4,5,...,35,24,6,11,15,14,8,604,470,7
1,92,92,119500000.0,270000.0,32,185,81,4,4,5,...,42,19,15,6,12,8,10,635,241,2
2,91,91,45000000.0,270000.0,36,187,83,4,5,5,...,32,24,7,11,15,14,11,658,412,1
3,91,91,129000000.0,270000.0,29,175,68,5,5,5,...,32,29,9,9,15,15,11,372,470,2
4,91,91,125500000.0,350000.0,30,181,70,5,4,4,...,65,53,15,13,5,10,13,168,411,0


In [20]:
## Split the data into training and testing sets
test_X = testing_data[top_features]
test_X = scaler.fit_transform(test_X)
newX_train, newX_test, newy_train, newy_test = train_test_split(test_X, testing_data['overall'], test_size=0.2, random_state=0)

In [21]:
#Testing the performance of the trained model training new data with MAE metrics
best_model.fit(newX_train, newy_train)
test_pred = best_model.predict(newX_test)
mae_test = mean_absolute_error(newy_test, test_pred)
print(f'This is the MAE value for the testing the model with the new data: {mae_test}. Therefore showing relatively how good the model is as its prediction varies slightly from the actual value.')

This is the MAE value for the testing the model with the new data: 0.2698891198891198. Therefore showing relatively how good the model is as its prediction varies slightly from the actual value.


In [22]:
mae_test

0.2698891198891198

#QUESTION 6

In [23]:
#Employing the model on a website
#Saving the model
with open(r'C:\Users\Dell Inspiron\Documents\School_2024_2\Intro to AI.pkl', 'wb') as file:
    pkl.dump(best_model, file)

In [33]:
rmse = mean_squared_error(newy_test, test_pred, squared=False)
rmse



0.49878517238050313

In [29]:
# Find the average of the overall feature and use that as a true value/target value for finding the confidence level
processed_data['overall'].mean()

65.69907106564428

In [31]:
#Saving scaler as module to scale input to model
with open(r'C:\Users\Dell Inspiron\Documents\School_2024_2\Intro to AI.pkl', 'wb') as file:
    pkl.dump(scaler, file)