In [12]:
# Import libraries: data handling, plotting, statistics and scikit-learn modelling tools
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chi2_contingency
from scipy.special import inv_boxcox       # inverse of the Box-Cox transform (back-transforms values to the original scale)
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Lift pandas' display limits so frames are shown with all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the dataset (presumably the output of the exploration notebook, judging by the file name - TODO confirm)
data = pd.read_csv('fifa21_male2-post-exploration.csv')

In [13]:
def corr_destroyer(data, target, max_threshold=0.95):
    """List features that are nearly collinear with another, more useful feature.

    For every pair of numeric features whose absolute pairwise correlation
    exceeds ``max_threshold``, the feature with the weaker absolute correlation
    to ``target`` is marked for removal (on a tie the first feature of the pair
    is marked).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Non-numeric columns are ignored.
    target : str
        Name of a numeric column in ``data``; it is never returned.
    max_threshold : float, default 0.95
        Absolute correlation above which two features count as redundant.

    Returns
    -------
    list
        Unique column names to drop, in the order they were first flagged.

    Raises
    ------
    KeyError
        If ``target`` is not a numeric column of ``data``.
    """
    # Correlate numeric columns only: DataFrame.corr() raises on text columns
    # in pandas >= 2.0.
    corr_data = data.select_dtypes(np.number).corr()

    # Remove the target from BOTH structures so that position i refers to the
    # same feature in the pairwise matrix and in the target-correlation vector.
    corr_target = corr_data[target].drop(target).abs()
    corr_features = corr_data.drop(index=target, columns=target).abs()

    names = corr_features.columns
    strength = corr_features.to_numpy()    # |corr(feature_i, feature_j)|
    relevance = corr_target.to_numpy()     # |corr(feature_i, target)|

    to_drop = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # NaN correlations (e.g. constant columns) compare False and are skipped.
            if strength[i, j] > max_threshold:
                loser = names[j] if relevance[i] > relevance[j] else names[i]
                if loser not in to_drop:   # keep entries unique, order deterministic
                    to_drop.append(loser)
    return to_drop

### 3. Modelling & Evaluation

#### 3.1. Define normalizing & modelling functions

##### 3.1.1. Data Scaling

In [14]:
def normalize_data(df):
    """Min-max scale the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; only its numeric columns are used.

    Returns
    -------
    pandas.DataFrame
        The numeric columns passed through ``MinMaxScaler``, with the same
        column names and the same row index as ``df``.
    """
    num = df.select_dtypes(np.number)
    if num.empty:
        # Nothing to scale (no rows or no numeric columns); the scaler would raise.
        return num.copy()
    transformer = MinMaxScaler().fit(num)
    x_minmax = transformer.transform(num)
    # Keep the original row labels: the result is later concatenated column-wise
    # with frames that still carry them, and pandas aligns on the index.
    return pd.DataFrame(x_minmax, columns=num.columns, index=num.index)

##### 3.1.2. Box-Cox transformation

In [15]:
def boxcox_transform(data):
    """Apply a Box-Cox transformation to the numeric columns of ``data``.

    Columns with fewer than 10 distinct values are left untouched. In the
    remaining columns, non-positive and missing values are replaced by the
    median of the positive values before ``scipy.stats.boxcox`` is applied.
    A column with fewer than two distinct positive values cannot be
    transformed and is also left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a transformed copy is returned.

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed copy, and a dict mapping every numeric column name to
        ``[lambda]`` (the fitted Box-Cox lambda in a one-element list) or to
        ``None`` when the column was not transformed.
    """
    data = data.copy()  # work on a copy so the caller's frame is never mutated
    numeric_cols = data.select_dtypes(np.number).columns
    _ci = {column: None for column in numeric_cols}
    for column in numeric_cols:
        if len(data[column].unique()) < 10:
            continue
        # No column is expected to hold zero or negative values; Box-Cox is only
        # defined for strictly positive data, so mask them (avoids -inf / errors).
        positive = data[column].where(data[column] > 0)
        if positive.nunique() < 2:
            # No positive values, or a constant column: Box-Cox cannot be fitted.
            continue
        filled = positive.fillna(positive.median())
        transformed_data, lmbda = stats.boxcox(filled.to_numpy(dtype=float))
        data[column] = transformed_data
        _ci[column] = [lmbda]
    return data, _ci

##### 3.1.3. Remove outliers

In [16]:
def remove_outliers(df, threshold=1.5):
    """Drop rows whose numeric values lie outside the IQR fences.

    Numeric columns are processed one after another; each column's quartiles
    are computed on the rows that survived the previous columns. Columns with
    fewer than 10 distinct values are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame (not modified; a filtered frame is returned).
    threshold : float, default 1.5
        Multiple of the inter-quartile range added above Q3 / subtracted below
        Q1 to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose values are within ``[Q1 - threshold * IQR,
        Q3 + threshold * IQR]`` for every processed column. The original index
        labels are kept. Rows with a missing value in a processed column are
        kept as well (a missing value is not an outlier).
    """
    for column in df.select_dtypes(np.number).columns:
        if len(df[column].unique()) < 10:
            continue
        # Series.quantile ignores missing values (np.percentile would return NaN
        # and every row would then fail the comparison below).
        lower = df[column].quantile(0.25)
        upper = df[column].quantile(0.75)
        iqr = upper - lower
        lower_limit = lower - threshold * iqr
        upper_limit = upper + threshold * iqr
        # Fences are inclusive: with a zero IQR both fences equal the quartile
        # value, and strict inequalities would discard every single row.
        within = df[column].between(lower_limit, upper_limit)
        df = df[within | df[column].isna()]
    return df

##### 3.1.5. Encode categorical data (`get_dummies`)

In [17]:
def encode_cat(df):
    cat = df.select_dtypes(np.object)
    cat = pd.get_dummies(df, columns=df.columns, drop_first=True)
    return cat

##### 3.1.6. Concatenate numerical and categorical data

In [18]:
def new_df(num, cat):
    """Place the numerical and categorical frames side by side.

    The two frames are concatenated column-wise with ``pd.concat``; rows are
    matched on their index labels.
    """
    combined = pd.concat([num, cat], axis='columns')
    return combined

##### 3.1.7. Running & evaluating the model

Fit a linear regression on a 70/30 train/test split and look at the R² score on the test set:

In [19]:
def regression(df, target):
    """Fit a linear regression for ``target`` and score it on a hold-out set.

    ``df`` is split 70/30 into train and test rows (fixed ``random_state=42``),
    a ``LinearRegression`` is fitted on the training part, and the test part is
    predicted.

    Returns
    -------
    tuple
        ``(r2, predictions, y_test)``: the R2 score on the test rows, the
        predicted values for them, and their true target values.
    """
    features = df.drop(columns=target)
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )
    fitted = LinearRegression().fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return r2_score(y_test, predictions), predictions, y_test

Compare the predictions with the real values and analyze the RMSE, MSE, and MAE:

In [20]:
# Reversing the Box-cox transformation
def reversing(predictions, _ci, target):
    """Undo the Box-Cox transformation of ``target`` on ``predictions``.

    ``_ci`` maps column names to a one-element list holding the Box-Cox lambda;
    the lambda stored for ``target`` is passed to ``inv_boxcox``.
    """
    lmbda = _ci[target][0]
    return inv_boxcox(predictions, lmbda)

# Calculate the modified metrics
def evaluate_model_2(y_test, predictions):
    """Print and return the RMSE, MSE and MAE of ``predictions`` against ``y_test``.

    Both arguments must be on the same scale for the metrics to be meaningful.

    Returns
    -------
    tuple
        ``(RMSE, MSE, MAE)``.
    """
    MSE = mean_squared_error(y_test, predictions)
    # Take the root ourselves: the `squared=False` switch of mean_squared_error
    # was deprecated and later removed from scikit-learn. (Identical for a
    # single target, which is what this notebook models.)
    RMSE = np.sqrt(MSE)
    MAE = mean_absolute_error(y_test, predictions)
    print("RMSE =", RMSE)
    print("MSE =", MSE)
    print("MAE =", MAE)
    # Return the numbers rather than the (None, None, None) produced by print().
    return RMSE, MSE, MAE

# Create dataframe for visualising the differences between real and predicted values
def diff_df(y_test, _ci, target, predictions, top_n=20):
    """Tabulate the largest absolute errors between true and predicted values.

    Parameters
    ----------
    y_test : array-like
        True target values in Box-Cox space; they are back-transformed here
        with the lambda stored in ``_ci[target][0]``.
    _ci : dict
        Maps column names to a one-element list holding the Box-Cox lambda.
    target : str
        Key of the target column in ``_ci``.
    predictions : array-like
        Predicted values, stored as given (no inverse transform is applied to
        them here).
    top_n : int, default 20
        Number of worst predictions to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``true``, ``pred`` and ``diff`` (absolute error), sorted by
        ``diff`` in descending order and limited to ``top_n`` rows.
    """
    results = pd.DataFrame()
    results['true'] = inv_boxcox(y_test, _ci[target][0])
    results['pred'] = predictions
    # Vectorised absolute error: same values as a row-wise apply, but fast and
    # well-defined for an empty frame.
    results['diff'] = (results['true'] - results['pred']).abs()
    return results.sort_values('diff', ascending=False).head(top_n)

# Plot results for visual representation
def we_like_to_see(results):
    """Draw a regression plot of predicted against true values.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with the columns ``true`` and ``pred``.

    Returns
    -------
    The object returned by ``sns.regplot`` (true values on x, predictions on y).
    """
    # x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    beautiful_graph = sns.regplot(data=results, x='true', y='pred')
    return beautiful_graph

Looking at the adjusted R² to check which features contribute to the R² score (TODO: no cell below computes it yet):

#### 3.2. Optimize model for the `overall_score`

In [21]:
# Create dataframe copy
data_1 = data.copy()
target_1 = 'overall_score'

# Remove correlated data
to_drop = corr_destroyer(data_1, target=target_1)
data_1 = data_1.drop(columns=to_drop)

# 1. Remove outliers
# Renumber the rows afterwards so that every frame derived below shares the
# same index and the column-wise concat in step 5 lines the rows up correctly.
data_1 = remove_outliers(data_1).reset_index(drop=True)

# 2. Box-Cox transform
data_1, _ci_1 = boxcox_transform(data_1)

# 3. Scale numerical features
# The target is kept out of the scaler: it has to stay in Box-Cox space,
# otherwise the inverse Box-Cox in step 7 would be applied to MinMax-scaled values.
num_norm_1 = normalize_data(data_1.drop(columns=target_1))

# 4. Encode categorical data
cat_1 = encode_cat(data_1)

# 5. Merge numerical & categorical data, then re-attach the (unscaled) target
new_df_1 = new_df(num_norm_1, cat_1)
new_df_1[target_1] = data_1[target_1]

# 6. Run regression
r2_1, predictions_1, y_test_1 = regression(new_df_1, target_1)
print(r2_1)

# 7. See the predictions
# Back-transform with `reversing` (the builtin `reversed` only flips the order)
# and compare predictions and truth on the same, original scale.
predictions_1 = reversing(predictions_1, _ci_1, target_1)
evaluate_model_2(reversing(y_test_1, _ci_1, target_1), predictions_1)
results = diff_df(y_test_1, _ci_1, target=target_1, predictions=predictions_1)
we_like_to_see(results)