In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In recent years, deep learning models and architectures have gained a lot of traction for image data as well as for NLP problems, thanks to the excellent, groundbreaking research the field has seen over the years. On the other hand, traditional tree-based models such as Random Forest and XGBoost have kept their stronghold and have proved to be really successful and efficient when dealing with tabular (structured) data, especially in regression problems.
Today, we will look at an approach for applying deep learning to tabular data in order to solve a regression problem using fastai.

In [None]:
# NOTE(review): this pins the fastai v1 line (1.0.61), but the cells below import
# `fastai.tabular.all` and use `TabularPandas` / `tabular_learner(dls, ...)`, which look like
# the fastai v2 API — confirm which major version this notebook is meant to run on.
!pip install fastai==1.0.61 --no-deps
# fastai depends also on an older version of torch
!pip install torch==1.6.0 torchvision==0.7.0

### Problem Statement:

Football (soccer) in modern times has become far more complicated than it ever was. For clubs all around the world, it’s not just about playing your heart out on the field, but also about performing well in the transfer market, to snap up the right talent and players for their sides at the right price. Over the past few years we have observed serious inflation in player values and some exorbitant release clauses (a price written into a player’s contract with his/her current club, for which he/she can be bought by another club).
As a result, many clubs end up paying a lot more for a player whose talent and performances on the field fail to justify his/her price tag. The club selling a player faces a similar problem: it may fail to realise his/her potential and let him/her go for far less than the price it should have asked for.
We shall try to solve this as a regression problem using deep learning. We will use fast.ai’s tabular module to predict a player’s value based on his/her skill and personality attributes.


### Assumption

The target value (price) on which we will train our model may already contain a bias from players being over- or under-valued. We will build our solution on the assumption that these prices are highly curated and based on the research and analysis of experts in this domain.

### Solution:

In [None]:
import pandas as pd
import math
import datetime
import fastai
from fastai.tabular import *
from fastai.tabular.all import *
from fastai.imports import *
from fastai.metrics import error_rate
# from fastai.callbacks import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, MinMaxScaler

In [None]:
# Display the version of the fastai package that was actually imported.
fastai.__version__

In [None]:
# IPython-only introspection (`??` shows the source of the object); a development aid, not plain Python.
??fastai.tabular.all

#### Data

In [None]:
# Load the players table from the Kaggle input dataset (path is specific to the Kaggle environment).
players_df = pd.read_csv('/kaggle/input/fifa-20-complete-player-dataset/players_20.csv')
# A bare name as the last line renders the frame as the cell output.
players_df

#### Data Pre-Processing

First, we will clean the value_eur column: every value of 0 is changed to 1, so that we do not run into the “division by 0” problem later on. We then min-max scale this column to the range 0–1.
Next comes the loaned_from column, which we will use to create another one, loaned_status. Instead of blank values and the names of the clubs the players are on loan from, it holds a yes/no flag telling us whether the player is on loan or not.
Lastly, we will use the column contract_valid_until to generate contract_expiry_in (spelled contart_expiry_in in the code below), so that we can use the number of years left on a player’s contract to assess his value.

In [None]:
# Swap zero valuations for 1 so that a later division by the value cannot hit zero.
players_df['value_eur'] = players_df['value_eur'].replace(0, 1)

In [None]:
# Row count of the target column (sanity check before scaling).
len(players_df['value_eur'])

In [None]:
# Min-max scale the target column to the [0, 1] range.
# Double brackets keep the data 2-D (n_rows, 1), which is the shape the scaler expects.
players_val = players_df[['value_eur']].to_numpy()

# Keep the fitted scaler: `value_scaler.inverse_transform(...)` is what converts scaled
# predictions back to euros later on.
value_scaler = MinMaxScaler()
val_eur_normalised = value_scaler.fit_transform(players_val)

# NOTE(review): after scaling, the smallest value maps to exactly 0, so the earlier
# 0 -> 1 replacement no longer guards a division by the *scaled* target — confirm which
# values the later division actually uses.

# Spot-check one arbitrary row of the scaled output.
val_eur_normalised[9989]


In [None]:
# Overwrite the target column with its scaled values, then re-check the row count.
players_df['value_eur'] = val_eur_normalised.ravel()
players_df['value_eur'].size

In [None]:
# Flag loan players: rows with a missing or empty `loaned_from` get 'no', every other row 'yes'.
not_on_loan = players_df['loaned_from'].fillna('').eq('')
players_df['loaned_status'] = not_on_loan.map({True: 'no', False: 'yes'})

In [None]:
# Years remaining on each contract, measured from the reference year below.
curr_year = 2019
# The column key keeps its existing spelling because `cont_var` refers to it by this exact name.
players_df['contart_expiry_in'] = players_df['contract_valid_until'] - curr_year
players_df

Next, we ought to perform some data pre-processing steps such as filling in missing data and categorizing and normalizing the columns. With the fast.ai library this is rather easy: we specify the pre-processing methods in a list and use it later, when creating our fast.ai DataBunch for training.

In [None]:
# Pre-processing steps handed to fastai when the tabular objects are built.
procs = [
    FillMissing,  # fill in missing data
    Categorify,   # categorize columns
    Normalize,    # normalize columns
]

#### Building the DataBunch


First, we’ll put all categorical fields in a list, cat_var, and all continuous fields in another list, cont_var. These two variables will be used to construct the fast.ai DataBunch.

In [None]:
# Continuous (numeric) input columns, grouped by column-name prefix for readability.
# NOTE(review): `wage_eur` and `release_clause_eur` may be closely tied to the target
# (`value_eur`) — confirm that using them as inputs is intended.
cont_var = [
    # player profile, ratings and money columns
    'age', 'height_cm', 'weight_kg',
    'overall', 'potential',
    'wage_eur', 'international_reputation', 'weak_foot', 'skill_moves', 'release_clause_eur',
    # un-prefixed attribute columns
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # gk_*
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_reflexes', 'gk_speed', 'gk_positioning',
    # attacking_*
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill_*
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing', 'skill_ball_control',
    # movement_*
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power_*
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
    # mentality_*
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending_*
    'defending_marking', 'defending_standing_tackle', 'defending_sliding_tackle',
    # goalkeeping_*
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes',
    # engineered column (key spelled exactly as it is created on the DataFrame)
    'contart_expiry_in',
]

In [None]:
# Categorical input columns.
cat_var = [
    'preferred_foot',
    'work_rate',
    'body_type',
    'team_position',
    'nation_position',
    'loaned_status',   # yes/no flag derived from `loaned_from`
    'player_traits',
]

Next up, we specify the dependent variable and keep only the specified continuous and categorical variables.

In [None]:
# Name the target column, then narrow the frame to the model inputs plus that target.
dep_var = 'value_eur'
selected_columns = [*cat_var, *cont_var, dep_var]
players_df = players_df.loc[:, selected_columns].copy()

In [None]:
# Inspect the frame after restricting it to the selected feature columns and the target.
players_df

After this, we will split the data into training and test sets, so that we have a test dataset on which to assess the trained model’s performance later, using data that it has never seen before. We split the data 80-20 here, and create a TabularPandas object from the test portion.

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
players_df_train, players_df_test = train_test_split(
    players_df,
    test_size=0.2,
    random_state=0,
)
(players_df_train.shape, players_df_test.shape)

In [None]:
# Test TabularPandas built from the held-out frame.
# NOTE(review): this object is created on its own from `players_df_test` (no `y_names`, no link to
# the training TabularPandas), so the procs are applied to the test rows independently rather than
# reusing what is fitted on the training rows. In fastai v2, `dls.test_dl(players_df_test)` (once
# `dls` exists) is the usual way to reuse training-time preprocessing — confirm intent.
test = TabularPandas(players_df_test, cat_names=cat_var, cont_names=cont_var, procs=procs)

In [None]:
# Split the training rows 80/20 into train/validation index lists.
# A fixed seed keeps the validation set identical across runs, matching the
# `random_state=0` used for the train/test split.
splits = RandomSplitter(valid_pct=0.2, seed=0)(range_of(players_df_train))

In [None]:
# Confirm the name of the target column.
dep_var

In [None]:
# Training TabularPandas: applies `procs` to the training frame and carries the train/valid split.
to = TabularPandas(
    players_df_train,
    procs=procs,
    cat_names=cat_var,
    cont_names=cont_var,
    y_names=dep_var,
    splits=splits,
)

In [None]:
# Build the DataLoaders from the processed training data, with a batch size of 64.
dls = to.dataloaders(bs=64)

In [None]:
# Show a sample batch from the DataLoaders as a sanity check.
dls.show_batch()

In [None]:
# Build the tabular model and its Learner: two hidden layers (200 and 100 units), RMSE as metric.
# In fastai v2 the dropout settings are model options and are passed through `tabular_config`:
# `ps` is the dropout per hidden layer and `embed_p` the embedding dropout (named `emb_drop` in v1).
# Passing `ps=` / `emb_drop=` straight to `tabular_learner` forwards them to the Learner instead.
learn = tabular_learner(
    dls,
    layers=[200, 100],
    config=tabular_config(ps=[0.001, 0.01], embed_p=0.01),
    metrics=rmse,
)

In [None]:
# Display the network architecture held by the learner.
learn.model

In [None]:
# Run the learning-rate finder to help select an appropriate learning rate.
learn.lr_find()

# We typically pick the point where the slope is steepest.
# learn.recorder.before_fit()

In [None]:
# Fit the model with the one-cycle policy: 10 epochs, maximum learning rate 5e-2
# (presumably the value read off the lr_find output above — confirm).
learn.fit_one_cycle(10, 5e-2)