In [1]:
import pandas as pd

data = pd.read_csv('./FIFA2.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18159 entries, 0 to 18158
Data columns (total 83 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      18159 non-null  object 
 1   Age                       18159 non-null  int64  
 2   Nationality               18159 non-null  object 
 3   Overall                   18159 non-null  int64  
 4   Potential                 18159 non-null  int64  
 5   Club                      17918 non-null  object 
 6   Value                     18159 non-null  int64  
 7   Wage                      18159 non-null  int64  
 8   Special                   18159 non-null  int64  
 9   Preferred Foot            18159 non-null  object 
 10  International Reputation  18159 non-null  float64
 11  Weak Foot                 18159 non-null  float64
 12  Skill Moves               18159 non-null  float64
 13  Work Rate                 18159 non-null  object 
 14  Body T

In [2]:
map_position = {'FW':0, 'MD':1, 'DF':2, 'GK':3}
col = ['Position simplified']
data[col] = data[col].applymap(map_position.get)
data['Position simplified']

0        0
1        0
2        0
3        3
4        1
        ..
18154    1
18155    0
18156    0
18157    0
18158    1
Name: Position simplified, Length: 18159, dtype: int64

In [3]:
X = data.loc[:, 'Crossing':'GKReflexes']
XX = X.drop('Strength', axis=1)
y = data.loc[:, 'Position simplified']
xy = pd.concat([X, y], axis=1)
xxy = pd.concat([XX, y], axis=1)

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [5]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix
def get_model_train_eval(model, X_train=None, X_test=None, y_train=None, y_test=None):
    model.fit(X_train, y_train)
    print('{} Test Accuracy: {}%'.format(model, round(model.score(X_test, y_test)*100, 2)))


    pred_model = model.predict(X_test)
    print('{} report:{}\n'.format(model.__class__.__name__, classification_report(y_test, pred_model)))

In [11]:
import multiprocessing
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

param_grid = [
    {'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50, 100.0], 
    'logisticregression__intercept_scaling': [1],
    'logisticregression__max_iter': [50, 75, 100, 125, 150],
    'logisticregression__solver': ["lbfgs"],
    'logisticregression__multi_class' : ["multinomial"]
    }
]

grid_model_lr = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lr.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   13.9s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:   41.8s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50,
                                                    100.0],
                          'logisticregression__intercept_scaling': [1],
                          'logisticregression__max_iter': [50, 75, 100, 125,
                                                           150],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__solver': ['lbfgs']}],
             verbose=True)

In [12]:
grid_model_lr.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, max_iter=75,
                                    multi_class='multinomial',
                                    random_state=0))])

In [13]:
get_model_train_eval(grid_model_lr, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   15.1s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:   39.4s finished
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50,
                                                    100.0],
                          'logisticregression__intercept_scaling': [1],
                          'logisticregression__max_iter': [50, 75, 100, 125,
                                                           150],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregressi

In [14]:
XX_train, XX_test, yy_train, yy_test = train_test_split(XX, y, test_size=0.2, random_state=0, stratify=y)

In [15]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

param_grid = [
    {'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50, 100.0], 
    'logisticregression__intercept_scaling': [1],
    'logisticregression__max_iter': [50, 75, 100, 125, 150],
    'logisticregression__solver': ["lbfgs"],
    'logisticregression__multi_class' : ["multinomial"]
    }
]

grid_model_lr2 = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lr2.fit(XX_train, yy_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   11.4s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:   36.7s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50,
                                                    100.0],
                          'logisticregression__intercept_scaling': [1],
                          'logisticregression__max_iter': [50, 75, 100, 125,
                                                           150],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__solver': ['lbfgs']}],
             verbose=True)

In [17]:
grid_model_lr2.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, max_iter=75,
                                    multi_class='multinomial',
                                    random_state=0))])

In [18]:
get_model_train_eval(grid_model_lr2, XX_train, XX_test, yy_train, yy_test)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.0s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:   28.4s finished
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.01, 0.1, 1.0, 10.0, 50,
                                                    100.0],
                          'logisticregression__intercept_scaling': [1],
                          'logisticregression__max_iter': [50, 75, 100, 125,
                                                           150],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregressi

In [22]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

param_grid = [
    {'logisticregression__C': [0.06, 0.08, 0.1, 1.2, 1.4], 
    'logisticregression__intercept_scaling': [0, 1, 2],
    'logisticregression__max_iter': [65, 70, 75, 80, 85, 90],
    'logisticregression__solver': ["lbfgs"],
    'logisticregression__multi_class' : ["multinomial"]
    }
]

grid_model_lr3 = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lr3.fit(X_train, y_train)

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   38.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 270 out of 270 | elapsed:  1.7min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.06, 0.08, 0.1, 1.2, 1.4],
                          'logisticregression__intercept_scaling': [0, 1, 2],
                          'logisticregression__max_iter': [65, 70, 75, 80, 85,
                                                           90],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__solver': ['lbfgs']}],
             verbose=True)

In [23]:
grid_model_lr3.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.08, intercept_scaling=0, max_iter=70,
                                    multi_class='multinomial',
                                    random_state=0))])

In [24]:
get_model_train_eval(grid_model_lr3, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 90 candidates, totalling 270 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   52.7s
[Parallel(n_jobs=8)]: Done 270 out of 270 | elapsed:  1.3min finished
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.06, 0.08, 0.1, 1.2, 1.4],
                          'logisticregression__intercept_scaling': [0, 1, 2],
                          'logisticregression__max_iter': [65, 70, 75, 80, 85,
                                                           90],
                          'logisticregression__multi_class': ['multinomial'],
                          'logis

In [25]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))

param_grid = [
    {'logisticregression__C': [0.075, 0.08, 0.085], 
    'logisticregression__intercept_scaling': [0,0.25,0.5,0.75],
    'logisticregression__max_iter': [68, 69, 70, 71, 72],
    'logisticregression__solver': ["lbfgs"],
    'logisticregression__multi_class' : ["multinomial"]
    }
]

grid_model_lr4 = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lr4.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   11.2s
[Parallel(n_jobs=8)]: Done 180 out of 180 | elapsed:   51.0s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.075, 0.08, 0.085],
                          'logisticregression__intercept_scaling': [0, 0.25,
                                                                    0.5, 0.75],
                          'logisticregression__max_iter': [68, 69, 70, 71, 72],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__solver': ['lbfgs']}],
             verbose=True)

In [26]:
grid_model_lr4.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.08, intercept_scaling=0, max_iter=69,
                                    multi_class='multinomial',
                                    random_state=0))])

In [27]:
get_model_train_eval(grid_model_lr4, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   10.1s
[Parallel(n_jobs=8)]: Done 180 out of 180 | elapsed:   44.5s finished
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.075, 0.08, 0.085],
                          'logisticregression__intercept_scaling': [0, 0.25,
                                                                    0.5, 0.75],
                          'logisticregression__max_iter': [68, 69, 70, 71, 72],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__solver': ['lbfgs']}],
             ve

In [40]:
param_grid = [
    {'logisticregression__C': [0.06, 0.08, 0.1], 
    'logisticregression__intercept_scaling': [-90, -85, -80, -70],
    'logisticregression__max_iter': [64, 69, 74],
    'logisticregression__solver': ["lbfgs"],
    'logisticregression__multi_class' : ["multinomial"],
    'logisticregression__penalty' : ['l1', 'l2']
    }
]

grid_model_lr5 = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=multiprocessing.cpu_count(),
    cv=3,
    verbose=True
)

grid_model_lr5.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   36.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   58.2s
[Parallel(n_jobs=8)]: Done 216 out of 216 | elapsed:  1.1min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.06, 0.08, 0.1],
                          'logisticregression__intercept_scaling': [-90, -85,
                                                                    -80, -70],
                          'logisticregression__max_iter': [64, 69, 74],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisticregression__penalty': ['l1', 'l2'],
                          'logisticregression__solver': ['lbfgs']}],
             verbose=True)

In [41]:
grid_model_lr5.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.08, intercept_scaling=-90, max_iter=69,
                                    multi_class='multinomial',
                                    random_state=0))])

In [42]:
get_model_train_eval(grid_model_lr5, X_train, X_test, y_train, y_test)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  46 tasks      | elapsed:    7.1s
[Parallel(n_jobs=8)]: Done 200 tasks      | elapsed:   26.0s
[Parallel(n_jobs=8)]: Done 216 out of 216 | elapsed:   28.1s finished
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('logisticregression',
                                        LogisticRegression(random_state=0))]),
             n_jobs=8,
             param_grid=[{'logisticregression__C': [0.06, 0.08, 0.1],
                          'logisticregression__intercept_scaling': [-90, -85,
                                                                    -80, -70],
                          'logisticregression__max_iter': [64, 69, 74],
                          'logisticregression__multi_class': ['multinomial'],
                          'logisti