# Setting up the environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the data

In [None]:
# Load the players_20.csv table from the Kaggle input directory into a DataFrame,
# then print the summary produced by DataFrame.info().
player_df = pd.read_csv("/kaggle/input/fifa-20-complete-player-dataset/players_20.csv")
player_df.info()

In [None]:
# Preview the first rows of the table.
player_df.head()

In [None]:
# Show three randomly chosen players, transposed so each column of the table
# becomes a row (easier to read for a wide frame).
# A fixed random_state keeps the displayed sample identical across re-runs.
player_df.sample(3, random_state=42).T

In [None]:
# Collect the column names into a NumPy array and print them all.
features = np.array(player_df.columns)
print(features)

# Discarding useless columns

In [None]:
# Remove the two columns this notebook treats as useless.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: running it a second time no longer raises KeyError because
# the columns are already gone.
player_df = player_df.drop(columns=['sofifa_id', 'player_url'], errors='ignore')
player_df.head()

# Exploring the NaN values

In [None]:
# Count the missing values in every column and keep the columns that have any.
features = np.array(player_df.columns)
nan_quantity = player_df.isnull().sum()  # Series: column name -> number of NaNs
nan_check = nan_quantity > 0

# Filter the Series with the boolean mask directly. This avoids indexing a
# label-indexed Series by integer position (deprecated in pandas), avoids
# round-tripping the counts through a string array, and also works when no
# column has missing values.
nan_features_df = (
    nan_quantity[nan_check]
    .rename_axis('column')
    .reset_index(name='sum')
)

# Show up to 8 of the affected columns. Capping the sample size avoids a
# ValueError when fewer than 8 columns qualify; the seed keeps it reproducible.
nan_features_df.sample(min(8, len(nan_features_df)), random_state=42)

In [None]:
def _nan_bucket(mask):
    """Return a two-column frame ('column', 'sum') for the columns selected by ``mask``.

    ``mask`` is a boolean Series aligned with ``nan_quantity``. Filtering by
    mask replaces the earlier integer-position lookups (deprecated for a
    label-indexed Series) and yields an empty frame, not an error, when no
    column falls into the bucket.
    """
    return nan_quantity[mask].rename_axis('column').reset_index(name='sum')


# Split the columns into three buckets by how many values are missing.

# 5000 or more missing values.
nan_check_5k = nan_quantity >= 5000
nan_features_df_5k = _nan_bucket(nan_check_5k)

# More than 1000 but fewer than 5000 missing values.
nan_check_1k = (nan_quantity > 1000) & (nan_quantity < 5000)
nan_features_df_1k = _nan_bucket(nan_check_1k)

# Between 1 and 1000 missing values (inclusive).
nan_check_s = (nan_quantity <= 1000) & (nan_quantity > 0)
nan_features_df_s = _nan_bucket(nan_check_s)

In [None]:
# One bar chart per NaN bucket: (frame to plot, figure size, title).
nan_bucket_plots = [
    (nan_features_df_s, (12, 8), "NaN Values lesser than 1k (and greater than 0)"),
    (nan_features_df_1k, (40, 8), "NaN Values greater than 1k and lesser than 5k"),
    (nan_features_df_5k, (20, 8), "NaN Values greater than 5k"),
]

for bucket_df, bucket_figsize, bucket_title in nan_bucket_plots:
    # Create the Axes explicitly so `ax` really is the Axes; previously it was
    # bound to the Text object returned by set_title().
    fig, ax = plt.subplots(figsize=bucket_figsize)
    sns.barplot(x="column", y="sum", data=bucket_df, ax=ax)
    ax.set_title(bucket_title)
    # Column names are long identifiers; vertical labels keep them from overlapping.
    ax.tick_params(axis="x", rotation=90)

Some columns, such as 'player_tags' and 'loaned_from', have a large number of NaN values, so it would be wise to drop them.

In [None]:
# List the columns that have no missing values at all.
no_nan_check = nan_quantity == 0
# Select the column labels with the boolean mask instead of looking the mask up
# by integer position, which is deprecated for a label-indexed Series.
no_nan_features_df = pd.DataFrame({'columns': nan_quantity[no_nan_check].index})
no_nan_features_df

# Height, Weight and Age Trends

In [None]:
# Three side-by-side histograms (height, weight, age) sharing one y-axis.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5), sharey=True)
height_ax, weight_ax, age_ax = axes
sns.histplot(data=player_df, x="height_cm", ax=height_ax)
sns.histplot(data=player_df, x="weight_kg", ax=weight_ax)
sns.histplot(data=player_df, x="age", ax=age_ax)

We can see that most of the players are around 180 cm tall, weigh around 75 kg, and are about 23 years old.

# Age vs Wage

In [None]:
# Scatter plot of wage against age on a square 10x10 figure.
plt.figure(figsize=(10, 10))
wage_age_ax = sns.scatterplot(x="age", y="wage_eur", data=player_df)
wage_age_ax.set_title("Wage vs age")

Players earn the highest wages between the ages of 24 and 34.

# Preferred Foot

In [None]:
# Bar chart of how many players fall under each 'preferred_foot' value.
sns.countplot(x='preferred_foot',data=player_df)

Most of the players prefer their right foot.

# Country of Origin

In [None]:
# Plotly is used for the choropleth map in this section.
import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot
# Initialise Plotly's offline notebook mode before any iplot() call.
init_notebook_mode(connected=True)

In [None]:
# Number of players per nationality, most common first.
#
# The choropleth uses country names, so the UK home nations are merged into
# 'United Kingdom' BEFORE counting. Relabelling only 'England' after counting
# left the other home nations out of the UK total.
# NOTE(review): assumes the dataset spells them 'Scotland', 'Wales' and
# 'Northern Ireland'; labels that do not occur are simply ignored by replace().
uk_home_nations = ['England', 'Scotland', 'Wales', 'Northern Ireland']
nations = (
    player_df['nationality']
    .replace(uk_home_nations, 'United Kingdom')
    .value_counts()  # already sorted by count, descending
    .rename_axis('nationality')
    .reset_index(name='num_of_players')
)
nations.head()

The largest share of players comes from the UK.

In [None]:
# Choropleth trace: one entry per nationality, coloured by its player count.
data = {
    'type': 'choropleth',
    'colorscale': 'thermal',
    'locations': nations['nationality'],
    'locationmode': "country names",
    'z': nations['num_of_players'],
    'text': nations['nationality'],
    'colorbar': {'title': 'Number of players'},
}

In [None]:
# Figure layout: the map title and a Mercator projection for the geo axes.
layout = {
    'title': 'Nationality Distribution',
    'geo': {'projection': {'type': 'mercator'}},
}

In [None]:
# Build the figure from the trace dict and the layout dict, then render it in the notebook.
choromap = go.Figure(data = [data],layout = layout)
iplot(choromap,validate=False)

# International Reputation

In [None]:
# Plot the same international-reputation counts twice: once on a linear y-axis
# and once on a logarithmic one, so the small high-reputation groups are visible.
fig, axes = plt.subplots(1, 2, figsize=(15, 8), sharey=False)
reputation_panels = [("linear", "Linear"), ("log", "Logarithmic")]  # title typo fixed
for ax, (y_scale, panel_title) in zip(axes, reputation_panels):
    sns.countplot(x='international_reputation', data=player_df, ax=ax)
    ax.set_yscale(y_scale)
    ax.set_title(panel_title)

The number of players differs by almost a factor of 10 from one reputation level to the next.

# Month of Birth

In [None]:
# Parse the 'dob' column as dates, store the month number in a new
# 'month_of_birth' column, and plot how many players fall in each month.
birth_dates = pd.to_datetime(player_df['dob'])
player_df['month_of_birth'] = birth_dates.dt.month
sns.countplot(data=player_df, x='month_of_birth')

1. More players were born in February than in any other month.
2. Most of the players were born at the start of the year.

Consider upvoting :)