In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available data files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Combine the directory and the file name into a single path.
        print(os.path.join(dirname, filename))

# First look at the data

A lot of cleaning is required before we can proceed with any kind of proper analysis:
1. Converting salary to numeric values by removing the '$' sign.
2. Extracting height in meters for numerical analysis.
3. Extracting weight in kg for numerical analysis.
4. Converting b_day to date format.
5. Handling missing values in team and college, if required.

In [None]:
# Load the player data set from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/nba2k20-player-dataset/nba2k20-full.csv')
# A bare trailing expression makes the notebook render the frame.
df

There are 14 columns in the data set:

In [None]:
# Show the column names as a one-column table.
pd.DataFrame(data=df.columns)

Only the team and college columns contain missing values.

In [None]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

# Data cleaning functions

## Replacing $ signs

In [None]:
def a(salary):
    """Strip currency formatting from a salary string.

    Removes the '$' sign, thousands separators (',') and surrounding
    whitespace so that the result can be cast to an integer,
    e.g. '$50000' -> '50000'.

    Parameters
    ----------
    salary : str
        Raw salary text.

    Returns
    -------
    str
        The salary text without '$', ',' or surrounding whitespace.
        A non-string argument (for example an already converted number)
        is returned unchanged.
    """
    if not isinstance(salary, str):
        # Nothing to strip; passing the value through keeps the cleaning
        # step safe to run again on an already converted column.
        return salary
    return salary.replace('$', '').replace(',', '').strip()

In [None]:
# Clean the salary column and cast it to integers.
# Casting to str first keeps this cell re-runnable: once the column is int64,
# a second run would otherwise hand integers to the string-cleaning helper.
df['salary'] = df['salary'].astype(str).apply(a)
df['salary'] = df['salary'].astype('int64')

In [None]:
# Re-display the frame after the salary conversion.
df

In [None]:
# Re-check dtypes and non-null counts after the cast; salary should now be int64.
df.info()

# Some stats about salaries

The minimum salary is USD 50,000, while the maximum salary is USD 40,231,758. The mean is much higher than the median, which indicates that the distribution is right-skewed: a small number of very high salaries (outliers) pull the mean upwards.

In [None]:
# Summary statistics of the salary column, laid out as a single row.
df['salary'].describe().to_frame().T

In [None]:
# Two side-by-side views of the salary distribution:
# a violin plot (left) and a density histogram with a KDE curve (right).
f, axes = plt.subplots(1, 2, figsize=(15, 5))

sns.violinplot(data=df, y='salary', ax=axes[0])
axes[0].set_title('Salary distribution (violin)')

if hasattr(sns, 'histplot'):
    # distplot is deprecated since seaborn 0.11; a density histogram with
    # a KDE overlay is its documented replacement.
    sns.histplot(data=df, x='salary', kde=True, stat='density', ax=axes[1])
else:
    # Older seaborn without histplot: keep the legacy call.
    sns.distplot(df['salary'], ax=axes[1])
axes[1].set_title('Salary distribution (histogram + KDE)')

plt.show()

# Positions

The positions are abbreviated as follows:

- G - Guard
- F - Forward
- C - Center

The most popular position is guard, and it is also relatively highly paid on average (see the summary table below).

In [None]:
# Number of players per position, as a single row. display() is required
# because a notebook only renders the last expression of a cell.
display(pd.DataFrame(df['position'].value_counts()).T)

# Select the salary column *before* aggregating: calling .mean()/.median()
# on the whole grouped frame raises on pandas >= 2.0 because the text
# columns cannot be averaged.
salary_grouped = df.groupby(by='position')['salary']

# Per-position statistics, each sorted from highest to lowest value.
mean_by_position = pd.DataFrame(salary_grouped.mean().sort_values(ascending=False))
median_by_position = pd.DataFrame(salary_grouped.median().sort_values(ascending=False))
count_by_position = pd.DataFrame(salary_grouped.count().sort_values(ascending=False))

# One table indexed by position with 'mean', 'median' and 'count' columns,
# built in a single aggregation instead of two merges plus a rename.
salary_by_position = salary_grouped.agg(['mean', 'median', 'count'])
salary_by_position

In [None]:
# Mean and median salary go on the left axis and the player count on a
# secondary right axis: counts are on a much smaller scale than dollar
# salaries and would otherwise be flattened against the x-axis.
salary_by_position.plot(
    figsize=(15, 5),
    secondary_y=['count'],
    title='Salary and player count by position',
)
plt.show()

In [None]:
# Bar chart of the number of players at each position.
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
sns.countplot(x='position', data=df, ax=axes)
plt.show()

In [None]:
# Salary distribution per position, drawn as one violin per position.
f, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))

sns.violinplot(x='position', y='salary', data=df, ax=axes)
plt.show()