In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_nba = pd.read_csv("../datasets/nba.csv")

In [3]:
df_nba.shape

(458, 9)

In [4]:
df_nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [5]:
df_nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [6]:
df_nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [7]:
df_nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [8]:
df_nba.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [9]:
df_nba.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [10]:
df_nba.dropna(subset=["Name"], inplace=True)

In [11]:
df_nba.isnull().sum()

Name         0
Team         0
Number       0
Position     0
Age          0
Height       0
Weight       0
College     84
Salary      11
dtype: int64

In [12]:
mean = np.mean(df_nba["Salary"])
df_nba["Salary"] = df_nba["Salary"].fillna(mean).astype(int)

In [13]:
df_nba.isnull().sum()

Name         0
Team         0
Number       0
Position     0
Age          0
Height       0
Weight       0
College     84
Salary       0
dtype: int64

In [14]:
df_nba["College"] = df_nba["College"].ffill()

In [15]:
df_nba.isnull().sum()

Name        0
Team        0
Number      0
Position    0
Age         0
Height      0
Weight      0
College     0
Salary      0
dtype: int64

In [16]:
df_nba["Height"]

0       6-2
1       6-6
2       6-5
3       6-5
4      6-10
       ... 
452    6-10
453     6-3
454     6-1
455     7-3
456     7-0
Name: Height, Length: 457, dtype: object

In [17]:
def convert_to_cm(height):
    if pd.isnull(height):
        return np.nan
    feet, inch = height.split("-")
    return int(int(feet) * 30.48 + int(inch) * 2.54)

In [18]:
df_nba["Height"] = df_nba["Height"].apply(convert_to_cm)

In [19]:
df_nba["Height"]

0      187
1      198
2      195
3      195
4      208
      ... 
452    208
453    190
454    185
455    220
456    213
Name: Height, Length: 457, dtype: int64

In [20]:
numeric_cols = df_nba.select_dtypes(include=["float64"]).columns
for col in numeric_cols:
    df_nba[col] = df_nba[col].astype(int)

In [21]:
df_nba.dtypes

Name        object
Team        object
Number       int64
Position    object
Age          int64
Height       int64
Weight       int64
College     object
Salary       int64
dtype: object

In [22]:
object_cols = df_nba.select_dtypes(include=["object"]).columns
for col in object_cols:
    df_nba[col] = df_nba[col].astype("category")

In [23]:
df_nba.dtypes

Name        category
Team        category
Number         int64
Position    category
Age            int64
Height         int64
Weight         int64
College     category
Salary         int64
dtype: object

In [24]:
# df_nba = pd.get_dummies(
#     df_nba,
#     columns=object_cols,
# ).astype(int)

In [25]:
df_nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0,PG,25,187,180,Texas,7730337
1,Jae Crowder,Boston Celtics,99,SF,25,198,235,Marquette,6796117
2,John Holland,Boston Celtics,30,SG,27,195,205,Boston University,4842684
3,R.J. Hunter,Boston Celtics,28,SG,22,195,185,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8,PF,29,208,231,Georgia State,5000000
...,...,...,...,...,...,...,...,...,...
452,Trey Lyles,Utah Jazz,41,PF,20,208,234,Kentucky,2239800
453,Shelvin Mack,Utah Jazz,8,PG,26,190,203,Butler,2433333
454,Raul Neto,Utah Jazz,25,PG,24,185,179,Butler,900000
455,Tibor Pleiss,Utah Jazz,21,C,26,220,256,Butler,2900000


In [26]:
print(list(df_nba.columns))

['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight', 'College', 'Salary']


In [46]:
age_values = df_nba["Age"].value_counts().reset_index()
age_values

Unnamed: 0,Age,count
0,24,47
1,25,45
2,27,41
3,23,41
4,26,36
5,28,31
6,30,31
7,29,28
8,22,26
9,31,22


In [28]:
age_mean = pd.DataFrame(
    df_nba.groupby("Age", observed=False)["Salary"].mean()
).reset_index()

age_mean.columns = ["Age", "Mean"]

age_median = pd.DataFrame(
    df_nba.groupby("Age", observed=False)["Salary"].median()
).reset_index()

age_median.columns = ["Age", "Median"]

age_mode = (
    df_nba.groupby("Age", observed=False)["Salary"]
    .apply(lambda x: x.mode().mean())
    .reset_index()
)
age_mode.columns = ["Age", "Mode"]

age_std = pd.DataFrame(
    df_nba.groupby("Age", observed=False)["Salary"].std()
).reset_index()

age_std.columns = ["Age", "Standard Deviation"]

age_df = (
    age_mean.merge(age_median, on="Age")
    .merge(age_mode, on="Age")
    .merge(age_std, on="Age")
)
np.round(age_df, 2)

Unnamed: 0,Age,Mean,Median,Mode,Standard Deviation
0,19,1930440.0,1930440.0,1930440.0,279165.76
1,20,2725790.84,2481720.0,525093.0,1510913.43
2,21,2067379.63,1584480.0,2067379.63,1412350.36
3,22,2357963.46,1793880.0,2357963.46,1517378.33
4,23,2171718.8,1201440.0,845059.0,2728808.16
5,24,3830295.36,1535880.0,845059.0,4702753.1
6,25,3951130.0,1358880.0,845059.0,4508413.97
7,26,6866566.25,3325000.0,947276.0,6100470.79
8,27,6632008.07,3425510.0,6269301.75,6752121.84
9,28,5101559.42,4389607.0,1100602.0,4244345.0


In [29]:
position_mean = pd.DataFrame(
    df_nba.groupby("Position", observed=False)["Salary"].mean()
).reset_index()
position_mean.columns = ["Position", "Mean"]

position_median = pd.DataFrame(
    df_nba.groupby("Position", observed=False)["Salary"].median()
).reset_index()
position_median.columns = ["Position", "Mean"]

position_mode = pd.DataFrame(
    df_nba.groupby("Position", observed=False)["Salary"].apply(
        lambda x: x.mode().mean()
    )
).reset_index()
position_mode.columns = ["Position", "Mode"]

position_std = pd.DataFrame(
    df_nba.groupby("Position", observed=False)["Salary"].std()
).reset_index()

position_std.columns = ["Position", "Standard Deviation"]

position_df = (
    position_mean.merge(position_median, on="Position")
    .merge(position_mode, on="Position")
    .merge(position_std, on="Position")
)
np.round(position_df, 2)

Unnamed: 0,Position,Mean_x,Mean_y,Mode,Standard Deviation
0,C,5967052.0,3811060.0,947276.0,5787988.91
1,PF,4570889.02,3000000.0,896167.5,4727010.29
2,PG,5067605.51,3106518.0,2894980.0,4939767.97
3,SF,4857219.53,2041080.0,845059.0,5975996.94
4,SG,4034355.98,2394102.5,947276.0,4426657.96


In [30]:
team_mean = pd.DataFrame(
    df_nba.groupby("Team", observed=False)["Salary"].mean()
).reset_index()
team_mean.columns = ["Team", "Mean"]

team_median = pd.DataFrame(
    df_nba.groupby("Team", observed=False)["Salary"].median()
).reset_index()
team_median.columns = ["Team", "Mean"]

team_mode = pd.DataFrame(
    df_nba.groupby("Team", observed=False)["Salary"].apply(lambda x: x.mode().mean())
).reset_index()
team_mode.columns = ["Team", "Mode"]

team_std = pd.DataFrame(
    df_nba.groupby("Team", observed=False)["Salary"].std()
).reset_index()

team_std.columns = ["Team", "Standard Deviation"]

team_df = (
    team_mean.merge(team_median, on="Team")
    .merge(team_mode, on="Team")
    .merge(team_std, on="Team")
)
np.round(team_df, 2)

Unnamed: 0,Team,Mean_x,Mean_y,Mode,Standard Deviation
0,Atlanta Hawks,4860196.67,2854940.0,1000000.0,5194508.14
1,Boston Celtics,4225583.47,3425510.0,4225583.47,3036395.96
2,Brooklyn Nets,3501898.33,1335480.0,947276.0,5317816.85
3,Charlotte Hornets,5222728.0,4204200.0,947276.0,4538601.37
4,Chicago Bulls,5785558.53,2380440.0,5785558.53,6251087.81
5,Cleveland Cavaliers,7455424.87,4950000.0,947276.0,7484115.8
6,Dallas Mavericks,4746582.13,3950313.0,4746582.13,5030279.04
7,Denver Nuggets,4330974.27,3000000.0,4330974.27,4165468.08
8,Detroit Pistons,4477884.2,2891760.0,4477884.2,4668478.27
9,Golden State Warriors,5924599.8,3815000.0,5924599.8,5664281.67


# Iris Dataset


In [31]:
df = pd.read_csv("../datasets/iris.csv")

In [32]:
df.shape

(150, 5)

In [33]:
df.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [34]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [35]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [37]:
df.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [38]:
df["species"] = df["species"].astype("category")

In [39]:
df.dtypes

sepal_length     float64
sepal_width      float64
petal_length     float64
petal_width      float64
species         category
dtype: object

In [40]:
df["species"].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [41]:
pd.DataFrame(df.groupby("species", observed=False).describe()).reset_index()

Unnamed: 0_level_0,species,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_length,sepal_width,...,petal_length,petal_length,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width,petal_width
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,...,75%,max,count,mean,std,min,25%,50%,75%,max
0,setosa,50.0,5.006,0.35249,4.3,4.8,5.0,5.2,5.8,50.0,...,1.575,1.9,50.0,0.244,0.10721,0.1,0.2,0.2,0.3,0.6
1,versicolor,50.0,5.936,0.516171,4.9,5.6,5.9,6.3,7.0,50.0,...,4.6,5.1,50.0,1.326,0.197753,1.0,1.2,1.3,1.5,1.8
2,virginica,50.0,6.588,0.63588,4.9,6.225,6.5,6.9,7.9,50.0,...,5.875,6.9,50.0,2.026,0.27465,1.4,1.8,2.0,2.3,2.5


In [42]:
species_list = df["species"].unique()
features = list(df.select_dtypes(include=["float64"]).columns)

In [43]:
pd.DataFrame(species_list, columns=["Species"])

Unnamed: 0,Species
0,setosa
1,versicolor
2,virginica


In [44]:
stats_df = pd.DataFrame()

for species in species_list:
    species_df = df[df["species"] == species]

    for feature in features:
        data = species_df[feature]

        mean = np.mean(data)
        median = np.median(data)
        mode = data.mode().mean()
        std = np.std(data)

        min_val = np.min(data)
        max_val = np.max(data)

        p10 = np.percentile(data, 10)
        q1 = np.quantile(data, 0.25)
        p50 = np.percentile(data, 50)
        q3 = np.quantile(data, 0.75)
        p90 = np.percentile(data, 90)

        new_row = pd.DataFrame(
            {
                "species": [species],
                "feature": [feature],
                "min": [min_val],
                "max": [max_val],
                "range": [max_val - min_val],
                "mean": [mean],
                "median": [median],
                "mode": [mode],
                "std": [std],
                "10%": [p10],
                "25%": [q1],
                "50%": [p50],
                "75%": [q3],
                "90%": [p90],
                "iqr": [q3 - q1],
            }
        )

        stats_df = pd.concat([stats_df, new_row], axis=0, ignore_index=True)

In [45]:
np.round(stats_df, 2)

Unnamed: 0,species,feature,min,max,range,mean,median,mode,std,10%,25%,50%,75%,90%,iqr
0,setosa,sepal_length,4.3,5.8,1.5,5.01,5.0,5.05,0.35,4.59,4.8,5.0,5.2,5.41,0.4
1,setosa,sepal_width,2.3,4.4,2.1,3.42,3.4,3.4,0.38,3.0,3.12,3.4,3.68,3.9,0.55
2,setosa,petal_length,1.0,1.9,0.9,1.46,1.5,1.5,0.17,1.3,1.4,1.5,1.58,1.7,0.18
3,setosa,petal_width,0.1,0.6,0.5,0.24,0.2,0.2,0.11,0.1,0.2,0.2,0.3,0.4,0.1
4,versicolor,sepal_length,4.9,7.0,2.1,5.94,5.9,5.6,0.51,5.38,5.6,5.9,6.3,6.7,0.7
5,versicolor,sepal_width,2.0,3.4,1.4,2.77,2.8,3.0,0.31,2.3,2.52,2.8,3.0,3.11,0.48
6,versicolor,petal_length,3.0,5.1,2.1,4.26,4.35,4.5,0.47,3.59,4.0,4.35,4.6,4.8,0.6
7,versicolor,petal_width,1.0,1.8,0.8,1.33,1.3,1.3,0.2,1.0,1.2,1.3,1.5,1.51,0.3
8,virginica,sepal_length,4.9,7.9,3.0,6.59,6.5,6.3,0.63,5.8,6.22,6.5,6.9,7.61,0.68
9,virginica,sepal_width,2.2,3.8,1.6,2.97,3.0,3.0,0.32,2.59,2.8,3.0,3.18,3.31,0.38
