In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as sts

In [76]:
ds = pd.read_csv('nba.csv')
ds.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [77]:
ds.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [78]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [79]:
ds['Name']=ds['Name'].astype('string')
ds['Team']=ds['Team'].astype('string')
ds['College']=ds['College'].astype('string')
ds['Position']=ds['Position'].astype('string')

In [80]:
ds.dtypes

Name        string[python]
Team        string[python]
Number             float64
Position    string[python]
Age                float64
Height              object
Weight             float64
College     string[python]
Salary             float64
dtype: object

In [81]:
rows = ds.loc[ds['Age'].isna()] #return row with null value
print(rows)


     Name  Team  Number Position  Age Height  Weight College  Salary
457  <NA>  <NA>     NaN     <NA>  NaN    NaN     NaN    <NA>     NaN


In [82]:
ds.describe()

Unnamed: 0,Number,Age,Weight,Salary
count,457.0,457.0,457.0,446.0
mean,17.678337,26.938731,221.522976,4842684.0
std,15.96609,4.404016,26.368343,5229238.0
min,0.0,19.0,161.0,30888.0
25%,5.0,24.0,200.0,1044792.0
50%,13.0,26.0,220.0,2839073.0
75%,25.0,30.0,240.0,6500000.0
max,99.0,40.0,307.0,25000000.0


In [83]:
#since all values are null delete the row
ds.dropna(subset="Age",inplace=True)

In [84]:
ds.isnull().sum()

Name         0
Team         0
Number       0
Position     0
Age          0
Height       0
Weight       0
College     84
Salary      11
dtype: int64

In [85]:
ds['Age']=ds['Age'].astype(int) #convertable only when there is no null value

In [86]:
ds.dtypes

Name        string[python]
Team        string[python]
Number             float64
Position    string[python]
Age                  int64
Height              object
Weight             float64
College     string[python]
Salary             float64
dtype: object

In [87]:
#replace college unknow since string dtype cannot take mode
ds['College'].fillna('Unknown',inplace=True)
ds['Salary']=ds['Salary'].fillna(ds['Salary'].mean()) #replace NaN salary with mean

In [88]:
ds.isnull().sum()

Name        0
Team        0
Number      0
Position    0
Age         0
Height      0
Weight      0
College     0
Salary      0
dtype: int64

In [89]:
mean_age = ds['Age'].mean()
mode_age = ds['Age'].mode()
median_age = ds['Age'].median()
sd_age = sts.stdev(ds['Age'])
max_age = ds['Age'].max()
min_age = ds['Age'].min()
count_age = ds['Age'].value_counts()
print('Mean of age: ',mean_age)
print('Mode of age: ',mode_age)
print('Median of age: ',median_age)
print('Standard Deviation: ',sd_age)
print('Max: ',max_age)
print('Min: ',min_age)
print('Value Count: ',count_age)

Mean of age:  26.938730853391686
Mode of age:  0    24
Name: Age, dtype: int64
Median of age:  26.0
Standard Deviation:  4.404016424405833
Max:  40
Min:  19
Value Count:  Age
24    47
25    45
27    41
23    41
26    36
28    31
30    31
29    28
22    26
31    22
20    19
21    19
33    14
32    13
34    10
36    10
35     9
37     4
38     4
40     3
39     2
19     2
Name: count, dtype: int64


In [90]:
#require to give range for bins
ds['Age'].min() #min

19

In [91]:
ds['Age'].max() #max 

40

In [92]:
#map categorical data with quantitative for exampla age 19-20 -> their salary mean, median mode
bins = [19,25,31,36,40]  #edge of the age range
labels = ["19-24","25-30","31-35","36-40"] #name given to each range
ds["Age_Group"] = pd.cut(ds["Age"],bins=bins,labels=labels) #takes each value in ds["Age"] checks in which bin it falls and replace number with its range

In [93]:
ds["Age_Group"].value_counts() 

Age_Group
19-24    197
25-30    189
31-35     56
36-40     13
Name: count, dtype: int64

In [63]:
# group by age and show statistical data in context to salary 
group_age = ds.groupby(ds['Age_Group']) 
group_age['Salary'].describe()

  group_age = ds.groupby(ds['Age_Group'])


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Age_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
19-24,197.0,3041840.0,3552554.0,30888.0,947276.0,1662360.0,3533333.0,16407501.0
25-30,189.0,6635459.0,5825556.0,55722.0,1500000.0,4842684.0,10151612.0,22970500.0
31-35,56.0,5113016.0,5283815.0,200600.0,1323709.25,3646250.0,6350000.0,22875000.0
36-40,13.0,5351744.0,6508388.0,222888.0,947726.0,4088019.0,5250000.0,25000000.0


In [54]:
ds["Age_Group"].describe()

count       455
unique        4
top       19-24
freq        197
Name: Age_Group, dtype: object

In [46]:
min_sal = ds['Salary'].min()
max_sal = ds['Salary'].max()
print("Min Sal: ",min_sal)
print("Max_Sal: ",max_sal)

Min Sal:  30888.0
Max_Sal:  25000000.0


In [51]:
salary_bin = [0, 500000, 1000000, 1500000, 2000000, 2500000]
salary_labels = ["Below 5L",'5L-10L','10L-15L','15L-20L','20L-25L']
ds['Salary_Group'] = pd.cut(ds['Salary'],bins=salary_bin,labels=salary_labels)

In [52]:
ds['Salary_Group'].value_counts()

Salary_Group
5L-10L      81
10L-15L     53
20L-25L     30
Below 5L    25
15L-20L     22
Name: count, dtype: int64

In [65]:
salary_group = ds.groupby("Salary_Group")
salary_group["Age"].describe()

  salary_group = ds.groupby("Salary_Group")


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Salary_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Below 5L,25.0,28.16,4.606879,23.0,25.0,27.0,29.0,40.0
5L-10L,81.0,26.37037,4.385899,20.0,24.0,25.0,28.0,39.0
10L-15L,53.0,24.962264,3.61068,20.0,22.0,24.0,27.0,36.0
15L-20L,22.0,23.0,3.147183,19.0,21.0,22.0,24.0,33.0
20L-25L,30.0,25.833333,4.646714,19.0,22.25,25.0,28.75,36.0


In [73]:
#group age and salary 
group_sal = ds.groupby(['Age','Salary'])
print(group_sal.first())
group_sal['Age'].describe()

                          Name                    Team  Number Position  \
Age  Salary                                                               
19.0 1733040.0   Rashad Vaughn         Milwaukee Bucks    20.0       SG   
     2127840.0    Devin Booker            Phoenix Suns     1.0       SG   
20.0 525093.0   Christian Wood      Philadelphia 76ers    35.0       PF   
     1131960.0    Kevon Looney   Golden State Warriors    36.0       SF   
     1282080.0      Tyus Jones  Minnesota Timberwolves     1.0       PG   
...                        ...                     ...     ...      ...   
39.0 947726.0   Pablo Prigioni    Los Angeles Clippers     9.0       PG   
     4088019.0    Vince Carter       Memphis Grizzlies    15.0       SG   
40.0 250750.0     Andre Miller       San Antonio Spurs    24.0       PG   
     5250000.0      Tim Duncan       San Antonio Spurs    21.0        C   
     8500000.0   Kevin Garnett  Minnesota Timberwolves    21.0       PF   

               Height  W

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Age,Salary,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
19.0,1733040.0,1.0,19.0,,19.0,19.0,19.0,19.0,19.0
19.0,2127840.0,1.0,19.0,,19.0,19.0,19.0,19.0,19.0
20.0,525093.0,2.0,20.0,0.0,20.0,20.0,20.0,20.0,20.0
20.0,1131960.0,1.0,20.0,,20.0,20.0,20.0,20.0,20.0
20.0,1282080.0,1.0,20.0,,20.0,20.0,20.0,20.0,20.0
...,...,...,...,...,...,...,...,...,...
39.0,947726.0,1.0,39.0,,39.0,39.0,39.0,39.0,39.0
39.0,4088019.0,1.0,39.0,,39.0,39.0,39.0,39.0,39.0
40.0,250750.0,1.0,40.0,,40.0,40.0,40.0,40.0,40.0
40.0,5250000.0,1.0,40.0,,40.0,40.0,40.0,40.0,40.0


In [17]:
from sklearn.datasets import load_iris
iris = load_iris()
print(iris)

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [18]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [19]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [20]:
iris['data']

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [21]:
iris['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [22]:
iris['target']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])