# DataFrame I

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Pokemon CSV with its first column as the index, then collapse the
# remaining single-column DataFrame into a Series.
# squeeze('columns') is the documented replacement for the removed
# read_csv(squeeze=True) option; a bare positional True is not a documented axis value.
pokemons = pd.read_csv('datasets/pokemon.csv', index_col = [0]).squeeze('columns')
pokemons

Name
Bulbasaur          Grass, Poison
Ivysaur            Grass, Poison
Venusaur           Grass, Poison
Charmander                  Fire
Charmeleon                  Fire
                      ...       
Iron Valiant     Fairy, Fighting
Koraidon        Fighting, Dragon
Miraidon        Electric, Dragon
Walking Wake       Water, Dragon
Iron Leaves       Grass, Psychic
Name: Type, Length: 1010, dtype: object

In [3]:
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


#### Some methods and attributes are common to both Series and DataFrames.

In [4]:
# .shape is available on both objects: the Series first, then the DataFrame.
for obj in (pokemons, nba):
    print(obj.shape)

(1010,)
(592, 7)


In [5]:
# .size (total number of elements) for the Series and then the DataFrame.
for obj in (pokemons, nba):
    print(obj.size)

1010
4144


In [6]:
# .ndim (number of dimensions) for the Series and then the DataFrame.
for obj in (pokemons, nba):
    print(obj.ndim)

1
2


In [7]:
# A Series has a single 'dtype'; a DataFrame exposes 'dtypes' (plural), one per column.
# One print call with a newline separator emits the same three blocks of output.
print(pokemons.dtype, '*' * 20, nba.dtypes, sep = '\n')

object
********************
Name         object
Team         object
Position     object
Height       object
Weight      float64
College      object
Salary      float64
dtype: object


In [8]:
# Show both indexes, separated by a row of asterisks: the Series is indexed by
# name labels, the DataFrame by a default RangeIndex.
print(pokemons.index, '*' * 100, nba.index, sep = '\n')

Index(['Bulbasaur', 'Ivysaur', 'Venusaur', 'Charmander', 'Charmeleon',
       'Charizard', 'Squirtle', 'Wartortle', 'Blastoise', 'Caterpie',
       ...
       'Wo-Chien', 'Chien-Pao', 'Ting-Lu', 'Chi-Yu', 'Roaring Moon',
       'Iron Valiant', 'Koraidon', 'Miraidon', 'Walking Wake', 'Iron Leaves'],
      dtype='object', name='Name', length=1010)
****************************************************************************************************
RangeIndex(start=0, stop=592, step=1)


#### some attributes are present only in dataframes

In [9]:
# A Series has no .columns attribute, so this access raises AttributeError.
# Catch it and print the message so the cell demonstrates the error without failing.
try:
    pokemons.columns
except AttributeError as ex:
    print(ex)

'Series' object has no attribute 'columns'


In [10]:
# Only the DataFrame has .columns; pokemons.columns would raise AttributeError because pokemons is a Series.
nba.columns

Index(['Name', 'Team', 'Position', 'Height', 'Weight', 'College', 'Salary'], dtype='object')

#### some attributes are present only in series.

In [11]:
pokemons.hasnans

False

In [12]:
# hasnans is a Series-only attribute, so accessing it on the DataFrame raises AttributeError.
# Catch it and print the message so the cell demonstrates the error without failing.
try:
    nba.hasnans
except AttributeError as ex:
    print(ex)

'DataFrame' object has no attribute 'hasnans'


### Then we have some methods which work the same way on both Series and DataFrames.

In [13]:
nba.head()

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0


In [14]:
nba.tail()

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0
591,,,,,,,


In [15]:
# Chain head and tail: take the first 10 rows, then the last 5 of those (rows 5 to 9).
nba.head(10).tail(5)

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
5,Trent Forrest,Atlanta Hawks,G,6-4,210.0,Florida State,508891.0
6,AJ Griffin,Atlanta Hawks,F,6-6,220.0,Duke,3712920.0
7,Mouhamed Gueye,Atlanta Hawks,F,6-11,210.0,Washington State,1119563.0
8,De'Andre Hunter,Atlanta Hawks,F-G,6-8,221.0,Virginia,20089286.0
9,Jalen Johnson,Atlanta Hawks,F,6-8,219.0,Duke,2925360.0


In [16]:
# include='all' summarises every column: count/unique/top/freq for the object
# columns and mean/std/min/quartiles for the numeric ones.
nba.describe(include='all')

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
count,591,591,584,585,584.0,578,488.0
unique,591,30,7,20,,182,
top,Saddiq Bey,Dallas Mavericks,G,6-5,,Kentucky,
freq,1,23,229,74,,29,
mean,,,,,214.763699,,9218978.0
std,,,,,23.460612,,11319270.0
min,,,,,160.0,,508891.0
25%,,,,,198.0,,1980599.0
50%,,,,,215.0,,4018638.0
75%,,,,,230.0,,11696940.0


In [17]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      591 non-null    object 
 1   Team      591 non-null    object 
 2   Position  584 non-null    object 
 3   Height    585 non-null    object 
 4   Weight    584 non-null    float64
 5   College   578 non-null    object 
 6   Salary    488 non-null    float64
dtypes: float64(2), object(5)
memory usage: 32.5+ KB


# Selecting a column from DataFrame

In [18]:
import numpy as np
import pandas as pd

In [19]:
nba = pd.read_csv('datasets/nba.csv')

In [20]:
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [21]:
nba.Team

0           Atlanta Hawks
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

### However we cannot use the dot accessor if the column name has spaces.

    nba.Team Player      <- not valid Python, so this raises a SyntaxError
    In this case we use the square bracket notation, which takes the name of the column as a string:
    nba['Team Player']

In [22]:
nba['Team']

0           Atlanta Hawks
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

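#### One thing to notice is that when we take a column from a dataframe, we are given a view. This means that if we modify a value in the isolated column, the original dataframe is also mutated. (This describes pandas without Copy-on-Write; with Copy-on-Write enabled, which is the default from pandas 3.0, the isolated column behaves like a copy and the dataframe is left untouched.)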

In [23]:
team = nba['Team']
team

0           Atlanta Hawks
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

In [24]:
# Deliberate demonstration of the pitfall: team came from nba['Team'], so writing
# to it makes pandas emit the "value is trying to be set on a copy of a slice"
# warning shown below, and the change also shows up in nba in the next cell.
team[0] = 'Whatever'
team  # the first value of team is now 'Whatever'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  team[0] = 'Whatever'


0                Whatever
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

    The above operation would also change the nba dataframe as the piece which we got in return was a view connected to the dataframe.

In [25]:
nba # Take a look at the first value of the Team column. It has been modified mistakenly.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Whatever,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


### The solution to this problem is that we create a copy of the column whenever we try to isolate a column from a dataframe.

In [26]:
teams = nba['Team'].copy()

In [27]:
teams

0                Whatever
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

### Now any modification which we do on this series would not be reflected in the original dataframe.

In [28]:
# teams was created with .copy(), so this assignment only changes teams and not nba.
teams[0] = 'Atlanta Hawks'
teams

0           Atlanta Hawks
1           Atlanta Hawks
2           Atlanta Hawks
3           Atlanta Hawks
4           Atlanta Hawks
              ...        
587    Washington Wizards
588    Washington Wizards
589    Washington Wizards
590    Washington Wizards
591                   NaN
Name: Team, Length: 592, dtype: object

In [29]:
nba # This time the Team column was not modified by the assignment to teams, because we created a copy while isolating the column (row 0 still shows 'Whatever' from the earlier cell).

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Whatever,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


# Selecting multiple columns from a dataframe
    There is a slight difference between when we select multiple columns from a dataframe using fancy indexing and when we
    select a single column from a dataframe.
    
    When we select multiple columns from a dataframe we get a copy and not a view and when we select a single column from a 
    dataframe then we get a view.

In [30]:
import numpy as np
import pandas as pd

In [31]:
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [32]:
nba[['Team', 'College']]

Unnamed: 0,Team,College
0,Atlanta Hawks,Villanova
1,Atlanta Hawks,Fenerbahce
2,Atlanta Hawks,Michigan
3,Atlanta Hawks,Elan Chalon
4,Atlanta Hawks,Maryland
...,...,...
587,Washington Wizards,Toledo
588,Washington Wizards,Wichita State
589,Washington Wizards,Real Madrid
590,Washington Wizards,Utah


    Note that the order in which we pass the column names inside the list also matters, as pandas will return the columns in 
    exactly that order.

In [33]:
nba[['College', 'Team']]

Unnamed: 0,College,Team
0,Villanova,Atlanta Hawks
1,Fenerbahce,Atlanta Hawks
2,Michigan,Atlanta Hawks
3,Elan Chalon,Atlanta Hawks
4,Maryland,Atlanta Hawks
...,...,...
587,Toledo,Washington Wizards
588,Wichita State,Washington Wizards
589,Real Madrid,Washington Wizards
590,Utah,Washington Wizards


# Adding new columns

In [34]:
import numpy as np
import pandas as pd

In [35]:
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [36]:
# Assigning to a new column label appends the column at the end of the dataframe.
# The multiplication is applied element-wise; rows with a missing Salary stay NaN.
nba['Salary Doubled'] = nba['Salary'] * 2
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary,Salary Doubled
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0,9113966.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0,37400000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0,8188488.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0,41232000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0,5163044.0
...,...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0,3439728.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0,20500000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0,16390244.0


In [37]:
# Assigning a single scalar fills the new column with that value in every row.
nba['Sport'] = 'Basketball'
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary,Salary Doubled,Sport
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0,9113966.0,Basketball
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0,37400000.0,Basketball
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0,8188488.0,Basketball
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0,41232000.0,Basketball
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0,5163044.0,Basketball
...,...,...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0,3439728.0,Basketball
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0,20500000.0,Basketball
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,,,Basketball
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0,16390244.0,Basketball


In [38]:
# Insert a constant 'Color' column at position 1 (right after 'Name').
# DataFrame.insert mutates nba in place and raises ValueError if the column
# already exists, so guard the call to keep this cell safe to re-run.
if 'Color' not in nba.columns:
    nba.insert(loc = 1, column = 'Color', value = 'Yellow')
nba

Unnamed: 0,Name,Color,Team,Position,Height,Weight,College,Salary,Salary Doubled,Sport
0,Saddiq Bey,Yellow,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0,9113966.0,Basketball
1,Bogdan Bogdanovic,Yellow,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0,37400000.0,Basketball
2,Kobe Bufkin,Yellow,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0,8188488.0,Basketball
3,Clint Capela,Yellow,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0,41232000.0,Basketball
4,Bruno Fernando,Yellow,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0,5163044.0,Basketball
...,...,...,...,...,...,...,...,...,...,...
587,Ryan Rollins,Yellow,Washington Wizards,G,6-3,180.0,Toledo,1719864.0,3439728.0,Basketball
588,Landry Shamet,Yellow,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0,20500000.0,Basketball
589,Tristan Vukcevic,Yellow,Washington Wizards,F,6-10,220.0,Real Madrid,,,Basketball
590,Delon Wright,Yellow,Washington Wizards,G,6-5,185.0,Utah,8195122.0,16390244.0,Basketball


# Example 2

In [39]:
pokemon = pd.read_csv('datasets/pokemon.csv')
pokemon

Unnamed: 0,Name,Type
0,Bulbasaur,"Grass, Poison"
1,Ivysaur,"Grass, Poison"
2,Venusaur,"Grass, Poison"
3,Charmander,Fire
4,Charmeleon,Fire
...,...,...
1005,Iron Valiant,"Fairy, Fighting"
1006,Koraidon,"Fighting, Dragon"
1007,Miraidon,"Electric, Dragon"
1008,Walking Wake,"Water, Dragon"


In [40]:
# Length of every name, computed with the vectorised .str accessor instead of a
# Python-level lambda. Unlike len(), .str.len() yields NaN for a missing name
# rather than raising a TypeError.
pokemon['name_length'] = pokemon['Name'].str.len()
pokemon

Unnamed: 0,Name,Type,name_length
0,Bulbasaur,"Grass, Poison",9
1,Ivysaur,"Grass, Poison",7
2,Venusaur,"Grass, Poison",8
3,Charmander,Fire,10
4,Charmeleon,Fire,10
...,...,...,...
1005,Iron Valiant,"Fairy, Fighting",12
1006,Koraidon,"Fighting, Dragon",8
1007,Miraidon,"Electric, Dragon",8
1008,Walking Wake,"Water, Dragon",12


# value_counts() method for dataframes
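    Below, value_counts() is used on a single column (a Series). DataFrames also have a value_counts() method, but it counts unique rows (combinations of column values) rather than the values of one column.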

In [41]:
import numpy as np
import pandas as pd

In [42]:
pokemon = pd.read_csv('datasets/pokemon.csv')
pokemon

Unnamed: 0,Name,Type
0,Bulbasaur,"Grass, Poison"
1,Ivysaur,"Grass, Poison"
2,Venusaur,"Grass, Poison"
3,Charmander,Fire
4,Charmeleon,Fire
...,...,...
1005,Iron Valiant,"Fairy, Fighting"
1006,Koraidon,"Fighting, Dragon"
1007,Miraidon,"Electric, Dragon"
1008,Walking Wake,"Water, Dragon"


In [43]:
names = pokemon['Name'].copy()
names

0          Bulbasaur
1            Ivysaur
2           Venusaur
3         Charmander
4         Charmeleon
            ...     
1005    Iron Valiant
1006        Koraidon
1007        Miraidon
1008    Walking Wake
1009     Iron Leaves
Name: Name, Length: 1010, dtype: object

In [44]:
names.value_counts()

Name
Bulbasaur      1
Honedge        1
Vivillon       1
Litleo         1
Pyroar         1
              ..
Crawdaunt      1
Baltoy         1
Claydol        1
Lileep         1
Iron Leaves    1
Name: count, Length: 1010, dtype: int64

    1. By default, the value_counts() method does not include the NaN values.
    
    2. But we can also include the frequency of NaN values in the count by setting the dropna parameter to False.
    
    3. The default value of dropna is set to be True.

In [45]:
names.value_counts(dropna = False) # This series did not have any NaN values.

Name
Bulbasaur      1
Honedge        1
Vivillon       1
Litleo         1
Pyroar         1
              ..
Crawdaunt      1
Baltoy         1
Claydol        1
Lileep         1
Iron Leaves    1
Name: count, Length: 1010, dtype: int64

In [46]:
# Read only the 'Position' column and collapse the resulting one-column DataFrame
# into a Series. squeeze('columns') is the documented replacement for the removed
# read_csv(squeeze=True) option; a bare positional True is not a documented axis value.
positions = pd.read_csv('datasets/nba.csv', usecols = ['Position']).squeeze('columns')
positions

0        F
1        G
2        G
3        C
4      F-C
      ... 
587      G
588      G
589      F
590      G
591    NaN
Name: Position, Length: 592, dtype: object

In [47]:
positions.value_counts() # This does not count the frequency of NaN values

Position
G      229
F      187
C       47
G-F     46
F-C     37
C-F     23
F-G     15
Name: count, dtype: int64

In [48]:
positions.value_counts(dropna = False)
# When we set dropna to False, the frequency of NaN values is also counted.

Position
G      229
F      187
C       47
G-F     46
F-C     37
C-F     23
F-G     15
NaN      8
Name: count, dtype: int64

# dropna() method
    1. The dropna() method is used to remove the null values.
    
    2. By default it removes all the rows which have even a single NaN value.
    
    3. However, we can change this behaviour by using the 'how' parameter.
    
    4. If we set how = 'all', then only those rows in which every value is NaN will be removed.
    
    5. By default, the value of how is set to 'any'.
    
    6. The dropna() method by default returns us a completely new dataframe. If we want it to modify the original dataframe
    then we can store it in the same variable on which we performed dropna() on. 

In [49]:
import numpy as np
import pandas as pd

In [50]:
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [51]:
# With no arguments, dropna() returns a new dataframe in which every row that has even a single NaN value is removed.
# how = 'any' is the default, so both calls give the same result; only the last expression is displayed.
nba.dropna()
nba.dropna(how = 'any')

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
585,Eugene Omoruyi,Washington Wizards,F,6-6,235.0,Oregon,559782.0
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0


In [52]:
nba.dropna(how = 'all') # This will remove only those rows which has all the values as NaN.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,


    We also have a subset parameter which takes a column label or a list of column labels. By default, this will remove only
    those rows which have a NaN value in ANY of those columns.

In [53]:
nba.dropna(subset = 'College') # This will remove all the rows where a NaN value is found in the College column.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,


In [54]:
nba.dropna(subset = ['College', 'Salary']) 
# This will remove all the rows where there is a NaN value in either 'College' or 'Salary' column. 

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
585,Eugene Omoruyi,Washington Wizards,F,6-6,235.0,Oregon,559782.0
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0


In [55]:
nba.dropna(subset = ['College', 'Salary'], how = 'all')

# This will remove all the rows where the value of both 'College' and 'Salary' is NaN.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,


In [56]:
nba.dropna() is nba

False

In [57]:
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [58]:
# Drop the rows in which every column is NaN and store the result back in the
# same variable, instead of mutating the frame with inplace=True. Rebinding
# keeps the operation explicit and chainable.
nba = nba.dropna(how = 'all')

In [59]:
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
586,Jordan Poole,Washington Wizards,G,6-4,194.0,Michigan,27955357.0
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,


# fillna() method

    1. The fillna() is used to replace the NaN values in a dataframe with a given value. This gives us a new dataframe.
    
    2. If we want to mutate the original dataframe, then we set the inplace parameter to True.

In [60]:
import numpy as np
import pandas as pd

In [61]:
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [62]:
nba.fillna(value = 0) # This will fill all the NaN values in the dataframe with 0 and give us a new dataframe; nba itself is unchanged.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,0.0
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


    We can also isolate a particular column and replace NaN values in that column with a value of our own.

In [63]:
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [64]:
nba['College'] = nba['College'].fillna(value = 'unknown')

In [65]:
nba # Now all the NaN values in the College column has been replaced with 'unknown'.

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [66]:
# Assign the filled column back to the dataframe.
# Calling fillna(inplace=True) on nba['Salary'] is chained assignment: it operates
# on the Series returned by the selection, which pandas 2.x warns about and which
# leaves nba unchanged once Copy-on-Write is enabled (the default from pandas 3.0).
nba['Salary'] = nba['Salary'].fillna(value = 0)

In [67]:
nba # All the NaN values in the 'Salary' column have been replaced with 0

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,0.0
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


# Confusions

    1. hasnans attribute tells us if a column has null values or not.
    
    2. is_unique attribute tells us if a column has unique values or not (null values are also considered a value).
    
    3. count() method tells us the number of non-null values in a series or column.
    
    4. size attribute tells us the length of the column.
    
    5. unique() method gives us an array of the unique values (including null values).
    
    6. nunique() method gives us the total number of unique values (excluding null values), but we can include null values 
       in the answer by setting dropna = False.

In [68]:
import numpy as np
import pandas as pd

In [69]:
# Reload the CSV to start from the raw data again (Salary has its NaN values back).
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [70]:
# Selecting a single column already returns a Series, so no squeeze() is needed:
# squeeze() only collapses length-1 axes and leaves a multi-row Series unchanged.
positions = nba['Position']
positions

0        F
1        G
2        G
3        C
4      F-C
      ... 
587      G
588      G
589      F
590      G
591    NaN
Name: Position, Length: 592, dtype: object

In [71]:
# size counts every row, count() only the non-null ones.
positions.size, positions.count() # 592 - 584 = 8, so we have 8 null values.

(592, 584)

In [72]:
# dropna = False adds a NaN row to the tally, confirming the 8 missing positions.
positions.value_counts(dropna = False)

Position
G      229
F      187
C       47
G-F     46
F-C     37
C-F     23
F-G     15
NaN      8
Name: count, dtype: int64

In [73]:
positions.hasnans # True, as the column has null values

True

In [74]:
# unique() returns a NumPy array of the distinct values; nan appears as one of them.
positions.unique()

array(['F', 'G', 'C', 'F-C', 'F-G', nan, 'G-F', 'C-F'], dtype=object)

In [75]:
# With dropna = False, NaN is counted too: 7 positions + NaN = 8.
positions.nunique(dropna = False)

8

# astype() method

    1. The astype() method is used to change the data type of a column. This is done in order to reduce the size of the 
    dataframe.
    
    2. This method returns us a new series which has a changed data type.
    
    3. Before using the astype method to convert a numeric column to an integer type, we need to make sure that the 
    column does not have any null values (NaN cannot be stored in a NumPy integer dtype).

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load a fresh copy of the dataset for the astype() examples.
nba = pd.read_csv('datasets/nba.csv')
nba

Unnamed: 0,Name,Team,Position,Height,Weight,College,Salary
0,Saddiq Bey,Atlanta Hawks,F,6-7,215.0,Villanova,4556983.0
1,Bogdan Bogdanovic,Atlanta Hawks,G,6-5,225.0,Fenerbahce,18700000.0
2,Kobe Bufkin,Atlanta Hawks,G,6-5,195.0,Michigan,4094244.0
3,Clint Capela,Atlanta Hawks,C,6-10,256.0,Elan Chalon,20616000.0
4,Bruno Fernando,Atlanta Hawks,F-C,6-10,240.0,Maryland,2581522.0
...,...,...,...,...,...,...,...
587,Ryan Rollins,Washington Wizards,G,6-3,180.0,Toledo,1719864.0
588,Landry Shamet,Washington Wizards,G,6-4,190.0,Wichita State,10250000.0
589,Tristan Vukcevic,Washington Wizards,F,6-10,220.0,Real Madrid,
590,Delon Wright,Washington Wizards,G,6-5,185.0,Utah,8195122.0


In [4]:
# info() lists the non-null count and dtype per column; Salary has only 488 of 592 values.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      591 non-null    object 
 1   Team      591 non-null    object 
 2   Position  584 non-null    object 
 3   Height    585 non-null    object 
 4   Weight    584 non-null    float64
 5   College   578 non-null    object 
 6   Salary    488 non-null    float64
dtypes: float64(2), object(5)
memory usage: 32.5+ KB


In [11]:
from pandas.errors import IntCastingNaNError

# Salary still contains NaN at this point, so the integer cast is expected to fail.
# Catch that specific error and print only its message instead of a full traceback.
try:
    nba['Salary'].astype('int')
except IntCastingNaNError as cast_error:
    print(cast_error)

Cannot convert non-finite values (NA or inf) to integer


    The reason why the above error occurred is that the Salary column had null values in it.
    
    So in order to apply the astype() method, we first need to either remove the null values or replace them.
    
    We choose to replace them since we don't want to reduce the size of the column.

In [14]:
# fillna() returns a new Series, so assign it back to overwrite the column.
nba['Salary'] = nba['Salary'].fillna(value = 0)
nba['Salary']

0       4556983.0
1      18700000.0
2       4094244.0
3      20616000.0
4       2581522.0
          ...    
587     1719864.0
588    10250000.0
589           0.0
590     8195122.0
591           0.0
Name: Salary, Length: 592, dtype: float64

In [17]:
# astype() returns a new Series rather than modifying the column, so the result
# must be assigned back. (A bare nba['Salary'].astype('int') call is discarded.)
nba['Salary'] = nba['Salary'].astype(int)

In [18]:
# The decimals are gone: Salary is now an integer column (int32 in this run).
nba['Salary']

0       4556983
1      18700000
2       4094244
3      20616000
4       2581522
         ...   
587     1719864
588    10250000
589           0
590     8195122
591           0
Name: Salary, Length: 592, dtype: int32

In [19]:
# Salary is now int32 with 592 non-null values; memory usage fell from 32.5+ KB to 30.2+ KB.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      591 non-null    object 
 1   Team      591 non-null    object 
 2   Position  584 non-null    object 
 3   Height    585 non-null    object 
 4   Weight    584 non-null    float64
 5   College   578 non-null    object 
 6   Salary    592 non-null    int32  
dtypes: float64(1), int32(1), object(5)
memory usage: 30.2+ KB


    Similarly, we can change the data type of the Weight column.

In [20]:
# Weight is float64 and, as the last row shows, it contains NaN.
nba['Weight']

0      215.0
1      225.0
2      195.0
3      256.0
4      240.0
       ...  
587    180.0
588    190.0
589    220.0
590    185.0
591      NaN
Name: Weight, Length: 592, dtype: float64

In [21]:
nba['Weight'].hasnans # since it has null values, we first need to replace (or drop) them and then change the type.

True

In [22]:
# Fill the missing weights with 0 and cast to int in one chained expression.
# Note: 0 is only a placeholder for "unknown" and would distort statistics such as the mean.
nba['Weight'] = nba['Weight'].fillna(value = 0).astype(int)
nba['Weight']

0      215
1      225
2      195
3      256
4      240
      ... 
587    180
588    190
589    220
590    185
591      0
Name: Weight, Length: 592, dtype: int32

In [23]:
# Weight and Salary are both int32 now; memory usage is down to 27.9+ KB.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      591 non-null    object
 1   Team      591 non-null    object
 2   Position  584 non-null    object
 3   Height    585 non-null    object
 4   Weight    592 non-null    int32 
 5   College   578 non-null    object
 6   Salary    592 non-null    int32 
dtypes: int32(2), object(5)
memory usage: 27.9+ KB


# changing the type to category
    
    Sometimes the number of unique values in a column is very small compared to the total number of values in that column.
    In this case we can change the data type to category to save memory.

In [25]:
# Only 7 distinct values occur in Position, which makes it a good fit for category.
nba['Position'].value_counts()

Position
G      229
F      187
C       47
G-F     46
F-C     37
C-F     23
F-G     15
Name: count, dtype: int64

In [27]:
# Convert to category: the 7 distinct labels become the categories; missing values stay NaN.
nba['Position'] = nba['Position'].astype('category')
nba['Position']

0        F
1        G
2        G
3        C
4      F-C
      ... 
587      G
588      G
589      F
590      G
591    NaN
Name: Position, Length: 592, dtype: category
Categories (7, object): ['C', 'C-F', 'F', 'F-C', 'F-G', 'G', 'G-F']

In [28]:
# Position is now category; memory usage drops from 27.9+ KB to 24.2+ KB.
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 592 entries, 0 to 591
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      591 non-null    object  
 1   Team      591 non-null    object  
 2   Position  584 non-null    category
 3   Height    585 non-null    object  
 4   Weight    592 non-null    int32   
 5   College   578 non-null    object  
 6   Salary    592 non-null    int32   
dtypes: category(1), int32(2), object(4)
memory usage: 24.2+ KB
