# Intro To DataFrames

In [1]:
import pandas as pd

In [2]:
# if a column contains a NULL, pandas converts all of its values to float, even if they are ints in the csv (Number, Age, etc...)
nba = pd.read_csv("nba.csv")
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0


# Shared Methods and Attributes Between Series and DataFrames

In [3]:
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [4]:
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [5]:
nba.index

RangeIndex(start=0, stop=458, step=1)

In [6]:
nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [7]:
# returns tuple with number of rows and number of columns
nba.shape

(458, 9)

In [8]:
# get data types of each column in dataframe
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

# New DataFrame Attributes and Methods

In [9]:
# list of column names in dataframe
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [10]:
# bundles .index and .columns into single output
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [11]:
# provide summary of dataframe
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     373 non-null object
Salary      446 non-null float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [12]:
# get count of columns per data type in dataframe.
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0,
# so tally the values of .dtypes instead. casting the dtypes to str and sorting the
# index keeps the result ordered alphabetically by dtype name
nba.dtypes.astype(str).value_counts().sort_index()

float64    4
object     5
dtype: int64

# Differences Between Shared Methods

In [13]:
rev = pd.read_csv("revenue.csv", index_col = "Date")
rev.head()

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253


In [14]:
s = pd.Series([1, 2, 3])
s.sum()

6

In [15]:
# .sum() on dataframe returns Series with column names as index labels and sum of column values as Series values
rev.sum()

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [16]:
# use axis parameter to sum values horizontally across rows. both ways below will work. NOT available on Series
rev.sum(axis=1)
rev.sum(axis="columns")

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

# Select One Column From DataFrame

In [17]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [18]:
# return a Series of values for specified column. DOES NOT WORK if column name contains spaces
nba.Name

0                Avery Bradley
1                  Jae Crowder
2                 John Holland
3                  R.J. Hunter
4                Jonas Jerebko
5                 Amir Johnson
6                Jordan Mickey
7                 Kelly Olynyk
8                 Terry Rozier
9                 Marcus Smart
10             Jared Sullinger
11               Isaiah Thomas
12                 Evan Turner
13                 James Young
14                Tyler Zeller
15            Bojan Bogdanovic
16                Markel Brown
17             Wayne Ellington
18     Rondae Hollis-Jefferson
19                Jarrett Jack
20              Sergey Karasev
21             Sean Kilpatrick
22                Shane Larkin
23                 Brook Lopez
24            Chris McCullough
25                 Willie Reed
26             Thomas Robinson
27                  Henry Sims
28                Donald Sloan
29              Thaddeus Young
                ...           
428            Al-Farouq Aminu
429     

In [19]:
# getting column values with brackets WORKS if column name has spaces
nba["Name"]

0                Avery Bradley
1                  Jae Crowder
2                 John Holland
3                  R.J. Hunter
4                Jonas Jerebko
5                 Amir Johnson
6                Jordan Mickey
7                 Kelly Olynyk
8                 Terry Rozier
9                 Marcus Smart
10             Jared Sullinger
11               Isaiah Thomas
12                 Evan Turner
13                 James Young
14                Tyler Zeller
15            Bojan Bogdanovic
16                Markel Brown
17             Wayne Ellington
18     Rondae Hollis-Jefferson
19                Jarrett Jack
20              Sergey Karasev
21             Sean Kilpatrick
22                Shane Larkin
23                 Brook Lopez
24            Chris McCullough
25                 Willie Reed
26             Thomas Robinson
27                  Henry Sims
28                Donald Sloan
29              Thaddeus Young
                ...           
428            Al-Farouq Aminu
429     

In [20]:
# returns Series
type(nba["Salary"])

pandas.core.series.Series

# Select Two or More Columns from A DataFrame

In [21]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [22]:
# return multiple columns by passing a list to brackets. order of columns in list determines order of columns of returned dataframe
nba[ ["Position", "Name", "Team" ] ]

Unnamed: 0,Position,Name,Team
0,PG,Avery Bradley,Boston Celtics
1,SF,Jae Crowder,Boston Celtics
2,SG,John Holland,Boston Celtics
3,SG,R.J. Hunter,Boston Celtics
4,PF,Jonas Jerebko,Boston Celtics
5,PF,Amir Johnson,Boston Celtics
6,PF,Jordan Mickey,Boston Celtics
7,C,Kelly Olynyk,Boston Celtics
8,PG,Terry Rozier,Boston Celtics
9,PG,Marcus Smart,Boston Celtics


In [23]:
# can assign list of columns to a variable and pass variable to brackets to select columns. looks nicer
select = ["Team", "Name", "Salary"]
nba[select].head()

Unnamed: 0,Team,Name,Salary
0,Boston Celtics,Avery Bradley,7730337.0
1,Boston Celtics,Jae Crowder,6796117.0
2,Boston Celtics,John Holland,
3,Boston Celtics,R.J. Hunter,1148640.0
4,Boston Celtics,Jonas Jerebko,5000000.0


# Add New Column to DataFrame

In [24]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [25]:
# selecting a column that does not exist raises a KeyError.
# the error is the point of this cell, so catch it and print it instead of
# letting it stop a top-to-bottom run of the notebook
try:
    nba["Sport"]
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Sport'

In [27]:
# add new column to dataframe with '='. if column in brackets already exists VALUES WILL BE OVERWRITTEN.
# "Basketball" is a scalar value, all values in column will be "Basketball"
nba["Sport"] = "Basketball"
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball


In [28]:
nba["League"] = "National BAsketball Association"
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National BAsketball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National BAsketball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National BAsketball Association


In [29]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [30]:
# use insert to add columns to dataframe. first argument is where to insert new column into dataframe
nba.insert(3, column = "Sport", value = "Basketball")

In [31]:
nba.head(3)

Unnamed: 0,Name,Team,Number,Sport,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,Basketball,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,Basketball,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,Basketball,SG,27.0,6-5,205.0,Boston University,


# Broadcasting Operations

In [32]:
# broadcasting operations are applied to each individual value of the dataframe (apply, map, etc...)
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [33]:
# add 5 to each value in the Age column. NULL values are skipped
nba["Age"].add(5)
nba["Age"] + 5

0      30.0
1      30.0
2      32.0
3      27.0
4      34.0
5      34.0
6      26.0
7      30.0
8      27.0
9      27.0
10     29.0
11     32.0
12     32.0
13     25.0
14     31.0
15     32.0
16     29.0
17     33.0
18     26.0
19     37.0
20     27.0
21     31.0
22     28.0
23     33.0
24     26.0
25     31.0
26     30.0
27     31.0
28     33.0
29     32.0
       ... 
428    30.0
429    28.0
430    29.0
431    32.0
432    28.0
433    33.0
434    39.0
435    29.0
436    30.0
437    29.0
438    28.0
439    31.0
440    35.0
441    25.0
442    33.0
443    28.0
444    29.0
445    25.0
446    29.0
447    28.0
448    31.0
449    28.0
450    33.0
451    31.0
452    25.0
453    31.0
454    29.0
455    31.0
456    31.0
457     NaN
Name: Age, Length: 458, dtype: float64

In [34]:
# subtract 5,000,000 from each value in the Salary column
nba["Salary"].sub(5000000)
nba["Salary"] - 5000000

0       2730337.0
1       1796117.0
2             NaN
3      -3851360.0
4             0.0
5       7000000.0
6      -3829040.0
7      -2834840.0
8      -3175640.0
9      -1568960.0
10     -2430740.0
11      1912869.0
12     -1574490.0
13     -3250160.0
14     -2383025.0
15     -1574490.0
16     -4154941.0
17     -3500000.0
18     -3664520.0
19      1300000.0
20     -3400160.0
21     -4865785.0
22     -3500000.0
23     14689000.0
24     -3859760.0
25     -4052724.0
26     -4018652.0
27     -4052724.0
28     -4052724.0
29      6235955.0
          ...    
428     3042895.0
429    -4374907.0
430    -4052724.0
431     1980802.0
432    -2105941.0
433     1000000.0
434       16000.0
435    -1924120.0
436     -763713.0
437    -2474840.0
438    -4474907.0
439    -3584480.0
440    -2145060.0
441    -2362280.0
442     -225000.0
443    -2341760.0
444     4463484.0
445    -1222280.0
446     7000000.0
447    -3824120.0
448    10409570.0
449    -3651560.0
450    -2950000.0
451    -4018652.0
452    -27

In [26]:
# convert player weights from pounds to kg
nba["Weight"].mul(.453592)

0       81.646560
1      106.594120
2       92.986360
3       83.914520
4      104.779752
5      108.862080
6      106.594120
7      107.954896
8       86.182480
9       99.790240
10     117.933920
11      83.914520
12      99.790240
13      97.522280
14     114.758776
15      97.975872
16      86.182480
17      90.718400
18      99.790240
19      90.718400
20      94.347136
21      99.336648
22      79.378600
23     124.737800
24      90.718400
25      99.790240
26     107.501304
27     112.490816
28      92.986360
29     100.243832
          ...    
428     97.522280
429     93.439952
430     95.254320
431    108.862080
432     97.522280
433     97.522280
434    120.201880
435    111.130040
436     88.450440
437     90.718400
438     83.914520
439    106.594120
440     78.471416
441    108.862080
442    103.418976
443     86.636072
444     97.068688
445     86.182480
446    120.201880
447    111.130040
448    102.511792
449     93.439952
450    102.511792
451     93.439952
452    106

In [41]:
# create new column in dataframe using broadcast operation
nba["Weight in Kilograms"] = nba["Weight"] * .453592
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752


In [49]:
nba["Salary"].div(1000000)
nba["Salary in Millions"] = nba["Salary"] / 1000000
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Weight in Kilograms,Salary in Millions
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,81.64656,7.730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,106.59412,6.796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,92.98636,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,83.91452,1.14864
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,104.779752,5.0


# A Review of the .value_counts() Method

In [51]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [53]:
# .value_counts() ONLY works on Series
nba["Team"].value_counts()

New Orleans Pelicans      19
Memphis Grizzlies         18
New York Knicks           16
Milwaukee Bucks           16
Detroit Pistons           15
Portland Trail Blazers    15
Oklahoma City Thunder     15
Atlanta Hawks             15
Charlotte Hornets         15
Miami Heat                15
Los Angeles Lakers        15
Philadelphia 76ers        15
Dallas Mavericks          15
Washington Wizards        15
Golden State Warriors     15
Brooklyn Nets             15
Phoenix Suns              15
Houston Rockets           15
San Antonio Spurs         15
Boston Celtics            15
Los Angeles Clippers      15
Cleveland Cavaliers       15
Utah Jazz                 15
Sacramento Kings          15
Denver Nuggets            15
Indiana Pacers            15
Chicago Bulls             15
Toronto Raptors           15
Orlando Magic             14
Minnesota Timberwolves    14
Name: Team, dtype: int64

In [55]:
nba["Position"].value_counts().head(1)

SG    102
Name: Position, dtype: int64

In [58]:
nba["Weight"].value_counts().tail()

278.0    1
227.0    1
239.0    1
289.0    1
307.0    1
Name: Weight, dtype: int64

In [59]:
nba["Salary"].value_counts()

947276.0      31
845059.0      18
525093.0      13
981348.0       6
1100602.0      5
16407500.0     5
5000000.0      5
12000000.0     5
8000000.0      5
4000000.0      5
3000000.0      4
7000000.0      4
2814000.0      4
1000000.0      4
19689000.0     4
200600.0       3
8500000.0      3
2500000.0      3
1015421.0      3
2854940.0      3
13500000.0     3
5543725.0      3
2288205.0      2
1270964.0      2
2900000.0      2
1007026.0      2
111444.0       2
13000000.0     2
1500000.0      2
1842000.0      2
              ..
2239800.0      1
1474440.0      1
19688000.0     1
7900000.0      1
2008748.0      1
13800000.0     1
2841960.0      1
1404600.0      1
1584480.0      1
273038.0       1
9213483.0      1
3272091.0      1
3075880.0      1
2250000.0      1
4626960.0      1
1304520.0      1
12100000.0     1
7500000.0      1
295327.0       1
2836186.0      1
6486486.0      1
5016000.0      1
3333333.0      1
1824360.0      1
8042895.0      1
1242720.0      1
2489530.0      1
5103120.0     

# Drop Rows with Null Values

In [63]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [64]:
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [67]:
# .dropna() removes all rows from dataframe that contain a Null value
nba.dropna()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0
10,Jared Sullinger,Boston Celtics,7.0,C,24.0,6-9,260.0,Ohio State,2569260.0
11,Isaiah Thomas,Boston Celtics,4.0,PG,27.0,5-9,185.0,Washington,6912869.0
12,Evan Turner,Boston Celtics,11.0,SG,27.0,6-7,220.0,Ohio State,3425510.0


In [73]:
# use how parameter to only remove rows where all columns are Null
nba.dropna(how="all", inplace=True)

In [74]:
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


In [76]:
# use axis parameter to remove columns that contain Null values instead of rows
nba.dropna(axis=1)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0


In [78]:
# use subset parameter to drop rows if they have a Null value in the specified column
nba.dropna(subset=["Salary"])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
5,Amir Johnson,Boston Celtics,90.0,PF,29.0,6-9,240.0,,12000000.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0
10,Jared Sullinger,Boston Celtics,7.0,C,24.0,6-9,260.0,Ohio State,2569260.0


# Fill in Null Values with the .fillna() Method

In [79]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [83]:
# replace EVERY Null value in dataframe with specified value 
nba.fillna(0).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,0,5000000.0


In [87]:
# call .fillna() on a single column and assign the result back to alter only that column.
# nba["Salary"].fillna(0, inplace=True) is chained assignment: it fills a temporary Series,
# warns in pandas 2.x, and leaves nba unchanged once copy-on-write is enabled
nba["Salary"] = nba["Salary"].fillna(0)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [89]:
# assign the filled column back instead of using inplace=True on the selected Series;
# the chained in-place form warns in pandas 2.x and has no effect under copy-on-write
nba["College"] = nba["College"].fillna("No College")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0


# The .astype() Method

In [93]:
# converting to int with .astype() requires the Series to NOT have any Null values,
# so drop the all-Null row and fill the remaining Nulls first.
# passing a dict to .fillna() sets a separate fill value per column and avoids the
# chained nba[col].fillna(..., inplace=True) form, which stops updating nba under copy-on-write
nba = (
    pd.read_csv("nba.csv")
    .dropna(how="all")
    .fillna({"Salary": 0, "College": "No College"})
)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,No College,5000000.0


In [96]:
nba.dtypes
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null float64
Position    457 non-null object
Age         457 non-null float64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null float64
dtypes: float64(4), object(5)
memory usage: 35.7+ KB


In [100]:
# use .astype() to convert column values to different types
# .astype() DOES NOT have the inplace parameter
nba["Salary"] = nba["Salary"].astype("int")

In [101]:
# salary is now type int
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary        int64
dtype: object

In [102]:
nba["Number"] = nba["Number"].astype("int")
nba["Age"] = nba["Age"].astype("int")

In [103]:
nba.dtypes

Name         object
Team         object
Number        int64
Position     object
Age           int64
Height       object
Weight      float64
College      object
Salary        int64
dtype: object

In [105]:
# use .nunique() method to get number of unique items in a column/Series
nba["Position"].nunique()

5

In [110]:
# use category type when you have a small number of unique items in a dataframe. helps reduce memory usage
nba["Position"] = nba["Position"].astype("category")

In [107]:
# converting Position to category lowered memory usage from 35.7 KB to 32.8 KB
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null object
Number      457 non-null int64
Position    457 non-null category
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: category(1), float64(1), int64(3), object(4)
memory usage: 32.8+ KB


In [108]:
nba["Team"] = nba["Team"].astype("category")

In [109]:
# converting Team to category lowered memory usage from 32.8 KB to 31.1 KB
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457 entries, 0 to 456
Data columns (total 9 columns):
Name        457 non-null object
Team        457 non-null category
Number      457 non-null int64
Position    457 non-null category
Age         457 non-null int64
Height      457 non-null object
Weight      457 non-null float64
College     457 non-null object
Salary      457 non-null int64
dtypes: category(2), float64(1), int64(3), object(3)
memory usage: 31.1+ KB


# Sort a DataFrame with the .sort_values() Method, Part 1 

In [111]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [118]:
# must tell Pandas what column you want to sort dataframe by
nba.sort_values("Name")

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
356,Aaron Gordon,Orlando Magic,0.0,PF,20.0,6-9,220.0,Arizona,4171680.0
328,Aaron Harrison,Charlotte Hornets,9.0,SG,21.0,6-6,210.0,Kentucky,525093.0
404,Adreian Payne,Minnesota Timberwolves,33.0,PF,25.0,6-10,237.0,Michigan State,1938840.0
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
330,Al Jefferson,Charlotte Hornets,25.0,C,31.0,6-10,289.0,,13500000.0
428,Al-Farouq Aminu,Portland Trail Blazers,8.0,SF,25.0,6-9,215.0,Wake Forest,8042895.0
368,Alan Anderson,Washington Wizards,6.0,SG,33.0,6-6,220.0,Michigan State,4000000.0
135,Alan Williams,Phoenix Suns,15.0,C,23.0,6-8,260.0,UC Santa Barbara,83397.0
444,Alec Burks,Utah Jazz,10.0,SG,24.0,6-6,214.0,Colorado,9463484.0


In [116]:
nba.sort_values("Age", ascending=False).head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,Utah,250750.0
400,Kevin Garnett,Minnesota Timberwolves,21.0,PF,40.0,6-11,240.0,,8500000.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,Wake Forest,5250000.0
261,Vince Carter,Memphis Grizzlies,15.0,SG,39.0,6-6,220.0,North Carolina,4088019.0
102,Pablo Prigioni,Los Angeles Clippers,9.0,PG,39.0,6-3,185.0,,947726.0


In [117]:
nba.sort_values("Salary", inplace=True)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
32,Thanasis Antetokounmpo,New York Knicks,43.0,SF,23.0,6-7,205.0,,30888.0
291,Orlando Johnson,New Orleans Pelicans,0.0,SG,27.0,6-5,220.0,UC Santa Barbara,55722.0
130,Phil Pressey,Phoenix Suns,25.0,PG,25.0,5-11,175.0,Missouri,55722.0
135,Alan Williams,Phoenix Suns,15.0,C,23.0,6-8,260.0,UC Santa Barbara,83397.0
175,Jordan McRae,Cleveland Cavaliers,12.0,SG,25.0,6-5,179.0,Tennessee,111196.0


In [119]:
# Null values default to the end of the sorted dataframe
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
397,Axel Toupane,Denver Nuggets,6.0,SG,23.0,6-7,210.0,,
409,Greg Smith,Minnesota Timberwolves,4.0,PF,25.0,6-10,250.0,Fresno State,
457,,,,,,,,,


In [121]:
# use na_position parameter to put Null values at top of sorted dataframe
nba.sort_values("Salary", na_position="first").head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
46,Elton Brand,Philadelphia 76ers,42.0,PF,37.0,6-9,254.0,Duke,
171,Dahntay Jones,Cleveland Cavaliers,30.0,SG,35.0,6-6,225.0,Duke,
264,Jordan Farmar,Memphis Grizzlies,4.0,PG,29.0,6-2,180.0,UCLA,
269,Ray McCallum,Memphis Grizzlies,5.0,PG,24.0,6-3,190.0,Detroit,


# Sort a DataFrame with the .sort_values() Method, Part 2

In [122]:
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [123]:
# can sort dataframe by multiple values
nba.sort_values(["Team", "Name"])

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
312,Al Horford,Atlanta Hawks,15.0,C,30.0,6-10,245.0,Florida,12000000.0
318,Dennis Schroder,Atlanta Hawks,17.0,PG,22.0,6-1,172.0,,1763400.0
323,Jeff Teague,Atlanta Hawks,0.0,PG,27.0,6-2,186.0,Wake Forest,8000000.0
309,Kent Bazemore,Atlanta Hawks,24.0,SF,26.0,6-5,201.0,Old Dominion,2000000.0
311,Kirk Hinrich,Atlanta Hawks,12.0,SG,35.0,6-4,190.0,Kansas,2854940.0
313,Kris Humphries,Atlanta Hawks,43.0,PF,31.0,6-9,235.0,Minnesota,1000000.0
314,Kyle Korver,Atlanta Hawks,26.0,SG,35.0,6-7,212.0,Creighton,5746479.0
317,Lamar Patterson,Atlanta Hawks,13.0,SG,24.0,6-5,225.0,Pittsburgh,525093.0
316,Mike Muscala,Atlanta Hawks,31.0,PF,24.0,6-11,240.0,Bucknell,947276.0
319,Mike Scott,Atlanta Hawks,32.0,PF,27.0,6-8,237.0,Virginia,3333333.0


In [125]:
# can sort multiple columns in descending order
nba.sort_values(["Team", "Name"], ascending=False)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
379,Ramon Sessions,Washington Wizards,7.0,PG,30.0,6-3,190.0,Nevada,2170465.0
378,Otto Porter Jr.,Washington Wizards,22.0,SF,23.0,6-8,198.0,Georgetown,4662960.0
375,Nene Hilario,Washington Wizards,42.0,C,33.0,6-11,250.0,,13000000.0
376,Markieff Morris,Washington Wizards,5.0,PF,26.0,6-10,245.0,Kansas,8000000.0
381,Marcus Thornton,Washington Wizards,15.0,SF,29.0,6-4,205.0,LSU,200600.0
373,Marcin Gortat,Washington Wizards,13.0,C,32.0,6-11,240.0,,11217391.0
377,Kelly Oubre Jr.,Washington Wizards,12.0,SF,20.0,6-7,205.0,Kansas,1920240.0
382,John Wall,Washington Wizards,2.0,PG,25.0,6-4,195.0,Kentucky,15851950.0
371,Jarell Eddie,Washington Wizards,8.0,SG,24.0,6-7,218.0,Virginia Tech,561716.0
370,Jared Dudley,Washington Wizards,1.0,SF,30.0,6-7,225.0,Boston College,4375000.0


In [129]:
# can combine sorting in ascending and descending order by using a list as the argument:
# each boolean lines up with the column at the same position, so "Team" is sorted
# descending (False) and "Name" ascending (True)
# inplace=True modifies nba itself instead of returning a new sorted dataframe
nba.sort_values(["Team", "Name"], ascending=[False, True], inplace=True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
368,Alan Anderson,Washington Wizards,6.0,SG,33.0,6-6,220.0,Michigan State,4000000.0
369,Bradley Beal,Washington Wizards,3.0,SG,22.0,6-5,207.0,Florida,5694674.0
372,Drew Gooden,Washington Wizards,90.0,PF,34.0,6-10,250.0,Kansas,3300000.0


# Sort a DataFrame with the .sort_index() Method

In [130]:
# re-read the CSV: the previous section sorted nba in place, so load it again to get the file's row order back
nba = pd.read_csv("nba.csv")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


In [132]:
# reorder the rows in place by "Number", then "Salary", then "Name", so the index
# is no longer in order; the all-NaN row (index 457) ends up at the very end
nba.sort_values(["Number", "Salary", "Name"], inplace=True)
nba.tail(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
68,Lucas Nogueira,Toronto Raptors,92.0,C,23.0,7-0,220.0,,1842000.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
457,,,,,,,,,


In [134]:
# order the rows by their index labels (smallest label first) and preview the first five;
# without inplace=True this returns a new dataframe and leaves nba itself as it was
nba.sort_index(ascending=True).head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [136]:
# same index sort in reverse, so the largest index label comes first
nba.sort_index(ascending=False).head(5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
457,,,,,,,,,
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [137]:
# inplace=True applies the index sort to nba itself, putting the rows back in index order (0, 1, 2, ...)
nba.sort_index(inplace=True)
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,


# Rank Values with the .rank() Method

In [141]:
# .rank() needs a column without missing values, so clean "Salary" while loading:
#   1. dropna(how="all") removes rows in which every column is missing
#   2. missing salaries are replaced with 0 so the column can be cast to integers
nba = (
    pd.read_csv("nba.csv")
    .dropna(how="all")
    .assign(Salary=lambda frame: frame["Salary"].fillna(0).astype("int"))
)
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000


In [147]:
# .rank() is called on single Series in dataframe
# ascending=False gives rank 1 to the largest salary
# NOTE(review): .rank() averages the ranks of tied values by default, so ties can produce
# fractional ranks that astype("int") truncates — confirm that is intended, or pass method="min"
nba["Salary Rank"] = nba["Salary"].rank(ascending=False).astype("int")
nba.head(3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337,97
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117,110
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,0,452


In [152]:
# sanity-check the new column: the three highest salaries should carry ranks 1, 2 and 3
nba.sort_values(by="Salary", ascending=False).head(n=3)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Salary Rank
109,Kobe Bryant,Los Angeles Lakers,24.0,SF,37.0,6-6,212.0,,25000000,1
169,LeBron James,Cleveland Cavaliers,23.0,SF,31.0,6-8,250.0,,22970500,2
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000,3
