## Intro to Dataframes Module

In [5]:
import pandas as pd

In [6]:
# Read nba.csv (a relative path) into a DataFrame; no index_col is given,
# so pandas assigns the default 0..n-1 row labels.
nba = pd.read_csv("nba.csv")

## Shared Methods and Attributes between Series and Dataframe

In [7]:
# Preview the top of the DataFrame (.head() shows five rows when called without an argument).
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [8]:
nba.tail() # last five rows; note that the final row (label 457) is NaN in every column

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0
457,,,,,,,,,


In [10]:
# Row labels of the DataFrame -- a RangeIndex here, since no index column was chosen when reading the CSV.
nba.index

RangeIndex(start=0, stop=458, step=1)

In [11]:
# The underlying data as a NumPy array of rows; the columns mix strings and floats,
# so the array falls back to dtype=object.
# NOTE(review): the pandas docs recommend DataFrame.to_numpy() over .values for new code.
nba.values

array([['Avery Bradley', 'Boston Celtics', 0.0, ..., 180.0, 'Texas',
        7730337.0],
       ['Jae Crowder', 'Boston Celtics', 99.0, ..., 235.0, 'Marquette',
        6796117.0],
       ['John Holland', 'Boston Celtics', 30.0, ..., 205.0,
        'Boston University', nan],
       ...,
       ['Tibor Pleiss', 'Utah Jazz', 21.0, ..., 256.0, nan, 2900000.0],
       ['Jeff Withey', 'Utah Jazz', 24.0, ..., 231.0, 'Kansas', 947276.0],
       [nan, nan, nan, ..., nan, nan, nan]], dtype=object)

In [14]:
# .shape is a (rows, columns) tuple. Only a cell's last expression is displayed,
# so the bare nba.shape below shows nothing; the row count from shape[0] is what appears.
nba.shape
nba.shape[0]

458

In [15]:
# Data type of every column, returned as a Series indexed by column name.
nba.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [16]:
# Column labels of the DataFrame, returned as an Index object.
nba.columns

Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
       'College', 'Salary'],
      dtype='object')

In [17]:
# Both axes in one list: the row index first, then the column index.
nba.axes

[RangeIndex(start=0, stop=458, step=1),
 Index(['Name', 'Team', 'Number', 'Position', 'Age', 'Height', 'Weight',
        'College', 'Salary'],
       dtype='object')]

In [18]:
nba.info() # prints a summary: index range, non-null count and dtype per column, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      457 non-null    object 
 1   Team      457 non-null    object 
 2   Number    457 non-null    float64
 3   Position  457 non-null    object 
 4   Age       457 non-null    float64
 5   Height    457 non-null    object 
 6   Weight    457 non-null    float64
 7   College   373 non-null    object 
 8   Salary    446 non-null    float64
dtypes: float64(4), object(5)
memory usage: 32.3+ KB


In [22]:
# nba.get_dtype_counts() no longer works (NOTE(review): removed from pandas as of 1.0 -- confirm).
# Counting the values of the .dtypes Series gives the same per-dtype column tally.

nba.dtypes.value_counts()

object     5
float64    4
dtype: int64

## Differences between Shared Methods

In [25]:
# Load revenue.csv and use its "Date" column as the row labels instead of the default 0..n-1 index.
rev = pd.read_csv("revenue.csv", index_col="Date") # reading in with Date as index
rev.head(3) # preview only the first three rows

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933


In [28]:
# A tiny three-element Series, used to contrast Series.sum() with the
# DataFrame.sum() calls in the cells that follow.
s = pd.Series(data=[1, 2, 3])
s.sum()  # a Series has a single axis, so the result is one scalar

6

In [29]:
rev.sum() # default axis: returns a Series whose index labels are the column names and whose values are the column totals

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [30]:
rev.sum(axis=0) # column sums; same result as calling rev.sum() with no arguments

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [31]:
rev.sum(axis=1) # row sums: one total per Date, adding across the city columns (alternate: rev.sum(axis="columns"))

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

## Select One column from a Dataframe

In [33]:
# Re-read nba.csv at the start of the section (presumably so the section can be run on its own).
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [37]:
# Attribute (dot) access is a quick way to select a column, but it is not preferred
# because it doesn't work when the column name contains spaces.
# A single column selected from a DataFrame is returned as a Series.

nba.Name
nba.Number
nba.Salary

Output = None # masks the cell's output: the last statement is an assignment, so nothing is displayed

In [None]:
# The preferred way to select a column is bracket notation, e.g. nba["Name"]; it also works for column names that contain spaces.

In [41]:
nba["Name"] # bracket selection takes the column name as a quoted string
nba["Number"] # only this last expression is displayed by the cell

0       0.0
1      99.0
2      30.0
3      28.0
4       8.0
       ... 
453     8.0
454    25.0
455    21.0
456    24.0
457     NaN
Name: Number, Length: 458, dtype: float64

In [42]:
type(nba["Name"]) # a single extracted column is a pandas Series

pandas.core.series.Series

In [43]:
# Series methods can be called directly on the extracted column; here, its first five values.
nba["Name"].head(5)

0    Avery Bradley
1      Jae Crowder
2     John Holland
3      R.J. Hunter
4    Jonas Jerebko
Name: Name, dtype: object

## Select Two or more columns from a DataFrame

In [44]:
# Re-read nba.csv at the start of the section (presumably so the section can be run on its own).
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [46]:
# The outer brackets are the selection operator and the inner brackets are a Python list
# of column names; the result is a DataFrame with the columns in the listed order.
nba[["Name","Team"]] # Extract multiple columns with a Python list

Unnamed: 0,Name,Team
0,Avery Bradley,Boston Celtics
1,Jae Crowder,Boston Celtics
2,John Holland,Boston Celtics
3,R.J. Hunter,Boston Celtics
4,Jonas Jerebko,Boston Celtics
...,...,...
453,Shelvin Mack,Utah Jazz
454,Raul Neto,Utah Jazz
455,Tibor Pleiss,Utah Jazz
456,Jeff Withey,Utah Jazz


In [48]:
# Cleaner multi-column selection: name the list of wanted columns first,
# then pass that variable to the bracket operator. The resulting DataFrame
# follows the order of the list, not the order of the columns in the file.
select = [
    "Salary",
    "Team",
    "Name",
]
nba[select]

Unnamed: 0,Salary,Team,Name
0,7730337.0,Boston Celtics,Avery Bradley
1,6796117.0,Boston Celtics,Jae Crowder
2,,Boston Celtics,John Holland
3,1148640.0,Boston Celtics,R.J. Hunter
4,5000000.0,Boston Celtics,Jonas Jerebko
...,...,...,...
453,2433333.0,Utah Jazz,Shelvin Mack
454,900000.0,Utah Jazz,Raul Neto
455,2900000.0,Utah Jazz,Tibor Pleiss
456,947276.0,Utah Jazz,Jeff Withey


## Add New Column to a Dataframe

In [49]:
# Re-read nba.csv at the start of the section (presumably so the section can be run on its own).
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [52]:
# Assigning to a column name that does not exist yet creates the column; it is appended as the last column.
nba["Sport"] = "Basketball" # a single scalar value is applied to every row
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball
...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,Basketball
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,Basketball
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,Basketball


In [55]:
# Same technique again: the new "League" column is appended after "Sport", with one value for every row.
nba["League"] = "National Basketball Association"
nba

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary,Sport,League
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0,Basketball,National Basketball Association
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0,Basketball,National Basketball Association
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,,Basketball,National Basketball Association
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0,Basketball,National Basketball Association
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0,Basketball,National Basketball Association
...,...,...,...,...,...,...,...,...,...,...,...
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0,Basketball,National Basketball Association
454,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0,Basketball,National Basketball Association
455,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0,Basketball,National Basketball Association
456,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0,Basketball,National Basketball Association


In [63]:
# Re-read nba.csv, which discards the "Sport" and "League" columns added in the previous cells.
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [64]:
# Second option for adding a column: DataFrame.insert().
# Here `loc` is insert()'s integer position argument (0 = first column); it is not the .loc indexer.
# insert() modifies nba in place, and pandas raises ValueError if the column already exists,
# so this cell cannot be re-run without re-reading the CSV first.

nba.insert(loc = 0, column="Sport", value="Basketball")
nba

Unnamed: 0,Sport,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Basketball,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Basketball,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,Basketball,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,Basketball,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Basketball,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0
...,...,...,...,...,...,...,...,...,...,...
453,Basketball,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0
454,Basketball,Raul Neto,Utah Jazz,25.0,PG,24.0,6-1,179.0,,900000.0
455,Basketball,Tibor Pleiss,Utah Jazz,21.0,C,26.0,7-3,256.0,,2900000.0
456,Basketball,Jeff Withey,Utah Jazz,24.0,C,26.0,7-0,231.0,Kansas,947276.0


## Broadcasting Operations

In [65]:
# Broadcasting is similar to .apply as it was used on a Series: one operation is applied to every value.
# Re-read nba.csv at the start of the section.
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [67]:
nba["Age"].add(5) # method form: adds 5 to every value; missing ages stay NaN

# Operator form of the same broadcast; as the cell's last expression, this is the result displayed.
nba["Age"] + 5

0      30.0
1      30.0
2      32.0
3      27.0
4      34.0
       ... 
453    31.0
454    29.0
455    31.0
456    31.0
457     NaN
Name: Age, Length: 458, dtype: float64

In [70]:
# Subtract 5 million from each salary, written two equivalent ways.
# .sub() takes the amount to subtract as a positive number; passing a negative
# amount would add 5 million instead.
nba["Salary"].sub(5000000)
# Operator form of the same subtraction; as the cell's last expression, this is the result displayed.
# Rows with a missing Salary stay NaN.
nba["Salary"] - 5000000

0      2730337.0
1      1796117.0
2            NaN
3     -3851360.0
4            0.0
         ...    
453   -2566667.0
454   -4100000.0
455   -2100000.0
456   -4052724.0
457          NaN
Name: Salary, Length: 458, dtype: float64

In [73]:
# Convert weights to kilograms (presumably from pounds -- TODO confirm the unit used in nba.csv).
# NOTE(review): 0.453 is a rounded factor; one pound is about 0.4536 kg.

nba["Weight"].mul(0.453) # method form
nba["Weight"] * 0.453 # operator form; the displayed result

0       81.540
1      106.455
2       92.865
3       83.805
4      104.643
        ...   
453     91.959
454     81.087
455    115.968
456    104.643
457        NaN
Name: Weight, Length: 458, dtype: float64

In [74]:
# Adding to a new column
nba["Weight in Kilograms"] = nba["Weight"] * 0.453 
nba

In [76]:
nba["Salary in millions"] = nba["Salary"].div(1000000) # new column: each salary divided by one million

## A Review of the .value_counts Method

In [78]:
# Re-read nba.csv, which discards the columns added in the broadcasting section.
nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [82]:
# .value_counts() is called on one column (a Series) at a time here.
# NOTE(review): pandas 1.1+ also has DataFrame.value_counts(), which counts unique rows -- confirm for the installed version.
nba["Team"].value_counts() # occurrences of each team name
nba["Position"].value_counts() # occurrences of each position
nba["Weight"].value_counts() # occurrences of each distinct weight
nba["Salary"].value_counts() # only this last expression is displayed: occurrences of each distinct salary

947276.0     31
845059.0     18
525093.0     13
981348.0      6
1100602.0     5
             ..
1242720.0     1
2489530.0     1
5103120.0     1
9463484.0     1
700902.0      1
Name: Salary, Length: 309, dtype: int64

## Drop Rows with Null Values

## Fill in Null Values with the .fillna() Method

## The .astype() Method

## Sort a DataFrame with the .sort_values() Method Part 1

## Sort a DataFrame with the .sort_values() Method Part 2

## Sort a DataFrame with the .sort_index() Method

## Rank Values with the .rank() Method