## Foundations of Data Science: Computational Thinking with Python

This notebook mirrors the course Jupyter notebook, but uses standard modules (NumPy, pandas, Matplotlib) instead of the Berkeley data science module.

## Lecture 5: Strings, Minard's Map and Building Tables

In [1]:
from IPython.display import display  # Display in the same as the last cell line
import matplotlib.pyplot as plt
%matplotlib inline  
plt.style.use('fivethirtyeight')
import numpy as np
import pandas as pd
# Cap how many rows a dataframe/series repr shows before it is truncated with "...".
# The fully qualified option name is required: the bare pattern 'max_rows' matches
# more than one option in recent pandas releases and raises OptionError.
pd.set_option('display.max_rows', 7)

### Creating Tables

In [2]:
# Import the .csv file into a dataframe
# (note: `movies` is rebound to a small hand-built frame in the next cell)
movies = pd.read_csv('data/top_movies_2017.csv')
movies.size  # total number of cells (rows x columns)

1000

In [3]:
# Create a dataframe from a nested list: one inner list per row,
# plus a matching list of column labels
movie_array = [
    ['Gone with the Wind', 1939, 198676459],
    ['Star Wars', 1977, 460998007],
]
col_names = ['Name', 'Year', 'Gross']
movies = pd.DataFrame(data=movie_array, columns=col_names)

In [4]:
# Number of rows and columns, as a (rows, columns) tuple
movies.shape

(2, 3)

In [5]:
# Select only columns with specified names.
# Only the last expression in a cell is displayed; the first two lines are
# evaluated (to contrast the return types) but their results are discarded.
movies.loc[:, ['Gross']]   # list of names -> returns dataframe
movies.loc[:, 'Gross']  # single name -> returns series

movies.loc[:, ['Year', 'Gross']]   # returns dataframe

Unnamed: 0,Year,Gross
0,1939,198676459
1,1977,460998007


In [6]:
# Keep every column except the named ones (returns a new dataframe; `movies` is unchanged)
movies.drop(columns=['Gross'])

Unnamed: 0,Name,Year
0,Gone with the Wind,1939
1,Star Wars,1977


In [7]:
# Apply function to specific column(s)
movies.loc[:, ['Name', 'Gross']].max()   # Applies to each column, returns series
movies.loc[:, ['Gross']].max()  # one-column dataframe: still returns a series (see output); use movies.loc[:, 'Gross'].max() for a scalar

Gross    460998007
dtype: int64

In [8]:
# Relabel columns with an old-name -> new-name mapping.
# inplace=True alters `movies` itself; inplace=False would return a new dataframe instead.
movies.rename(mapper={'Name': 'Title', 'Gross': 'Monies'}, axis='columns', inplace=True)
movies

Unnamed: 0,Title,Year,Monies
0,Gone with the Wind,1939,198676459
1,Star Wars,1977,460998007


In [9]:
# Relabel by applying a function to every column name: lower-case them all
movies.rename(columns=str.lower, inplace=True)
movies

Unnamed: 0,title,year,monies
0,Gone with the Wind,1939,198676459
1,Star Wars,1977,460998007


In [10]:
# Capitalize the first letter of every column name
movies.rename(columns=str.capitalize, inplace=True)
movies

# For title capitalization
# movies.rename(mapper=str.title, axis='columns', inplace=True) 

Unnamed: 0,Title,Year,Monies
0,Gone with the Wind,1939,198676459
1,Star Wars,1977,460998007


In [11]:
# Adding rows: put the new records in a dataframe that shares the existing
# column labels, then concatenate and renumber the index from 0
new_rows = pd.DataFrame(
    data=[
        ['The Sound of Music', 1965, 158671368],
        ['E.T.: The Extra-Terrestrial', 1982, 1261085000],
    ],
    columns=movies.columns,
)
movies = pd.concat((movies, new_rows), ignore_index=True)
movies

Unnamed: 0,Title,Year,Monies
0,Gone with the Wind,1939,198676459
1,Star Wars,1977,460998007
2,The Sound of Music,1965,158671368
3,E.T.: The Extra-Terrestrial,1982,1261085000


### Minard's Map

In [12]:
# Import the Minard csv file into a dataframe and display it
minard = pd.read_csv('data/minard.csv')
minard

Unnamed: 0,Longitude,Latitude,City,Direction,Survivors
0,32.0,54.8,Smolensk,Advance,145000
1,33.2,54.9,Dorogobouge,Advance,140000
2,34.4,55.5,Chjat,Advance,127100
...,...,...,...,...,...
5,32.0,54.6,Smolensk,Retreat,24000
6,30.4,54.4,Orscha,Retreat,20000
7,26.8,54.3,Moiodexno,Retreat,12000


In [13]:
# Number of rows and columns, reported as a sentence
n_rows, n_cols = minard.shape
print('{0} rows, {1} columns'.format(n_rows, n_cols))

8 rows, 5 columns


In [14]:
# Column names (returned as an Index of labels)
minard.columns

Index(['Longitude', 'Latitude', 'City', 'Direction', 'Survivors'], dtype='object')

In [15]:
# Two ways to pull out the same column; only the last expression is displayed
minard.loc[:, 'Survivors']   # look up by column name
minard.iloc[:, 4]  # look up by column position (0-based)

0    145000
1    140000
2    127100
      ...  
5     24000
6     20000
7     12000
Name: Survivors, Length: 8, dtype: int64

In [16]:
# Survivors value in the first row (row label 0), used as the baseline army size
initial_size = minard.loc[0, 'Survivors']
initial_size

145000

In [17]:
# Survivors as a percentage of the initial size, rounded to two decimal places
percent_surviving = (minard.loc[:, 'Survivors'] / initial_size * 100).round(2)
percent_surviving

0    100.00
1     96.55
2     87.66
      ...  
5     16.55
6     13.79
7      8.28
Name: Survivors, Length: 8, dtype: float64

In [18]:
# Attach the percentages as a new column, then show the table
minard['Percent Surviving'] = percent_surviving
with pd.option_context("display.max_rows", 10):
    # temporarily raise the row cap so all 8 rows are printed
    display(minard)

Unnamed: 0,Longitude,Latitude,City,Direction,Survivors,Percent Surviving
0,32.0,54.8,Smolensk,Advance,145000,100.0
1,33.2,54.9,Dorogobouge,Advance,140000,96.55
2,34.4,55.5,Chjat,Advance,127100,87.66
3,37.6,55.8,Moscou,Advance,100000,68.97
4,34.3,55.2,Wixma,Retreat,55000,37.93
5,32.0,54.6,Smolensk,Retreat,24000,16.55
6,30.4,54.4,Orscha,Retreat,20000,13.79
7,26.8,54.3,Moiodexno,Retreat,12000,8.28


### Take

In [19]:
# take() selects rows by position, not by the values (labels) in the index column
minard.take([1,2])


Unnamed: 0,Longitude,Latitude,City,Direction,Survivors,Percent Surviving
1,33.2,54.9,Dorogobouge,Advance,140000,96.55
2,34.4,55.5,Chjat,Advance,127100,87.66


In [20]:
# Positions can also be given as a range: here the rows at positions 2, 3 and 4
minard.take(range(2,5))

Unnamed: 0,Longitude,Latitude,City,Direction,Survivors,Percent Surviving
2,34.4,55.5,Chjat,Advance,127100,87.66
3,37.6,55.8,Moscou,Advance,100000,68.97
4,34.3,55.2,Wixma,Retreat,55000,37.93


### Where

In [21]:
# Load the NBA salaries csv into a dataframe and preview the first 5 rows
nba = pd.read_csv('data/nba_salaries.csv')
nba.head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,2015-2016 SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0
2,Tiago Splitter,C,Atlanta Hawks,9.75625
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479


In [22]:
# Shorten the salary column label (modifies `nba` in place)
nba.rename(mapper={'2015-2016 SALARY': 'SALARY'}, axis='columns', inplace=True)
nba.head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
0,Paul Millsap,PF,Atlanta Hawks,18.671659
1,Al Horford,C,Atlanta Hawks,12.0
2,Tiago Splitter,C,Atlanta Hawks,9.75625
3,Jeff Teague,PG,Atlanta Hawks,8.0
4,Kyle Korver,SG,Atlanta Hawks,5.746479


In [23]:
# Sort in place from highest to lowest salary; rows keep their original index labels
nba.sort_values(by='SALARY', ascending=False, inplace=True)
nba.head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
169,Kobe Bryant,SF,Los Angeles Lakers,25.0
29,Joe Johnson,SF,Brooklyn Nets,24.894863
72,LeBron James,SF,Cleveland Cavaliers,22.9705
255,Carmelo Anthony,SF,New York Knicks,22.875
131,Dwight Howard,C,Houston Rockets,22.359364


In [24]:
# Where salary > 10: comparing a column yields a boolean series on the same index
above_10 = nba['SALARY'].gt(10)
above_10.head(5)

169    True
29     True
72     True
255    True
131    True
Name: SALARY, dtype: bool

In [25]:
# Passing a boolean mask to .loc returns a dataframe of the rows where it is True
salary_over_10 = nba['SALARY'] > 10
nba.loc[salary_over_10].tail(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
95,Wilson Chandler,SF,Denver Nuggets,10.449438
144,Monta Ellis,SG,Indiana Pacers,10.3
204,Luol Deng,SF,Miami Heat,10.151612
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855
368,DeMar DeRozan,SG,Toronto Raptors,10.05


In [26]:
# Where between: combine two masks with & to keep salaries strictly between 10 and 11
salary = nba['SALARY']
salary_in_band = (salary > 10) & (salary < 11)
nba.loc[salary_in_band]

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
240,Tyreke Evans,SG,New Orleans Pelicans,10.734586
241,Jrue Holiday,PG,New Orleans Pelicans,10.595507
76,Brendan Haywood,C,Cleveland Cavaliers,10.522500
...,...,...,...,...
204,Luol Deng,SF,Miami Heat,10.151612
298,Gerald Wallace,SF,Philadelphia 76ers,10.105855
368,DeMar DeRozan,SG,Toronto Raptors,10.050000


In [27]:
# Where text equals a value exactly
is_raptors = nba['TEAM'].eq('Toronto Raptors')
nba.loc[is_raptors].head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
366,DeMarre Carroll,SF,Toronto Raptors,13.6
367,Kyle Lowry,PG,Toronto Raptors,12.0
368,DeMar DeRozan,SG,Toronto Raptors,10.05
369,Cory Joseph,PG,Toronto Raptors,7.0
370,Patrick Patterson,PF,Toronto Raptors,6.268675


In [28]:
# Where text contains a substring, ignoring upper/lower case
team_mentions_raptors = nba['TEAM'].str.contains('raptors', case=False)
nba.loc[team_mentions_raptors].head(5)

Unnamed: 0,PLAYER,POSITION,TEAM,SALARY
366,DeMarre Carroll,SF,Toronto Raptors,13.6
367,Kyle Lowry,PG,Toronto Raptors,12.0
368,DeMar DeRozan,SG,Toronto Raptors,10.05
369,Cory Joseph,PG,Toronto Raptors,7.0
370,Patrick Patterson,PF,Toronto Raptors,6.268675
