In [1]:
import pandas as pd

In [2]:
pd.set_option('max_columns', 8, 'max_rows', 20)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head(2)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0


In [4]:
movie_duration = movie['duration'] > 120
movie_duration.head(3)

movie_title
Avatar                                      True
Pirates of the Caribbean: At World's End    True
Spectre                                     True
Name: duration, dtype: bool

In [5]:
movie_duration.sum()   # movies more than 2 hours

1039

In [6]:
movie_duration.mean()

0.2113506916192026

In [7]:
movie['duration'].dropna().gt(120).mean()

0.21199755152009794

In [8]:
movie_duration.describe()

count      4916
unique        2
top       False
freq       3877
Name: duration, dtype: object

In [9]:
movie_duration.value_counts()

False    3877
True     1039
Name: duration, dtype: int64

In [10]:
movie_duration.value_counts(normalize=True)

False    0.788649
True     0.211351
Name: duration, dtype: float64

In [11]:
actor_likes = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
actor_likes

Unnamed: 0_level_0,actor_1_facebook_likes,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avatar,1000.0,936.0
Pirates of the Caribbean: At World's End,40000.0,5000.0
Spectre,11000.0,393.0
The Dark Knight Rises,27000.0,23000.0
Star Wars: Episode VII - The Force Awakens,131.0,12.0
...,...,...
Signed Sealed Delivered,637.0,470.0
The Following,841.0,593.0
A Plague So Pleasant,0.0,0.0
Shanghai Calling,946.0,719.0


In [12]:
likes_count = actor_likes['actor_1_facebook_likes'] > actor_likes['actor_2_facebook_likes']

In [13]:
likes_count.value_counts()

True     4794
False     109
dtype: int64

In [14]:
likes_count.mean()  # % of movies, actor_1 has more fb likes

0.9777687130328371

In [15]:
criteria1 = movie.imdb_score > 8
criteria2 = movie.content_rating == 'PG-13'
criteria3 = ((movie.title_year < 2000) | (movie.title_year > 2009))

In [16]:
criteria2.head()

movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: content_rating, dtype: bool

In [17]:
final_critirea = criteria1 & criteria2 & criteria3
final_critirea.value_counts() 

False    4893
True       23
dtype: int64

In [18]:
crit_a1 = movie.imdb_score > 8
crit_a2 = movie.content_rating == 'PG-13'
crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

In [19]:
crit_b1 = movie.imdb_score < 5
crit_b2 = movie.content_rating == 'R'
crit_b3 = (movie.title_year >= 2000) | (movie.title_year <= 2010)
final_crit_b = crit_b1 & crit_b2 & crit_b3

In [20]:
final_crit_all = final_crit_a | final_crit_b

In [21]:
final_crit_all.value_counts()

False    4748
True      168
dtype: int64

In [22]:
final_crit_all.head(10)

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
John Carter                                   False
Spider-Man 3                                  False
Tangled                                       False
Avengers: Age of Ultron                       False
Harry Potter and the Half-Blood Prince        False
dtype: bool

### using a boolean series as index to data frame, 
### it filters the values of the values only where the series value is true

In [23]:
movie[final_crit_all].head(5)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
The Avengers,Color,Joss Whedon,703.0,173.0,...,21000.0,8.1,1.85,123000
Captain America: Civil War,Color,Anthony Russo,516.0,147.0,...,19000.0,8.2,2.35,72000
The Lovers,Color,Roland Joffé,10.0,109.0,...,525.0,4.5,,677
Guardians of the Galaxy,Color,James Gunn,653.0,121.0,...,14000.0,8.1,2.35,96000


In [24]:
cols = ['imdb_score', 'content_rating', 'title_year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head(10)

Unnamed: 0_level_0,imdb_score,content_rating,title_year
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight Rises,8.5,PG-13,2012.0
The Avengers,8.1,PG-13,2012.0
Captain America: Civil War,8.2,PG-13,2016.0
The Lovers,4.5,R,2015.0
Guardians of the Galaxy,8.1,PG-13,2014.0
Interstellar,8.6,PG-13,2014.0
Inception,8.8,PG-13,2010.0
The Martian,8.1,PG-13,2015.0
Town & Country,4.4,R,2001.0
Sex and the City 2,4.3,R,2010.0


In [25]:
college = pd.read_csv('data/college.csv')

In [26]:
college[college['STABBR'] == 'TX'].head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3610,Abilene Christian University,Abilene,TX,0.0,...,0.5527,0.0381,40200,25985
3611,Alvin Community College,Alvin,TX,0.0,...,0.0625,0.2841,34500,6750
3612,Amarillo College,Amarillo,TX,0.0,...,0.1573,0.3431,31700,10950
3613,Angelina College,Lufkin,TX,0.0,...,0.0,0.2603,26900,PrivacySuppressed
3614,Angelo State University,San Angelo,TX,0.0,...,0.5279,0.1407,37700,21319.5


In [27]:
college2 = college.set_index('STABBR')
college2.loc['TX'].head()

Unnamed: 0_level_0,INSTNM,CITY,HBCU,MENONLY,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TX,Abilene Christian University,Abilene,0.0,0.0,...,0.5527,0.0381,40200,25985
TX,Alvin Community College,Alvin,0.0,0.0,...,0.0625,0.2841,34500,6750
TX,Amarillo College,Amarillo,0.0,0.0,...,0.1573,0.3431,31700,10950
TX,Angelina College,Lufkin,0.0,0.0,...,0.0,0.2603,26900,PrivacySuppressed
TX,Angelo State University,San Angelo,0.0,0.0,...,0.5279,0.1407,37700,21319.5


#### Boolean indexing takes more time

In [28]:
%timeit college[college['STABBR'] == 'TX']

551 µs ± 2.47 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [29]:
%timeit college2.loc['TX']

231 µs ± 1.59 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### setting index takes time, but it needs to be done only once.

In [30]:
%timeit college2 = college.set_index('STABBR') 

507 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
college.head(2) # without index

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5


In [32]:
college2.head(2)  # with index

Unnamed: 0_level_0,INSTNM,CITY,HBCU,MENONLY,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AL,Alabama A & M University,Normal,1.0,0.0,...,0.8284,0.1049,30300,33888.0
AL,University of Alabama at Birmingham,Birmingham,0.0,0.0,...,0.5214,0.2422,39700,21941.5


In [33]:
college3 = college2.sort_index()
college3.index.is_monotonic

True

In [34]:
%timeit college[college['STABBR'] == 'TX']

551 µs ± 2.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [35]:
%timeit college2.loc['TX']

230 µs ± 2.19 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [36]:
%timeit college3.loc['TX']    # index sorted will be fast in searching

33.9 µs ± 55.8 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [37]:
college3.index.is_unique

False

In [38]:
college_unique = college.set_index('INSTNM')
college_unique.index.is_unique

True

In [39]:
college[college['INSTNM'] == 'Stanford University']

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4217,Stanford University,Stanford,CA,0.0,...,0.1256,0.0401,86000,12782


In [40]:
college_unique.loc['Stanford University']

CITY                  Stanford
STABBR                      CA
HBCU                       0.0
MENONLY                    0.0
WOMENONLY                  0.0
                        ...   
PCTPELL                 0.1556
PCTFLOAN                0.1256
UG25ABV                 0.0401
MD_EARN_WNE_P10          86000
GRAD_DEBT_MDN_SUPP       12782
Name: Stanford University, Length: 26, dtype: object

In [41]:
%timeit college[college['INSTNM'] == 'Stanford University']

498 µs ± 2.08 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### unique and sorted index will be more faster

In [42]:
%timeit college_unique.loc['Stanford University']

66.3 µs ± 225 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [43]:
### Indexing with mor enumber of columns

In [44]:
college.head(4)

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,...,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,...,0.4596,0.264,45500,24097.0


In [45]:
college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
"ARTESIA, CA",Angeles Institute,ARTESIA,CA,0.0,...,0.8138,0.5429,,16850
"Aberdeen, SD",Presentation College,Aberdeen,SD,0.0,...,0.756,0.3097,35900.0,25000
"Aberdeen, SD",Northern State University,Aberdeen,SD,0.0,...,0.4303,0.1766,33600.0,24847
"Aberdeen, WA",Grays Harbor College,Aberdeen,WA,0.0,...,0.1502,0.5087,27000.0,11490
"Abilene, TX",Hardin-Simmons University,Abilene,TX,0.0,...,0.5547,0.0982,38700.0,25864


In [48]:
college.loc['Miami, FL'].head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
"Miami, FL",New Professions Technical Institute,Miami,FL,0.0,...,0.678,0.8358,18700,8682
"Miami, FL",Management Resources College,Miami,FL,0.0,...,0.5458,0.8698,PrivacySuppressed,12182
"Miami, FL",Strayer University-Doral,Miami,FL,,...,,,49200,36173.5
"Miami, FL",Keiser University- Miami,Miami,FL,,...,,,29700,26063
"Miami, FL",George T Baker Aviation Technical College,Miami,FL,0.0,...,0.0,0.4366,38600,PrivacySuppressed


In [49]:
%%timeit
crit1 = college['CITY'] == 'Miami'
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]

981 µs ± 5.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [50]:
%timeit college.loc['Miami, FL']

34.3 µs ± 418 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
