In [1]:
import pandas as pd

In [2]:
pd.set_option('max_columns', 8, 'max_rows', 20)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
movie = pd.read_csv('data/movie.csv', index_col='movie_title')
movie.head(2)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0


In [4]:
movie_duration = movie['duration'] > 120
movie_duration.head(3)

movie_title
Avatar                                      True
Pirates of the Caribbean: At World's End    True
Spectre                                     True
Name: duration, dtype: bool

In [5]:
movie_duration.sum()   # movies more than 2 hours

1039

In [6]:
movie_duration.mean()

0.2113506916192026

In [7]:
movie['duration'].dropna().gt(120).mean()

0.21199755152009794

In [8]:
movie_duration.describe()

count      4916
unique        2
top       False
freq       3877
Name: duration, dtype: object

In [9]:
movie_duration.value_counts()

False    3877
True     1039
Name: duration, dtype: int64

In [10]:
movie_duration.value_counts(normalize=True)

False    0.788649
True     0.211351
Name: duration, dtype: float64

In [11]:
actor_likes = movie[['actor_1_facebook_likes', 'actor_2_facebook_likes']].dropna()
actor_likes

Unnamed: 0_level_0,actor_1_facebook_likes,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avatar,1000.0,936.0
Pirates of the Caribbean: At World's End,40000.0,5000.0
Spectre,11000.0,393.0
The Dark Knight Rises,27000.0,23000.0
Star Wars: Episode VII - The Force Awakens,131.0,12.0
...,...,...
Signed Sealed Delivered,637.0,470.0
The Following,841.0,593.0
A Plague So Pleasant,0.0,0.0
Shanghai Calling,946.0,719.0


In [12]:
likes_count = actor_likes['actor_1_facebook_likes'] > actor_likes['actor_2_facebook_likes']

In [13]:
likes_count.value_counts()

True     4794
False     109
dtype: int64

In [14]:
likes_count.mean()  # % of movies, actor_1 has more fb likes

0.9777687130328371

In [15]:
criteria1 = movie.imdb_score > 8
criteria2 = movie.content_rating == 'PG-13'
criteria3 = ((movie.title_year < 2000) | (movie.title_year > 2009))

In [16]:
criteria2.head()

movie_title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: content_rating, dtype: bool

In [17]:
final_critirea = criteria1 & criteria2 & criteria3
final_critirea.value_counts() 

False    4893
True       23
dtype: int64

In [18]:
crit_a1 = movie.imdb_score > 8
crit_a2 = movie.content_rating == 'PG-13'
crit_a3 = (movie.title_year < 2000) | (movie.title_year > 2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

In [19]:
crit_b1 = movie.imdb_score < 5
crit_b2 = movie.content_rating == 'R'
crit_b3 = (movie.title_year >= 2000) | (movie.title_year <= 2010)
final_crit_b = crit_b1 & crit_b2 & crit_b3

In [20]:
final_crit_all = final_crit_a | final_crit_b

In [21]:
final_crit_all.value_counts()

False    4748
True      168
dtype: int64

In [22]:
final_crit_all.head(10)

movie_title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
John Carter                                   False
Spider-Man 3                                  False
Tangled                                       False
Avengers: Age of Ultron                       False
Harry Potter and the Half-Blood Prince        False
dtype: bool

### using a boolean series as index to data frame, 
### it filters the values of the values only where the series value is true

In [23]:
movie[final_crit_all].head(5)

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
The Avengers,Color,Joss Whedon,703.0,173.0,...,21000.0,8.1,1.85,123000
Captain America: Civil War,Color,Anthony Russo,516.0,147.0,...,19000.0,8.2,2.35,72000
The Lovers,Color,Roland Joffé,10.0,109.0,...,525.0,4.5,,677
Guardians of the Galaxy,Color,James Gunn,653.0,121.0,...,14000.0,8.1,2.35,96000


In [24]:
cols = ['imdb_score', 'content_rating', 'title_year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head(10)

Unnamed: 0_level_0,imdb_score,content_rating,title_year
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight Rises,8.5,PG-13,2012.0
The Avengers,8.1,PG-13,2012.0
Captain America: Civil War,8.2,PG-13,2016.0
The Lovers,4.5,R,2015.0
Guardians of the Galaxy,8.1,PG-13,2014.0
Interstellar,8.6,PG-13,2014.0
Inception,8.8,PG-13,2010.0
The Martian,8.1,PG-13,2015.0
Town & Country,4.4,R,2001.0
Sex and the City 2,4.3,R,2010.0


In [25]:
college = pd.read_csv('data/college.csv')

In [26]:
college[college['STABBR'] == 'TX'].head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3610,Abilene Christian University,Abilene,TX,0.0,...,0.5527,0.0381,40200,25985
3611,Alvin Community College,Alvin,TX,0.0,...,0.0625,0.2841,34500,6750
3612,Amarillo College,Amarillo,TX,0.0,...,0.1573,0.3431,31700,10950
3613,Angelina College,Lufkin,TX,0.0,...,0.0,0.2603,26900,PrivacySuppressed
3614,Angelo State University,San Angelo,TX,0.0,...,0.5279,0.1407,37700,21319.5


In [27]:
college2 = college.set_index('STABBR')
college2.loc['TX'].head()

Unnamed: 0_level_0,INSTNM,CITY,HBCU,MENONLY,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TX,Abilene Christian University,Abilene,0.0,0.0,...,0.5527,0.0381,40200,25985
TX,Alvin Community College,Alvin,0.0,0.0,...,0.0625,0.2841,34500,6750
TX,Amarillo College,Amarillo,0.0,0.0,...,0.1573,0.3431,31700,10950
TX,Angelina College,Lufkin,0.0,0.0,...,0.0,0.2603,26900,PrivacySuppressed
TX,Angelo State University,San Angelo,0.0,0.0,...,0.5279,0.1407,37700,21319.5


#### Boolean indexing takes more time

In [28]:
%timeit college[college['STABBR'] == 'TX']

572 µs ± 9.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [29]:
%timeit college2.loc['TX']

237 µs ± 3.35 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


#### setting index takes time, but it needs to be done only once.

In [30]:
%timeit college2 = college.set_index('STABBR') 

553 µs ± 20.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [31]:
college.head(2) # without index

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,...,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,...,0.5214,0.2422,39700,21941.5


In [32]:
college2.head(2)  # with index

Unnamed: 0_level_0,INSTNM,CITY,HBCU,MENONLY,...,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AL,Alabama A & M University,Normal,1.0,0.0,...,0.8284,0.1049,30300,33888.0
AL,University of Alabama at Birmingham,Birmingham,0.0,0.0,...,0.5214,0.2422,39700,21941.5


In [33]:
college3 = college2.sort_index()
college3.index.is_monotonic

True

In [34]:
%timeit college[college['STABBR'] == 'TX']

562 µs ± 844 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
%timeit college2.loc['TX']

In [None]:
%timeit college3.loc['TX']    # index sorted will be fast in searching

In [None]:
college3.index.is_unique

In [None]:
college_unique = college.set_index('INSTNM')
college_unique.index.is_unique

In [None]:
college[college['INSTNM'] == 'Stanford University']

In [None]:
college_unique.loc['Stanford University']

In [None]:
%timeit college[college['INSTNM'] == 'Stanford University']

### unique and sorted index will be more faster

In [None]:
%timeit college_unique.loc['Stanford University']

In [None]:
### Indexing with mor enumber of columns

In [None]:
college.head(4)

In [None]:
college.index = college['CITY'] + ', ' + college['STABBR']
college = college.sort_index()
college.head()

In [None]:
college.loc['Miami, FL'].head()

In [None]:
%%timeit
crit1 = college['CITY'] == 'Miami'
crit2 = college['STABBR'] == 'FL'
college[crit1 & crit2]

In [None]:
%timeit college.loc['Miami, FL']

### Using SQL types filtering with Data frames

In [None]:
employee = pd.read_csv('data/employee.csv')
employee['DEPARTMENT'].value_counts().head()

In [None]:
employee.GENDER.value_counts()

In [None]:
employee.BASE_SALARY.describe().astype(int)

In [None]:
depts = ['Houston Police Department-HPD', 'Houston Fire Department (HFD)']

In [None]:
criteria_dept = employee.DEPARTMENT.isin(depts)

In [None]:
criteria_gender = employee.GENDER == 'Female'

In [None]:
criteria_sal = (employee.BASE_SALARY >= 80000) & (employee.BASE_SALARY <= 120000)

In [None]:
criteria_final = (criteria_dept & criteria_gender & criteria_sal)

In [None]:
select_columns = ['UNIQUE_ID', 'DEPARTMENT',
                     'GENDER', 'BASE_SALARY']

In [None]:
employee.loc[criteria_final, select_columns]

In [None]:
criteria_sal = employee.BASE_SALARY.between(6000, 12000)


#### get details which not in top5 departments

In [None]:
top_5_depts = employee.DEPARTMENT.value_counts().index[:5]
criteria = ~employee.DEPARTMENT.isin(top_5_depts)
(employee[criteria]).head(5)

In [None]:
employee.head(2)

In [None]:
depts

In [None]:
select_columns

#### depts python variabe used with @ in query

In [None]:
query_str = "DEPARTMENT in @depts " \
                "and GENDER == 'Female' " \
                "and 80000 <= BASE_SALARY <= 120000"

In [None]:
emp_filtered = employee.query(query_str)

In [None]:
emp_filtered[select_columns].head()

In [None]:
#### other way of getting the departments

In [None]:
top10_depts = employee.DEPARTMENT.value_counts().index[:10].tolist()

In [None]:
querystr = "DEPARTMENT not in @top10_depts and GENDER == 'Female'"
emp_filtered = employee.query(query_str)
emp_filtered.head(4)

In [None]:
movie.head()

In [None]:
fb_likes = movie['actor_1_facebook_likes'].dropna()
fb_likes.head()

In [None]:
fb_likes.describe(percentiles=[.1, .25, .5, .75, .9]).astype(int)

In [None]:
fb_likes.hist()

In [None]:
criteria_high = fb_likes < 20000
criteria_high.mean().round(2)

In [None]:
fb_likes.where(criteria_high).head()

In [None]:
fb_likes.where(criteria_high, other=20000).head()

In [None]:
criteira_low = fb_likes > 300
fb_likes_cap = fb_likes.where(criteria_high, other=20000) \
                .where(criteira_low, other=300)

In [None]:
fb_likes_cap.head()

In [None]:
len(fb_likes), len(fb_likes_cap)

In [None]:
fb_likes_cap.hist()

In [None]:
# same as where with conditions
fb_likes_cap2 = fb_likes.clip(lower=300, upper=20000)
fb_likes_cap2.equals(fb_likes_cap)

In [None]:
all_above_2010 = movie['title_year'] >= 2010
all_null = movie['title_year'].isnull()
total_crit = all_above_2010 | all_null

In [None]:
movie.mask(total_crit).head() # makes all applicable NaN

In [None]:
movie_mask = movie.mask(total_crit).dropna(how='all')
movie_mask.head()

In [None]:
## checking same with boolean indexing
movie_boolean = movie[movie['title_year'] < 2010]
movie_mask.equals(movie_boolean)   ## data_types are different

In [None]:
movie_mask.shape == movie_boolean.shape

In [None]:
movie_mask.dtypes == movie_boolean.dtypes

In [None]:
#### boolean index vs mask in time

In [None]:
%timeit movie.mask(criteria).dropna(how='all')

In [None]:
%timeit movie[movie['title_year'] < 2010]

In [None]:
c1 = movie['content_rating'] == 'G'
c2 = movie['imdb_score'] < 4
c = c1 & c2

In [None]:
movie_loc = movie.loc[c]
movie_loc.head()

In [None]:
movie_loc.equals(movie[c])

In [None]:
movie_iloc = movie.iloc[c.values]
movie_iloc.equals(movie[c])

In [None]:
cols = ['content_rating', 'imdb_score', 'title_year', 'gross']
movie.loc[c, cols].sort_values('imdb_score')

In [None]:
col_index = [movie.columns.get_loc(col) for col in cols]
col_index

#### whenver using critieria boolean series for iloc, 
#### use the criteria.values

In [None]:
movie.iloc[c.values, col_index].sort_values('imdb_score')

In [None]:
a = c.values

In [None]:
a

In [None]:
type(a)   # criteria values will be ndarray, criteria will be series

In [None]:
len(a), len(c)