In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# Limit rich display to 8 columns and 20 rows.
# The fully qualified 'display.*' keys are used because the short forms
# ('max_columns', 'max_rows') match more than one option in pandas versions
# that also define Styler rendering options, which makes set_option raise.
pd.set_option('display.max_columns', 8, 'display.max_rows', 20)

In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; later cells rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

### Selecting Multiple Columns

In [4]:
# Load the movie dataset (path is relative to the notebook) and preview the
# first five rows.
movie = pd.read_csv('data/movie.csv')
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


In [5]:
# Indexing with a *list* of column labels returns a DataFrame holding just
# those columns; the type check and preview below confirm it.
movie_actor_director = movie[
    ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
]
type(movie_actor_director)
movie_actor_director.head(5)

pandas.core.frame.DataFrame

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [6]:
# A one-element list of labels still yields a DataFrame (not a Series).
type(movie[['director_name']])
movie[['director_name']].head()

pandas.core.frame.DataFrame

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


In [7]:
# Keep the wanted column labels in a named list so the selection itself stays
# short, then show the resulting frame (display is truncated by max_rows).
req_cols = ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
movie_actor_director = movie[req_cols]
movie_actor_director

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker
...,...,...,...,...
4911,Eric Mabius,Daphne Zuniga,Crystal Lowe,Scott Smith
4912,Natalie Zea,Valorie Curry,Sam Underwood,
4913,Eva Boehnke,Maxwell Moody,David Chandler,Benjamin Roberds
4914,Alan Ruck,Daniel Henney,Eliza Coupe,Daniel Hsia


In [8]:
# Keep every numeric column (integer and float); a single row is enough to
# inspect the resulting dtypes in the next cell.
movie_numeric_data = movie.select_dtypes(include='number').head(1)

In [9]:
# dtype of each column that survived the numeric selection
movie_numeric_data.dtypes

num_critic_for_reviews       float64
duration                     float64
director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_1_facebook_likes       float64
gross                        float64
num_voted_users                int64
cast_total_facebook_likes      int64
facenumber_in_poster         float64
num_user_for_reviews         float64
budget                       float64
title_year                   float64
actor_2_facebook_likes       float64
imdb_score                   float64
aspect_ratio                 float64
movie_facebook_likes           int64
dtype: object

#### Filtering columns by name

In [10]:
# Keep only the columns whose label contains the substring 'facebook'.
movie_fb_data = movie.filter(like='facebook').head(1)

In [11]:
# dtype of each 'facebook' column
movie_fb_data.dtypes

director_facebook_likes      float64
actor_3_facebook_likes       float64
actor_1_facebook_likes       float64
cast_total_facebook_likes      int64
actor_2_facebook_likes       float64
movie_facebook_likes           int64
dtype: object

In [12]:
# Keep the columns whose label contains at least one digit.
# A raw string is used so that the backslash reaches the regex engine as-is;
# '\d' in a normal string literal is an invalid escape sequence and triggers
# a warning on current Python versions.
movie.filter(regex=r'\d').head(1).dtypes

actor_3_facebook_likes    float64
actor_2_name               object
actor_1_facebook_likes    float64
actor_1_name               object
actor_3_name               object
actor_2_facebook_likes    float64
dtype: object

In [13]:
# All column labels of the full frame, followed by how many there are.
movie.columns
movie.shape[1]

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

28

In [14]:
# Column labels grouped by theme; the groups are concatenated in a later cell
# to define a new column order.
# NOTE(review): 'disc_' / 'cont_' presumably stand for discrete / continuous.
disc_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
disc_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
disc_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_voted_users',
    'num_user_for_reviews',
    'num_critic_for_reviews',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

In [15]:
# Concatenate the themed groups into one flat list: descriptive columns
# first, then the numeric groups.
new_col_order = [
    *disc_core,
    *disc_people,
    *disc_other,
    *cont_fb,
    *cont_finance,
    *cont_num_reviews,
    *cont_other,
]

In [16]:
# Sanity check before reordering: the new order must name exactly the
# existing columns. Set equality alone would not notice a label listed twice
# in new_col_order, so the lengths are compared as well.
len(new_col_order) == len(movie.columns) and set(movie.columns) == set(new_col_order)

True

In [17]:
# Build a reordered copy of the frame; `movie` itself is left untouched.
movie2 = movie.loc[:, new_col_order]
movie2.head(n=3)

Unnamed: 0,movie_title,title_year,content_rating,genres,...,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,...,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,...,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,...,6.8,148.0,2.35,1.0


In [18]:
# The original frame still has its original column order.
movie.head(3)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000


In [19]:
# (number of rows, number of columns)
movie.shape

(4916, 28)

In [20]:
movie.size   # total number of elements (rows * columns)

137648

In [21]:
# number of dimensions of the frame
movie.ndim

2

In [22]:
len(movie)  # total number of rows

4916

In [23]:
# number of non-missing values in each column
movie.count()

color                      4897
director_name              4814
num_critic_for_reviews     4867
duration                   4901
director_facebook_likes    4814
                           ... 
title_year                 4810
actor_2_facebook_likes     4903
imdb_score                 4916
aspect_ratio               4590
movie_facebook_likes       4916
Length: 28, dtype: int64

In [24]:
# Smallest value of each numeric column (missing values are skipped by default).
movie.select_dtypes(include='number').min()

num_critic_for_reviews          1.00
duration                        7.00
director_facebook_likes         0.00
actor_3_facebook_likes          0.00
actor_1_facebook_likes          0.00
gross                         162.00
num_voted_users                 5.00
cast_total_facebook_likes       0.00
facenumber_in_poster            0.00
num_user_for_reviews            1.00
budget                        218.00
title_year                   1916.00
actor_2_facebook_likes          0.00
imdb_score                      1.60
aspect_ratio                    1.18
movie_facebook_likes            0.00
dtype: float64

In [25]:
# Column-wise minimum WITHOUT skipping missing values: a numeric column that
# contains any NaN reports NaN.
# NOTE(review): the recorded output lists only some of the text columns,
# which suggests this pandas version silently dropped columns it could not
# reduce; newer versions may raise instead - confirm before re-running.
movie.min(skipna=False)

num_critic_for_reviews                                                     NaN
duration                                                                   NaN
director_facebook_likes                                                    NaN
actor_3_facebook_likes                                                     NaN
actor_1_facebook_likes                                                     NaN
gross                                                                      NaN
genres                                                                  Action
movie_title                                                            #Horror
num_voted_users                                                              5
cast_total_facebook_likes                                                    0
facenumber_in_poster                                                       NaN
movie_imdb_link              http://www.imdb.com/title/tt0006864/?ref_=fn_t...
num_user_for_reviews                                

In [26]:
# first two rows of the summary statistics (count and mean)
movie.describe().head(2)

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,...,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,...,1621.923516,6.437429,2.222349,7348.294142


### Method chaining in DataFrame

In [27]:
# quick look at the data before chaining methods on it
movie.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0


In [28]:
# Boolean mask for the first five rows: True wherever a value is missing.
movie.head(5).isnull()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,False,False,False,False,...,False,False,False,False
1,False,False,False,False,...,False,False,False,False
2,False,False,False,False,...,False,False,False,False
3,False,False,False,False,...,False,False,False,False
4,True,False,True,True,...,False,False,True,False


In [29]:
# Summing the boolean mask counts the True values,
# i.e. the number of missing values in each column.
movie.isnull().sum()

color                       19
director_name              102
num_critic_for_reviews      49
duration                    15
director_facebook_likes    102
                          ... 
title_year                 106
actor_2_facebook_likes      13
imdb_score                   0
aspect_ratio               326
movie_facebook_likes         0
Length: 28, dtype: int64

In [30]:
# A second sum adds the per-column counts: total missing values in the frame.
movie.isnull().sum().sum()

2654

In [31]:
# After filling with 0, no column has missing values left
# (fillna returns a new frame; `movie` is not modified).
movie.fillna(0).isnull().sum()

color                      0
director_name              0
num_critic_for_reviews     0
duration                   0
director_facebook_likes    0
                          ..
title_year                 0
actor_2_facebook_likes     0
imdb_score                 0
aspect_ratio               0
movie_facebook_likes       0
Length: 28, dtype: int64

In [32]:
# total missing values after filling - expected to be 0
movie.fillna(0).isnull().sum().sum()

0

In [33]:
# True if the frame contains at least one missing value anywhere;
# axis=None reduces over rows and columns in a single step.
movie.isnull().any(axis=None)

True

### Handling of missing values and comparing data frames

In [34]:
# NaN never compares equal to anything, not even to itself.
np.nan == np.nan

False

In [35]:
# np.nan is an ordinary Python float
type(np.nan)

float

In [36]:
# Unlike NaN, None does compare equal to itself.
None == None

True

In [37]:
# NaN and None are different objects and do not compare equal.
np.nan == None

False

In [38]:
# Load the college dataset with the INSTNM column as the row index,
# then list the remaining columns.
college = pd.read_csv('data/college.csv', index_col = 'INSTNM')
college.columns

Index(['CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [39]:
# Keep only the columns whose label contains 'UGDS_'.
college_ugds_ = college.filter(like='UGDS_')

In [40]:
# The comparison is broadcast to every element and yields a boolean frame of
# the same shape.
college_ugds_.eq(0.0019)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,...,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,...,False,False,False,False
Amridge University,False,False,False,False,...,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,...,False,False,False,False
Alabama State University,False,False,False,True,...,False,False,False,False
...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,...,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,...,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,...,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,...,False,False,False,False


In [41]:
# True if at least one element of the frame equals 0.0019 - the same value
# that was compared element-wise in the previous cell (this cell previously
# used 0.019, which looks like a typo).
# Note: exact equality on floats is fragile; np.isclose is safer in general.
(college_ugds_ == 0.0019).any().any()

True

In [42]:
# Compare the frame with itself element by element; the first rows all come
# out True because they contain no missing values.
college_compare = college_ugds_.eq(college_ugds_)
college_compare.head(5)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,...,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,...,True,True,True,True
Amridge University,True,True,True,True,...,True,True,True,True
University of Alabama in Huntsville,True,True,True,True,...,True,True,True,True
Alabama State University,True,True,True,True,...,True,True,True,True


In [43]:
# Every column is False overall: in rows with missing values NaN != NaN,
# so a frame with NaNs is not element-wise equal even to itself.
college_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [44]:
# Comparing against np.nan never matches (nan == nan is False),
# so this cannot be used to count missing values.
(college_ugds_ == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [45]:
# isnull() is the right way to count missing values per column
college_ugds_.isnull().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

In [46]:
# equals() treats NaNs in the same position as equal, so a frame equals itself.
college_ugds_.equals(college_ugds_)

True

In [47]:
# values are stored as fractions
college_ugds_.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,...,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,...,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,...,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,...,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,...,0.0006,0.0098,0.0243,0.0137


In [48]:
# Scale the fractions by 100 to read them as the percentage of each race.
(college_ugds_ * 100).head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.33,93.53,0.55,0.19,...,0.19,0.0,0.59,1.38
University of Alabama at Birmingham,59.22,26.0,2.83,5.18,...,0.07,3.68,1.79,1.0
Amridge University,29.9,41.92,0.69,0.34,...,0.0,0.0,0.0,27.15
University of Alabama in Huntsville,69.88,12.55,3.82,3.76,...,0.02,1.72,3.32,3.5
Alabama State University,1.58,92.08,1.21,0.19,...,0.06,0.98,2.43,1.37


In [49]:
# non-missing values per column (default direction)
college_ugds_.count()

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [50]:
# axis=0 is the default, so this gives the same per-column counts
college_ugds_.count(axis=0)

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [51]:
# number of UGDS_ columns
len(college_ugds_.columns)

9

In [52]:
# non-missing values per row (one result per school)
college_ugds_.count(axis='columns').head()

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [53]:
# row-wise total of the fractions for each school
college_ugds_.sum(axis=1).head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [54]:
# median of each column, computed down the rows
college_ugds_.median(axis='index')

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

In [55]:
# running total across the columns of each row (left to right)
result = college_ugds_.cumsum(axis=1)
result.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,...,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,...,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,...,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,...,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,...,0.9522,0.962,0.9863,1.0


In [56]:
# Load the diversity-index table with the School column as the row index.
college_div = pd.read_csv('data/college_diversity.csv', index_col='School')
college_div.head()

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74


In [57]:
# the nine UGDS_ column labels
college_ugds_.columns

Index(['UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN'],
      dtype='object')

In [63]:
# Rank the schools by how many UGDS_ values they are missing, worst first.
(
    college_ugds_
    .isnull()
    .sum(axis=1)
    .sort_values(ascending=False)
    .head()
)

INSTNM
Excel Learning Center-San Antonio South              9
Western State College of Law at Argosy University    9
Albany Law School                                    9
Albany Medical College                               9
A T Still University of Health Sciences              9
dtype: int64

In [69]:
# Drop only the rows in which ALL values are missing, then confirm that no
# missing values remain in any column.
# Note: this rebinds college_ugds_, so later cells see the filtered frame.
college_ugds_ = college_ugds_.dropna(how='all')
college_ugds_.isnull().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

#### Number of race categories per school that make up 15% or more of undergraduates

In [72]:
# For each school, count how many race columns hold a fraction of at least
# 0.15 (i.e. 15% or more).
diversity_metric = (college_ugds_ >= 0.15).sum(axis=1)
diversity_metric.head(5)

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [73]:
# how many schools fall at each value of the metric
diversity_metric.value_counts()   

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

#### Schools with the most diversity across race

In [74]:
# ten schools with the highest metric
diversity_metric.sort_values(ascending=False).head(10)

INSTNM
Central Texas Beauty College-Temple                               5
Regency Beauty Institute-Austin                                   5
Westwood College-O'Hare Airport                                   4
Regency Beauty Institute-Pasadena                                 4
Soma Institute-The National School of Clinical Massage Therapy    4
The Art Institute of Fort Lauderdale                              4
CUNY Brooklyn College                                             4
Virginia College-Austin                                           4
Ambria College of Nursing                                         4
Fortis College-Phoenix                                            4
dtype: int64

#### Check the actual percentages for the top two schools

In [77]:
# look up the two top-ranked schools by their index labels
college_ugds_.loc[['Central Texas Beauty College-Temple', 'Regency Beauty Institute-Austin']]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,...,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,...,0.0,0.1717,0.0,0.1515
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,...,0.0,0.1733,0.0,0.2667


#### Getting the same metric for the top US News schools

In [78]:
# The five schools listed first in the diversity-index table, spelled the way
# they appear in the INSTNM index so they can be used as .loc labels.
us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]

In [79]:
# Each value is the number of race categories that make up 15% or more of
# the school's undergraduates.
diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64