In [2]:
import pandas as pd

# Series (1D)

In [3]:
s = pd.Series([1,2,3,4,5,6])
s

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [4]:
s.describe()

count    6.000000
mean     3.500000
std      1.870829
min      1.000000
25%      2.250000
50%      3.500000
75%      4.750000
max      6.000000
dtype: float64

## Index
- Main difference between Pandas Series and Numpy Array is index in former.
- In that way, Pandas Series is like a middle-ground for lists and dictionaries.

In [5]:
telephone_codes = pd.Series(['011','022','033','044'], index=['Delhi', 'Mumbai', 'Kolkata', 'Madras'])
telephone_codes

Delhi      011
Mumbai     022
Kolkata    033
Madras     044
dtype: object

In [6]:
telephone_codes.describe()

count       4
unique      4
top       033
freq        1
dtype: object

#### Position VS Index

In [7]:
print("At position 0: ", telephone_codes[0])
print("At position 0: ", telephone_codes.iloc[0]) # iloc

print("At index Delhi: ", telephone_codes['Delhi'])
print("At index Delhi: ", telephone_codes.loc['Delhi']) # loc

At position 0:  011
At position 0:  011
At index Delhi:  011
At index Delhi:  011


#### Default Indices

In [8]:
pd.Series([1.1, 34.2, 99.12, 26.78])

0     1.10
1    34.20
2    99.12
3    26.78
dtype: float64

In [9]:
pd.Series([1.1, 34.2, 99.12, 26.78]).idxmax() # argmax

2

## Vectorized Operations
Cases for v1 v2:
- Both have same indices
- Overlapping indices, but not same
- Different indices

In [10]:
v1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
v2 = pd.Series([10,20,30,40], index=['b','c','a','d'])
print(v1, v2, v1+v2, sep='\n')

a    1
b    2
c    3
d    4
dtype: int64
b    10
c    20
a    30
d    40
dtype: int64
a    31
b    12
c    23
d    44
dtype: int64


In [11]:
v1 = pd.Series([1,2,3,4,5], index=['a','b','c','d'])
v2 = pd.Series([10,20,30,40], index=['a','d','e','f'])
print(v1, v2, v1+v2, sep='\n')

a    1
b    2
c    3
d    4
dtype: int64
c    10
d    20
e    30
f    40
dtype: int64
a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64


#### Fill Value and Drop NaN

In [12]:
print("Drop NaN:", (v1+v2).dropna(), sep='\n')
print("Treat missing values as 0 before adding: ", v1.add(v2, fill_value=0), sep='\n')

Drop NaN:
c    13.0
d    24.0
dtype: float64
Treat missing values as 0 before adding: 
a     1.0
b     2.0
c    13.0
d    24.0
e    30.0
f    40.0
dtype: float64


#### Apply

In [14]:
def abbreviate(country):
    return country.upper()[:2]

abbreviate('India')

'IN'

In [15]:
countries = pd.Series(["India", "Canada", "Australia", "Denmark"])
countries.apply(abbreviate)

0    IN
1    CA
2    AU
3    DE
dtype: object

In [16]:
countries.describe()

count          4
unique         4
top       Canada
freq           1
dtype: object

# Dataframe (2D)

In [30]:
pd.DataFrame({
    'Column 1': ['A1','A2','A3'],
    'Column 2': ['B1','B2','B3'],
    'Column 3': ['C1','C2','C3'],
})

Unnamed: 0,Column 1,Column 2,Column 3
0,A1,B1,C1
1,A2,B2,C2
2,A3,B3,C3


In [31]:
pd.DataFrame({
    'Column 1': ['A1','A2','A3'],
    'Column 2': [1,2,3],
    'Column 3': [True,False,True],
},
    index=['row1', 'row2', 'row3'])

Unnamed: 0,Column 1,Column 2,Column 3
row1,A1,1,True
row2,A2,2,False
row3,A3,3,True


In [28]:
pd.DataFrame({
    'Column 1': ['A1','A2','A3'],
    'Column 2': [1,2,3],
    'Column 3': [True,False,True],
},
    index=['row2', 'row2', 'row3'])

Unnamed: 0,Column 1,Column 2,Column 3
row2,A1,1,True
row2,A2,2,False
row3,A3,3,True


In [25]:
_.loc["row2"]

Unnamed: 0,Column 1,Column 2,Column 3
row2,A1,1,True
row2,A2,2,False


In [32]:
_.mean() # underscore variable usage

Column 1    0.333333
Column 2    2.000000
Column 3    0.666667
dtype: float64

In [45]:
movies_df = pd.read_csv('../Datasets/movie_metadata.csv')
movies_df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [46]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-

In [48]:
movies_df.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4993.0,5028.0,4939.0,5020.0,5036.0,4159.0,5043.0,5043.0,5030.0,5022.0,4551.0,4935.0,5030.0,5043.0,4714.0,5043.0
mean,140.194272,107.201074,686.509212,645.009761,6560.047061,48468410.0,83668.16,9699.063851,1.371173,272.770808,39752620.0,2002.470517,1651.754473,6.442138,2.220403,7525.964505
std,121.601675,25.197441,2813.328607,1665.041728,15020.75912,68452990.0,138485.3,18163.799124,2.013576,377.982886,206114900.0,12.474599,4042.438863,1.125116,1.385113,19320.44511
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,50.0,93.0,7.0,133.0,614.0,5340988.0,8593.5,1411.0,0.0,65.0,6000000.0,1999.0,281.0,5.8,1.85,0.0
50%,110.0,103.0,49.0,371.5,988.0,25517500.0,34359.0,3090.0,1.0,156.0,20000000.0,2005.0,595.0,6.6,2.35,166.0
75%,195.0,118.0,194.5,636.0,11000.0,62309440.0,96309.0,13756.5,2.0,326.0,45000000.0,2011.0,918.0,7.2,2.35,3000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,12215500000.0,2016.0,137000.0,9.5,16.0,349000.0


In [53]:
movies_df['duration'].values

array([178., 169., 148., ...,  76., 100.,  90.])

In [54]:
dataset=movies_df.values
print(dataset.shape)

(5043, 28)


In [56]:
movies_df['movie_title'].values

array(['Avatar\xa0', "Pirates of the Caribbean: At World's End\xa0",
       'Spectre\xa0', ..., 'A Plague So Pleasant\xa0',
       'Shanghai Calling\xa0', 'My Date with Drew\xa0'], dtype=object)

In [41]:
movies_df.iloc[100]['movie_title']

'The Fast and the Furious\xa0'

In [57]:
movies_df.set_index('movie_title',inplace=True)

In [58]:
movies_df

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
John Carter,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
Spider-Man 3,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
Tangled,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
Avengers: Age of Ultron,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
Harry Potter and the Half-Blood Prince,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


In [59]:
movies_df.loc['Toy Story\xa0']

color                                                                    Color
director_name                                                    John Lasseter
num_critic_for_reviews                                                     166
duration                                                                    74
director_facebook_likes                                                    487
actor_3_facebook_likes                                                     802
actor_2_name                                                 John Ratzenberger
actor_1_facebook_likes                                                   15000
gross                                                              1.91796e+08
genres                               Adventure|Animation|Comedy|Family|Fantasy
actor_1_name                                                         Tom Hanks
num_voted_users                                                         623757
cast_total_facebook_likes                           