In [1]:
import pandas as pd
import numpy as np

### Data Series

In [2]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])

In [3]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [4]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
data[0:2]

0    0.25
1    0.50
dtype: float64

In [7]:
# different index values
data_abcd = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [8]:
data_abcd, data_abcd['b']

(a    0.25
 b    0.50
 c    0.75
 d    1.00
 dtype: float64,
 0.5)

In [9]:
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 19552860
}

In [10]:
population = pd.Series(population_dict)

In [11]:
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      19552860
dtype: int64

In [12]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}

In [13]:
area = pd.Series(area_dict)

In [14]:
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [15]:
states = pd.DataFrame({
    'Population': population,
    'area': area
})

In [16]:
states

Unnamed: 0,Population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,19552860,149995


In [17]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [18]:
states.columns

Index(['Population', 'area'], dtype='object')

In [19]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

### Data Indexing and Selection

####  Analyzing a Movie Dataset (Indexing and Selection)

In [20]:
# importing datasets
movies = pd.read_csv("./data/movies.csv")
links = pd.read_csv("./data/links.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")

In [21]:
years = movies['title'].str.extract(r'\((\d{4})\)')
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
user_votes = ratings.groupby('movieId')['userId'].count().reset_index(name='votes')

In [22]:
data = pd.DataFrame({
    'movie_id': movies['movieId'],
    'title': movies['title'].str.replace(r'\((\d{4})\)', '', regex=True),
    'year': years[0],
    'genre': movies['genres'],
    'rating': avg_ratings.set_index('movieId')['rating'],
    'votes': user_votes.set_index('movieId')['votes']
})

In [23]:
# convert year to numeric and handle any errors
data['year'] = pd.to_numeric(data['year'], errors='coerce')

In [24]:
data = data.reset_index(drop=True)

In [25]:
data.head()

Unnamed: 0,movie_id,title,year,genre,rating,votes
0,1.0,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,,
1,2.0,Jumanji,1995.0,Adventure|Children|Fantasy,3.92093,215.0
2,3.0,Grumpier Old Men,1995.0,Comedy|Romance,3.431818,110.0
3,4.0,Waiting to Exhale,1995.0,Comedy|Drama|Romance,3.259615,52.0
4,5.0,Father of the Bride Part II,1995.0,Comedy,2.357143,7.0


In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14076 entries, 0 to 14075
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   movie_id  9742 non-null   float64
 1   title     9742 non-null   object 
 2   year      9729 non-null   float64
 3   genre     9742 non-null   object 
 4   rating    9724 non-null   float64
 5   votes     9724 non-null   float64
dtypes: float64(4), object(2)
memory usage: 659.9+ KB


In [27]:
# the first 5 movies
data.head()

Unnamed: 0,movie_id,title,year,genre,rating,votes
0,1.0,Toy Story,1995.0,Adventure|Animation|Children|Comedy|Fantasy,,
1,2.0,Jumanji,1995.0,Adventure|Children|Fantasy,3.92093,215.0
2,3.0,Grumpier Old Men,1995.0,Comedy|Romance,3.431818,110.0
3,4.0,Waiting to Exhale,1995.0,Comedy|Drama|Romance,3.259615,52.0
4,5.0,Father of the Bride Part II,1995.0,Comedy,2.357143,7.0


In [28]:
# the last 3 movies
data[-3:]

Unnamed: 0,movie_id,title,year,genre,rating,votes
14073,,,,,3.5,1.0
14074,,,,,3.5,1.0
14075,,,,,4.0,1.0


In [29]:
# select every other movie starting from the 2nd one
data[1::2]

Unnamed: 0,movie_id,title,year,genre,rating,votes
1,2.0,Jumanji,1995.0,Adventure|Children|Fantasy,3.920930,215.0
3,4.0,Waiting to Exhale,1995.0,Comedy|Drama|Romance,3.259615,52.0
5,6.0,Heat,1995.0,Action|Crime|Thriller,3.071429,49.0
7,8.0,Tom and Huck,1995.0,Adventure|Children,3.185185,54.0
9,10.0,GoldenEye,1995.0,Action|Adventure|Thriller,3.125000,16.0
...,...,...,...,...,...,...
14067,,,,,3.000000,1.0
14069,,,,,4.000000,1.0
14071,,,,,4.000000,1.0
14073,,,,,3.500000,1.0


In [30]:
# select all information for the movie with 'movie_id' 100
data[data.movie_id == 100]

Unnamed: 0,movie_id,title,year,genre,rating,votes
88,100.0,City Hall,1996.0,Drama|Thriller,3.15625,16.0


In [31]:
# select the title and rating for movies with 'movie_id' 100, 200, and 300
data[data.movie_id.isin([100, 200, 300])][['title', 'rating']]

Unnamed: 0,title,rating
88,City Hall,3.15625
260,Quiz Show,4.231076


In [32]:
#  select all movies with a rating higher than 4.0
data[data.rating > 4.0]

Unnamed: 0,movie_id,title,year,genre,rating,votes
28,29.0,"City of Lost Children, The (Cité des enfants p...",1995.0,Adventure|Drama|Fantasy|Mystery|Sci-Fi,4.227273,11.0
29,30.0,Shanghai Triad (Yao a yao yao dao waipo qiao),1995.0,Crime|Drama,4.013158,38.0
40,44.0,Mortal Kombat,1995.0,Action|Adventure|Fantasy,4.250000,2.0
50,55.0,Georgia,1995.0,Drama,4.237745,204.0
53,60.0,"Indian in the Cupboard, The",1995.0,Adventure|Children|Fantasy,5.000000,2.0
...,...,...,...,...,...,...
14045,,,,,5.000000,1.0
14046,,,,,4.500000,1.0
14049,,,,,4.500000,1.0
14051,,,,,4.500000,1.0


In [55]:
# select all movies from the year 2000 or later
data[data.year > 2000]

Unnamed: 0,movie_id,title,year,genre,rating,votes
3028,4052.0,Antitrust,2001.0,Crime|Drama|Thriller,2.500000,2.0
3029,4053.0,Double Take,2001.0,Action|Comedy,3.250000,6.0
3030,4054.0,Save the Last Dance,2001.0,Drama|Romance,4.230769,13.0
3032,4056.0,"Pledge, The",2001.0,Crime|Drama|Mystery|Thriller,3.666667,6.0
3040,4068.0,Sugar & Spice,2001.0,Comedy,3.222222,9.0
...,...,...,...,...,...,...
9736,193579.0,Jon Stewart Has Left the Building,2015.0,Documentary,,
9737,193581.0,Black Butler: Book of the Atlantic,2017.0,Action|Animation|Comedy|Fantasy,,
9738,193583.0,No Game No Life: Zero,2017.0,Animation|Comedy|Fantasy,,
9739,193585.0,Flint,2017.0,Drama,,


In [56]:
data.genre.isnull().sum()

4334

In [57]:
# select all movies that have 'Action' in their genre
action_movies = data[data.genre.fillna('').str.contains('Action', case=False, na=False)]

In [58]:
action_movies

Unnamed: 0,movie_id,title,year,genre,rating,votes
5,6.0,Heat,1995.0,Action|Crime|Thriller,3.071429,49.0
8,9.0,Sudden Death,1995.0,Action,2.875000,8.0
9,10.0,GoldenEye,1995.0,Action|Adventure|Thriller,3.125000,16.0
14,15.0,Cutthroat Island,1995.0,Action|Adventure|Romance,3.833333,18.0
19,20.0,Money Train,1995.0,Action|Comedy|Crime|Drama|Thriller,2.727273,88.0
...,...,...,...,...,...,...
9722,189547.0,Iron Soldier,2010.0,Action|Sci-Fi,,
9731,191005.0,Gintama,2017.0,Action|Adventure|Comedy|Sci-Fi,,
9732,193565.0,Gintama: The Movie,2010.0,Action|Animation|Comedy|Sci-Fi,,
9737,193581.0,Black Butler: Book of the Atlantic,2017.0,Action|Animation|Comedy|Fantasy,,


In [76]:
# Select the 'title' and 'year' of the top 5 highest-rated movies
data.sort_values('rating', ascending=False).head(5)[['title', 'year']]

Unnamed: 0,title,year
11990,,
12441,,
13417,,
13428,,
13430,,
