In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer

In [3]:
# Load the IMDB sample; the first CSV column becomes the row index.
df_imdb = pd.read_csv('04_imdb.csv', index_col=0)

In [4]:
df_imdb.shape

(88, 12)

In [5]:
df_imdb.columns

Index(['color', 'director_name', 'duration', 'gross', 'genres', 'movie_title',
       'title_year', 'country', 'budget', 'imdb_score', 'actors',
       'movie_facebook_likes'],
      dtype='object')

In [6]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 0 to 98
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   color                 88 non-null     object 
 1   director_name         88 non-null     object 
 2   duration              88 non-null     int64  
 3   gross                 88 non-null     float64
 4   genres                88 non-null     object 
 5   movie_title           88 non-null     object 
 6   title_year            88 non-null     int64  
 7   country               88 non-null     object 
 8   budget                88 non-null     float64
 9   imdb_score            88 non-null     float64
 10  actors                88 non-null     object 
 11  movie_facebook_likes  88 non-null     int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 8.9+ KB


In [8]:
df_imdb.color.value_counts()

Color              87
Black and white     1
Name: color, dtype: int64

In [9]:
df_imdb.head(3)

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,actors,movie_facebook_likes
0,Color,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf Of Wall Street,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,Color,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,Color,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000


In [10]:
# Label indexing: replace each distinct `color` string with an integer code.
# In the printed result the single "Black and white" row is 0 and "Color" is 1.
encoder = LabelEncoder()
encoder.fit(df_imdb.color)
encoded_colour = encoder.transform(df_imdb.color)
print(encoded_colour)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [11]:
# Overwrite the string labels in `color` with their integer codes.
df_imdb['color'] = encoded_colour
df_imdb.head(3)

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,actors,movie_facebook_likes
0,1,Martin Scorsese,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf Of Wall Street,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,1,Shane Black,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,1,Quentin Tarantino,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000


In [13]:
df_imdb.director_name.nunique()

60

In [14]:
# Mean (target) encoding, fit step. The target is assumed to be imdb_score.
# General recipe: mean = df.groupby(['subject'])['target'].mean().to_dict()
# where df should contain only the training data.
director_mean_score = df_imdb.groupby('director_name')['imdb_score'].mean()
mean_encoded = director_mean_score.to_dict()
mean_encoded

{'Adam McKay': 6.3,
 'Adam Shankman': 5.9,
 'Angelina Jolie Pitt': 7.2,
 'Anthony Russo': 7.8,
 'Baz Luhrmann': 7.3,
 'Bryan Singer': 7.65,
 'Christopher Nolan': 8.633333333333333,
 'Clint Eastwood': 6.9,
 'Darren Aronofsky': 5.8,
 'David Ayer': 7.6,
 'David Dobkin': 7.4,
 'David Fincher': 8.1,
 'Denis Villeneuve': 8.1,
 'Derek Cianfrance': 7.3,
 'Edward Hall': 7.2,
 'F. Gary Gray': 7.9,
 'Francis Lawrence': 7.6,
 'Gary Ross': 6.7,
 "Gavin O'Connor": 8.2,
 'Gnana Rajasekaran': 7.0,
 'Gore Verbinski': 6.5,
 'Guillaume Canet': 6.5,
 'James Mangold': 6.7,
 'James Wan': 7.2,
 'Jay Oliva': 8.4,
 'Joss Whedon': 8.1,
 'Justin Chadwick': 7.1,
 'Kathryn Bigelow': 7.4,
 'Kenneth Lonergan': 6.5,
 'Marc Webb': 6.85,
 'Martin Scorsese': 8.149999999999999,
 'Michael Bay': 6.466666666666666,
 'Michael Patrick King': 4.3,
 'Mike Leigh': 6.8,
 'No info': 6.972727272727273,
 'Oliver Stone': 6.5,
 'Paul Greengrass': 7.9,
 'Paul Thomas Anderson': 6.9,
 'Peter Jackson': 7.766666666666667,
 'Quentin Taranti

In [15]:
# Mean encoding, transform step: df['subject'] = df['subject'].map(mean)
# Apply it once to the training data and once more to the test data.
director_scores = df_imdb['director_name'].map(mean_encoded)
df_imdb['director_name'] = director_scores
df_imdb.head(5)

Unnamed: 0,color,director_name,duration,gross,genres,movie_title,title_year,country,budget,imdb_score,actors,movie_facebook_likes
0,1,8.15,240,116866727.0,Biography|Comedy|Crime|Drama,The Wolf Of Wall Street,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000
1,1,7.2,195,408992272.0,Action|Adventure|Sci-Fi,Iron Man 3,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000
2,1,8.2,187,54116191.0,Crime|Drama|Mystery|Thriller|Western,The Hateful Eight,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000
3,1,6.5,186,46495.0,Drama,Margaret,2011,United States,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0
4,1,7.766667,186,258355354.0,Adventure|Fantasy,The Hobbit: The Desolation Of Smaug,2013,United States,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000


In [16]:
df_imdb['genres'].value_counts()

Action|Adventure|Sci-Fi                     7
Drama                                       5
Crime|Drama|Thriller                        4
Biography|Drama|History                     4
Adventure|Fantasy                           3
Drama|Romance                               3
Crime|Drama|Mystery|Thriller                3
Action|Adventure|Thriller                   3
Action|Adventure|Fantasy                    2
Biography|Drama|Sport|War                   2
Action|Adventure|Fantasy|Sci-Fi             2
Action|Adventure|Sci-Fi|Thriller            2
Action|Adventure|Drama                      2
Adventure|Drama|History                     2
Crime|Drama                                 2
Adventure|Drama|Sci-Fi                      2
Drama|History|Thriller                      2
Adventure|Sci-Fi                            1
Comedy|Drama                                1
Action|Drama|War                            1
Action|Crime|Thriller                       1
Biography|Drama|Thriller          

In [17]:
df_imdb.genres.str.split("|").explode()

0     Biography
0        Comedy
0         Crime
0         Drama
1        Action
        ...    
97          War
98    Biography
98        Drama
98        Music
98      Musical
Name: genres, Length: 262, dtype: object

In [18]:
df_imdb.genres.str.split("|").explode().value_counts()

Drama        59
Adventure    33
Action       28
Thriller     27
Sci-Fi       19
Crime        16
Biography    15
History      13
Fantasy       9
War           7
Comedy        7
Romance       7
Mystery       6
Western       5
Musical       3
Music         3
Sport         3
Animation     1
Horror        1
Name: genres, dtype: int64

In [19]:
df_imdb.genres.str.split("|").explode().nunique()

19

In [20]:
df_imdb.genres.str.split("|").to_list()

[['Biography', 'Comedy', 'Crime', 'Drama'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Crime', 'Drama', 'Mystery', 'Thriller', 'Western'],
 ['Drama'],
 ['Adventure', 'Fantasy'],
 ['Adventure', 'Fantasy'],
 ['Drama', 'Romance'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Drama', 'Sci-Fi'],
 ['Crime', 'Drama', 'Mystery', 'Thriller'],
 ['Adventure', 'Drama', 'Sci-Fi'],
 ['Biography', 'Crime', 'Drama', 'History', 'Music'],
 ['Drama'],
 ['Drama', 'Western'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Action', 'Thriller'],
 ['Adventure', 'Fantasy'],
 ['Drama', 'Musical', 'Romance'],
 ['Drama', 'History', 'Thriller'],
 ['Action', 'Adventure', 'Drama', 'History'],
 ['Adventure', 'Drama', 'Thriller', 'Western'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Crime', 'Drama', 'Mystery', 'Thriller'],
 ['Biography', 'Drama', 'History'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Adventure', 'Drama', 'Sci-Fi'],
 ['Action', 'Adventure', 'Drama'],
 ['Biography', 'Drama', 'History', 'War'],
 ['Biography', 'Drama', 'History'

In [21]:
# Each film carries several genres, so binarize the per-film genre lists
# into one 0/1 indicator column per genre.
genre_lists = df_imdb.genres.str.split('|').to_list()
encoder = MultiLabelBinarizer()
encoder.fit_transform(genre_lists)

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
encoder.classes_

array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
       'Drama', 'Fantasy', 'History', 'Horror', 'Music', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War',
       'Western'], dtype=object)

In [23]:
# Wrap the genre indicator matrix in a DataFrame labelled by encoder.classes_.
genre_matrix = encoder.fit_transform(df_imdb.genres.str.split('|').to_list())
df = pd.DataFrame(genre_matrix, columns=encoder.classes_)
df.head(5)

Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,Fantasy,History,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,1
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# df_imdb's index has gaps (88 entries labelled 0 to 98) while the genre frame is
# numbered 0..n-1, so renumber the rows before the index-aligned join.
# The raw `genres` string column is dropped once its indicators are attached.
df_imdb = (
    df_imdb
    .reset_index(drop=True)
    .join(df)
    .drop(columns='genres')
)
df_imdb.head()

Unnamed: 0,color,director_name,duration,gross,movie_title,title_year,country,budget,imdb_score,actors,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,1,8.15,240,116866727.0,The Wolf Of Wall Street,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",...,0,0,0,0,0,0,0,0,0,0
1,1,7.2,195,408992272.0,Iron Man 3,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",...,0,0,0,0,0,1,0,0,0,0
2,1,8.2,187,54116191.0,The Hateful Eight,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",...,0,0,0,1,0,0,0,1,0,1
3,1,6.5,186,46495.0,Margaret,2011,United States,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",...,0,0,0,0,0,0,0,0,0,0
4,1,7.766667,186,258355354.0,The Hobbit: The Desolation Of Smaug,2013,United States,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",...,0,0,0,0,0,0,0,0,0,0


In [25]:
df_imdb.movie_title.nunique()

86

In [28]:
# Rows whose movie_title already appeared in an earlier row.
df_imdb[df_imdb.movie_title.duplicated()]

Unnamed: 0,color,director_name,duration,gross,movie_title,title_year,country,budget,imdb_score,actors,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
54,1,6.05,141,106160295.0,Ben-Hur,2016,United States,100000000.0,6.0,"Morgan Freeman,Ayelet Zurer,Moises Arias",...,0,0,0,0,0,0,0,0,0,0
75,1,7.2,137,115603980.0,Unbroken,2014,United States,65000000.0,7.2,"Finn Wittrock,Jack O'Connell,Alex Russell",...,0,0,0,0,0,0,1,0,1,0


In [29]:
df_imdb[df_imdb.movie_title.isin(['Ben-Hur', 'Unbroken'])]

Unnamed: 0,color,director_name,duration,gross,movie_title,title_year,country,budget,imdb_score,actors,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
53,1,6.05,141,106160295.0,Ben-Hur,2016,United States,100000000.0,6.1,"Morgan Freeman,Ayelet Zurer,Moises Arias",...,0,0,0,0,0,0,0,0,0,0
54,1,6.05,141,106160295.0,Ben-Hur,2016,United States,100000000.0,6.0,"Morgan Freeman,Ayelet Zurer,Moises Arias",...,0,0,0,0,0,0,0,0,0,0
74,1,7.2,137,115603980.0,Unbroken,2014,United States,65000000.0,7.2,"Finn Wittrock,Jack O'Connell,Alex Russell",...,0,0,0,0,0,0,1,0,1,0
75,1,7.2,137,115603980.0,Unbroken,2014,United States,65000000.0,7.2,"Finn Wittrock,Jack O'Connell,Alex Russell",...,0,0,0,0,0,0,1,0,1,0


In [33]:
# Keep only the first row for each movie_title; df_imdb is modified in place.
# NOTE(review): the two Ben-Hur rows shown above differ in imdb_score (6.1 vs 6.0),
# so keep='first' retains the 6.1 row — confirm that is the intended one.
df_imdb.drop_duplicates(subset=['movie_title'], keep = 'first', inplace=True)

In [34]:
# Remove the movie_title column from df_imdb in place (axis=1 selects columns).
df_imdb.drop('movie_title', axis=1, inplace=True)

In [35]:
df_imdb.title_year.value_counts()

2014    20
2012    19
2013    17
2011    10
2015     8
2010     7
2016     5
Name: title_year, dtype: int64

In [36]:
# Treat the release year as an ordered categorical; the categories are inferred
# from the values present in the column.
ordered_year_dtype = pd.CategoricalDtype(ordered=True)
df_imdb['title_year'] = df_imdb['title_year'].astype(ordered_year_dtype)
df_imdb['title_year'].dtype

CategoricalDtype(categories=[2010, 2011, 2012, 2013, 2014, 2015, 2016], ordered=True)

In [37]:
df_imdb.title_year

0     2013
1     2013
2     2015
3     2011
4     2013
      ... 
83    2013
84    2010
85    2013
86    2014
87    2014
Name: title_year, Length: 86, dtype: category
Categories (7, int64): [2010 < 2011 < 2012 < 2013 < 2014 < 2015 < 2016]

In [38]:
df_imdb.country.nunique()

9

In [43]:
# Turn the country column into an (n_rows, 1) array, the 2-D layout handed to
# the one-hot encoder in the next cell; -1 lets NumPy infer the row count.
country_values = np.array(df_imdb.country)
reshaped_country = country_values.reshape(-1, 1)
reshaped_country

array([['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United Kingdom'],
       ['United States'],
       ['Germany'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['New Zealand'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['India'],
       ['United States'],
       ['United States'],
       ['United Kingdom'],
       ['United States'],
       ['United Kingdom'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United States'],
       ['United Kingdom'],
       ['United States'],
       ['United States'],
       ['United States']

In [44]:
# One-hot encode the country column and return a dense array.
# scikit-learn 1.2 renamed the dense-output keyword to `sparse_output`, and the old
# `sparse` keyword was removed in 1.4. Try the new name first and fall back to the
# old one only when the installed version rejects it, so the cell runs on both.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit_transform(reshaped_country)

array([[0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0.

In [45]:
# Build the country indicator frame. encoder.categories_ is wrapped in a list and
# passed as the column labels, which yields one-level MultiIndex columns (visible
# in the df.columns output below); they are flattened in a later cell.
country_onehot = encoder.fit_transform(reshaped_country)
df = pd.DataFrame(country_onehot, columns=list(encoder.categories_))
df.head(5)

Unnamed: 0,Australia,Canada,France,Germany,India,Kyrgyzstan,New Zealand,United Kingdom,United States
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [46]:
df.columns

MultiIndex([(     'Australia',),
            (        'Canada',),
            (        'France',),
            (       'Germany',),
            (         'India',),
            (    'Kyrgyzstan',),
            (   'New Zealand',),
            ('United Kingdom',),
            ( 'United States',)],
           )

In [47]:
# Flatten the one-level MultiIndex columns into plain country-name labels.
df.columns = list(df.columns.get_level_values(0))
df.columns

Index(['Australia', 'Canada', 'France', 'Germany', 'India', 'Kyrgyzstan',
       'New Zealand', 'United Kingdom', 'United States'],
      dtype='object')

In [48]:
# Dropping duplicate titles left gaps in df_imdb's index, so renumber the rows
# before the index-aligned join with the country indicator frame.
# NOTE(review): the original `country` column is still present after this join.
df_imdb = df_imdb.reset_index(drop=True)
df_imdb = df_imdb.join(df)
df_imdb.head()

Unnamed: 0,color,director_name,duration,gross,title_year,country,budget,imdb_score,actors,movie_facebook_likes,...,Western,Australia,Canada,France,Germany,India,Kyrgyzstan,New Zealand,United Kingdom,United States
0,1,8.15,240,116866727.0,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,7.2,195,408992272.0,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,8.2,187,54116191.0,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000,...,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,6.5,186,46495.0,2011,United States,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,7.766667,186,258355354.0,2013,United States,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Another way to one-hot encode a column is pd.get_dummies:
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

In [49]:
df_imdb.actors.value_counts()

Aidan Turner,Adam Brown,James Nesbitt                3
Leonardo DiCaprio,Matthew McConaughey,Jon Favreau    1
Demián Bichir,Shea Whigham,Gary Stretch              1
Brad Pitt,Tye Sheridan,Fiona Shaw                    1
Tika Sumpter,Josh Hopkins,Aunjanue Ellis             1
                                                    ..
Johnny Depp,Ruth Wilson,Tom Wilkinson                1
Lesley Manville,Ruth Sheen,Karl Johnson              1
Joseph Gordon-Levitt,Hal Holbrook,Bruce McGill       1
Christian Bale,María Valverde,Ben Mendelsohn         1
Johnny Cannizzaro,Steve Schirripa,Scott Vance        1
Name: actors, Length: 84, dtype: int64

In [50]:
df_imdb.actors.str.split(",").explode().nunique()

199

In [51]:
# Step 1 of 2: break the comma-separated `actors` string into one column per actor
# (actor 1, actor 2, actor 3).
# NOTE(review): n=3 allows up to three splits, i.e. four pieces; the output below has
# three columns, so no row in this data produces a fourth piece.
df = df_imdb.actors.str.split(pat=',', n=3, expand=True)
df.head()

Unnamed: 0,0,1,2
0,Leonardo DiCaprio,Matthew McConaughey,Jon Favreau
1,Robert Downey Jr.,Jon Favreau,Don Cheadle
2,Craig Stark,Jennifer Jason Leigh,Zoë Bell
3,Matt Damon,Kieran Culkin,John Gallagher Jr.
4,Aidan Turner,Adam Brown,James Nesbitt


In [52]:
# Copy the split columns 0, 1, 2 into df_imdb as actor_1, actor_2, actor_3.
for position in range(3):
    df_imdb[f'actor_{position + 1}'] = df[position]
df_imdb.head()

Unnamed: 0,color,director_name,duration,gross,title_year,country,budget,imdb_score,actors,movie_facebook_likes,...,France,Germany,India,Kyrgyzstan,New Zealand,United Kingdom,United States,actor_1,actor_2,actor_3
0,1,8.15,240,116866727.0,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Leonardo DiCaprio,Matthew McConaughey,Jon Favreau
1,1,7.2,195,408992272.0,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Robert Downey Jr.,Jon Favreau,Don Cheadle
2,1,8.2,187,54116191.0,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Craig Stark,Jennifer Jason Leigh,Zoë Bell
3,1,6.5,186,46495.0,2011,United States,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Matt Damon,Kieran Culkin,John Gallagher Jr.
4,1,7.766667,186,258355354.0,2013,United States,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Aidan Turner,Adam Brown,James Nesbitt


In [53]:
# Index actors: fit a single LabelEncoder on every distinct actor name so that
# all three actor columns share the same name-to-integer mapping.
all_actor_names = df_imdb.actors.str.split(',').explode().unique()
encoder = LabelEncoder()
encoder.fit(all_actor_names)

LabelEncoder()

In [59]:
# Look up which actor name was assigned code 0.
# The bare `encoder.classes_` expression that used to precede this call was removed:
# it was not the cell's last expression, so its value was never displayed.
encoder.inverse_transform([0])

array(['Adam Brown'], dtype=object)

In [54]:
encoder.transform(df_imdb.actor_1)

array([114, 157,  42, 131,   1,   1, 161,  35, 189, 159, 132,   4,  54,
       114,  23, 190,   1,  74,  83, 126, 114,  64,  74, 125,  56, 131,
        39,  98, 115,  96, 148,  85, 134, 114, 129,  40, 157,  37,  56,
        85,  86, 188, 139, 137,  85,  69,  73,   3, 114, 156, 189,  56,
        85, 142, 186,  44, 157,  78,  81, 164, 190, 132, 187,  24,  85,
        45,  10, 114, 135,  74, 114, 111,  85,  59, 117, 167,  96,  79,
        62,  52,  87, 153, 126, 189,  24,  95])

In [55]:
# Replace the actor names in each actor column with their integer codes from the
# shared encoder.
for actor_column in ('actor_1', 'actor_2', 'actor_3'):
    df_imdb[actor_column] = encoder.transform(df_imdb[actor_column])
df_imdb.head()

Unnamed: 0,color,director_name,duration,gross,title_year,country,budget,imdb_score,actors,movie_facebook_likes,...,France,Germany,India,Kyrgyzstan,New Zealand,United Kingdom,United States,actor_1,actor_2,actor_3
0,1,8.15,240,116866727.0,2013,United States,100000000.0,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,114,132,97
1,1,7.2,195,408992272.0,2013,United States,200000000.0,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,157,97,46
2,1,8.2,187,54116191.0,2015,United States,44000000.0,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42,84,198
3,1,6.5,186,46495.0,2011,United States,14000000.0,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,131,109,94
4,1,7.766667,186,258355354.0,2013,United States,225000000.0,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,80


In [None]:
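# Remember to drop the original `actors` column once actor_1..actor_3 are encoded.

# Mean (target) encoding could also be used for actors, but each actor's mean would
# also capture the contribution of the other two actors in the same film.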

In [56]:
df_imdb

Unnamed: 0,color,director_name,duration,gross,title_year,country,budget,imdb_score,actors,movie_facebook_likes,...,France,Germany,India,Kyrgyzstan,New Zealand,United Kingdom,United States,actor_1,actor_2,actor_3
0,1,8.150000,240,116866727.0,2013,United States,1.000000e+08,8.2,"Leonardo DiCaprio,Matthew McConaughey,Jon Favreau",138000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,114,132,97
1,1,7.200000,195,408992272.0,2013,United States,2.000000e+08,7.2,"Robert Downey Jr.,Jon Favreau,Don Cheadle",95000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,157,97,46
2,1,8.200000,187,54116191.0,2015,United States,4.400000e+07,7.9,"Craig Stark,Jennifer Jason Leigh,Zoë Bell",114000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,42,84,198
3,1,6.500000,186,46495.0,2011,United States,1.400000e+07,6.5,"Matt Damon,Kieran Culkin,John Gallagher Jr.",0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,131,109,94
4,1,7.766667,186,258355354.0,2013,United States,2.250000e+08,7.9,"Aidan Turner,Adam Brown,James Nesbitt",83000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,1,8.100000,134,56667870.0,2013,United States,2.000000e+07,8.1,"Quvenzhané Wallis,Scoot McNairy,Taran Killam",83000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,153,168,185
82,1,7.300000,134,7501404.0,2010,Canada,1.030833e+08,7.3,"Mark Addy,Atom Egoyan,Paul Gross",0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,126,13,149
83,1,7.900000,134,107100855.0,2013,United States,5.500000e+07,7.9,"Tom Hanks,Chris Mulkey,Michael Chernus",65000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,189,36,133
84,1,7.600000,134,85707116.0,2014,United States,6.800000e+07,7.6,"Brad Pitt,Logan Lerman,Jim Parrack",82000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,24,121,90
