In [1]:
import pandas as pd

In [2]:
# Read the two IMDB CSV exports from the current working directory.
# low_memory=False parses the movies file in one pass rather than in chunks,
# which avoids mixed-dtype inference within a column.
df_movies = pd.read_csv(
    filepath_or_buffer='IMDB movies.csv',
    low_memory=False,
)
df_ratings = pd.read_csv(filepath_or_buffer='IMDB ratings.csv')

In [3]:
# Inspect the column names of the ratings table.
# To inspect the movies table instead, run: df_movies.columns
df_ratings.columns

Index(['imdb_title_id', 'weighted_average_vote', 'total_votes', 'mean_vote',
       'median_vote', 'votes_10', 'votes_9', 'votes_8', 'votes_7', 'votes_6',
       'votes_5', 'votes_4', 'votes_3', 'votes_2', 'votes_1',
       'allgenders_0age_avg_vote', 'allgenders_0age_votes',
       'allgenders_18age_avg_vote', 'allgenders_18age_votes',
       'allgenders_30age_avg_vote', 'allgenders_30age_votes',
       'allgenders_45age_avg_vote', 'allgenders_45age_votes',
       'males_allages_avg_vote', 'males_allages_votes', 'males_0age_avg_vote',
       'males_0age_votes', 'males_18age_avg_vote', 'males_18age_votes',
       'males_30age_avg_vote', 'males_30age_votes', 'males_45age_avg_vote',
       'males_45age_votes', 'females_allages_avg_vote',
       'females_allages_votes', 'females_0age_avg_vote', 'females_0age_votes',
       'females_18age_avg_vote', 'females_18age_votes',
       'females_30age_avg_vote', 'females_30age_votes',
       'females_45age_avg_vote', 'females_45age_votes',
       

In [4]:
# Trim each frame down to a handful of columns; both keep 'imdb_title_id',
# the identifier the two tables have in common.
movie_cols = ['imdb_title_id', 'title', 'year', 'genre', 'country']
rating_cols = ['imdb_title_id', 'total_votes', 'mean_vote']

df_movies = df_movies[movie_cols]
df_ratings = df_ratings[rating_cols]

In [5]:
# Display the trimmed ratings table (pandas renders only the first and last rows).
# To view the movies table instead, run: df_movies
df_ratings

Unnamed: 0,imdb_title_id,total_votes,mean_vote
0,tt0000009,154,5.9
1,tt0000574,589,6.3
2,tt0001892,188,6.0
3,tt0002101,446,5.3
4,tt0002130,2237,6.9
...,...,...,...
85850,tt9908390,398,5.5
85851,tt9911196,724,7.9
85852,tt9911774,265,7.8
85853,tt9914286,194,9.4


# 1 Concatenate - concat()

### 1.1 Concatenate vertically

In [6]:
# Two toy frames with the same columns ('id', 'age'), used to demonstrate
# row-wise concatenation.
# NOTE(review): 'F' appears twice among df2's ids -- confirm this is intentional.
df1 = pd.DataFrame({'id': list('ABCD'), 'age': [30, 23, 25, 22]})
df2 = pd.DataFrame({'id': list('EFGF'), 'age': [40, 21, 19, 24]})

In [7]:
# Display df2 to check its contents before concatenating.
df2

Unnamed: 0,id,age
0,E,40
1,F,21
2,G,19
3,F,24


In [8]:
# Stack df2 underneath df1 (concatenate along the row axis).
# ignore_index=True drops both original indexes and renumbers the rows 0..n-1.
pd.concat([df1, df2], axis='index', ignore_index=True)

Unnamed: 0,id,age
0,A,30
1,B,23
2,C,25
3,D,22
4,E,40
5,F,21
6,G,19
7,F,24


#### 1.1.1 Exercise

In [9]:
# Draw a random 50% sample of the rows of the original dataframe (df_movies).
# random_state=22 seeds the sampler so the same rows are drawn on every run.
df_sample = df_movies.sample(frac=0.5, random_state=22)

In [10]:
# Shapes of the two dataframes we are about to concatenate (sample first, then full set).
print(df_sample.shape, df_movies.shape, sep='\n')

(42928, 5)
(85855, 5)


In [11]:
# Append the sampled rows below the full movies table (row-wise concatenation).
# Without ignore_index the sampled rows keep their original index labels,
# so those labels occur twice in the result.
df_concat_v = pd.concat([df_movies, df_sample], axis='index')

In [12]:
# Shape check: rows should equal len(df_movies) + len(df_sample)
# (85855 + 42928 = 128783); the column count stays at 5.
df_concat_v.shape

(128783, 5)

In [13]:
# Display the vertically concatenated frame; the tail rows carry the sample's original index labels.
df_concat_v

Unnamed: 0,imdb_title_id,title,year,genre,country
0,tt0000009,Miss Jerry,1894,Romance,USA
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia
2,tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark"
3,tt0002101,Cleopatra,1912,"Drama, History",USA
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy
...,...,...,...,...,...
65827,tt2261891,Nina,2012,Drama,Italy
12514,tt0060076,Agente Jo Walker operazione Estremo Oriente,1966,Adventure,"Austria, Italy, Yugoslavia, Singapore"
12245,tt0059274,La collina del disonore,1965,"Drama, War",UK
50034,tt0805613,Shoppen,2006,"Comedy, Drama, Romance",Germany


### 1.2 Concatenate Horizontally 

In [14]:
# Two toy frames with the same default row index but different columns,
# used to demonstrate column-wise concatenation.
# NOTE(review): 'Statistican' looks like a typo for 'Statistician' -- left as-is
# because it is data that also appears in the recorded output.
df1 = pd.DataFrame({'id': list('ABCD'), 'age': [30, 23, 25, 22]})
job_titles = ['Doctor', 'Statistican', 'Accountant', 'Developer']
df2 = pd.DataFrame({'job': job_titles})


In [15]:
# Concatenate horizontally: place df2's columns to the right of df1's,
# matching rows by index label.
pd.concat([df1, df2], axis='columns')

Unnamed: 0,id,age,job
0,A,30,Doctor
1,B,23,Statistican
2,C,25,Accountant
3,D,22,Developer


#### 1.2.1 Exercise

In [19]:
# Shapes of the two dataframes we are about to concatenate (movies first, then ratings).
print(df_movies.shape, df_ratings.shape, sep='\n')

(85855, 5)
(85855, 3)


In [28]:
# Index BOTH frames by 'imdb_title_id'. pd.concat(..., axis=1) aligns rows on the
# index, so the frames must share the same index before they are joined side by
# side; leaving df_movies on its default integer index would make the concat
# outer-join two unrelated indexes and fill the result with NaN.
#
# Each frame is re-indexed only while 'imdb_title_id' is still a column, so the
# cell is safe to re-run (a second set_index on the same frame raises KeyError).
# Reassignment is used instead of inplace=True to avoid mutating a frame that an
# earlier cell already displayed.
if 'imdb_title_id' in df_movies.columns:
    df_movies = df_movies.set_index('imdb_title_id')
if 'imdb_title_id' in df_ratings.columns:
    df_ratings = df_ratings.set_index('imdb_title_id')

In [32]:
# Join df_movies and df_ratings side by side (column-wise concatenation).
# pd.concat aligns rows on the index, so both frames must be indexed by
# 'imdb_title_id' for each movie to line up with its ratings.
df_concat_h = pd.concat([df_movies, df_ratings], axis='columns')

In [33]:
# Shape check: still 85855 rows, and 4 + 2 = 6 columns
# ('imdb_title_id' is the index here, so it is not counted as a column).
print(df_concat_h.shape)

(85855, 6)


In [34]:
# Display the horizontally concatenated frame, indexed by 'imdb_title_id'.
df_concat_h

Unnamed: 0_level_0,title,year,genre,country,total_votes,mean_vote
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0000009,Miss Jerry,1894,Romance,USA,154,5.9
tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",Australia,589,6.3
tt0001892,Den sorte drøm,1911,Drama,"Germany, Denmark",188,6.0
tt0002101,Cleopatra,1912,"Drama, History",USA,446,5.3
tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Italy,2237,6.9
...,...,...,...,...,...,...
tt9908390,Le lion,2020,Comedy,"France, Belgium",398,5.5
tt9911196,De Beentjes van Sint-Hildegard,2020,"Comedy, Drama",Netherlands,724,7.9
tt9911774,Padmavyuhathile Abhimanyu,2019,Drama,India,265,7.8
tt9914286,Sokagin Çocuklari,2019,"Drama, Family",Turkey,194,9.4
