# GENRES.CSV CLEANING

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Prefix for the raw CSV inputs; the trailing slash is required because the
# file name is appended by plain string concatenation.
data_directory = 'data_csv/'

# Read the raw genres table and leave the frame as the cell's last
# expression so the notebook renders it.
genres_df = pd.read_csv(filepath_or_buffer=data_directory + 'genres.csv')
genres_df

Unnamed: 0,id,genre
0,1000001,Comedy
1,1000001,Adventure
2,1000002,Comedy
3,1000002,Thriller
4,1000002,Drama
...,...,...
1046844,1941563,Drama
1046845,1941566,Crime
1046846,1941569,Crime
1046847,1941596,Action


In [3]:
# Show the dtype pandas assigned to each column when the CSV was read.
genres_df.dtypes

id        int64
genre    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [4]:
# 'genre' is read as a generic object column; convert it to pandas' dedicated
# string extension dtype, then display the dtypes again to confirm the change.
genres_df['genre'] = genres_df['genre'].astype(pd.StringDtype())
genres_df.dtypes

id                int64
genre    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND COLUMN VALUES

In [5]:
# Collect the distinct genre labels, in order of first appearance, into a
# plain Python list and print them.
list_of_genres = [*genres_df['genre'].unique()]
print(list_of_genres)

['Comedy', 'Adventure', 'Thriller', 'Drama', 'Science Fiction', 'Action', 'Music', 'Romance', 'History', 'Crime', 'Animation', 'Mystery', 'Horror', 'Family', 'Fantasy', 'War', 'Western', 'TV Movie', 'Documentary']


In [6]:
# Rename 'id' to 'movie_id'. The frame returned by rename() is bound back to
# the same name instead of mutating with inplace=True, which pandas style
# guidance discourages and which prevents method chaining.
genres_df = genres_df.rename(columns={'id': 'movie_id'})

# Print the resulting column labels to confirm the rename.
print(genres_df.columns.tolist())

['movie_id', 'genre']


## ARE THERE NA VALUES IN THE DATASET?

In [7]:
# Count missing entries in each of the two columns. The generator keeps its
# loop variable out of the notebook namespace.
not_available_movie_ids, not_available_genres = (
    genres_df[column_name].isna().sum()
    for column_name in ('movie_id', 'genre')
)

# The two arguments are joined by print's default single-space separator,
# so the first line of output ends with a space before the newline.
print(
    'Not available movie ids: {}'.format(not_available_movie_ids),
    '\nNot available genres: {}'.format(not_available_genres),
)

Not available movie ids: 0 
Not available genres: 0


## ARE THERE ANY DUPLICATES?

In [8]:
# A row counts as a duplicate when every column matches an earlier row;
# the first occurrence itself is not counted (pandas' default behaviour,
# spelled out here explicitly).
num_duplicated_genres = genres_df.duplicated(subset=None, keep='first').sum()
print('There are a total of {} duplicates '.format(num_duplicated_genres))

There are a total of 0 duplicates 


In [9]:
# Write the cleaned table without the positional index column.
# to_csv() does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails with OSError.
cleaned_genres_path = Path('data_cleaned') / 'genres_cleaned.csv'
cleaned_genres_path.parent.mkdir(parents=True, exist_ok=True)
genres_df.to_csv(cleaned_genres_path, index=False)

In [10]:
# Rebind the name to None now that the cleaned CSV has been written.
# NOTE(review): presumably done so the large frame can be garbage-collected;
# the name stays defined (as None) rather than being deleted.
genres_df = None