# THEMES.CSV CLEANING

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Folder holding the raw CSV exports (trailing slash is part of the value).
data_directory = 'data_csv/'

# Read the raw themes table; fields are quoted with double quotes.
themes_csv_path = f'{data_directory}themes.csv'
themes_df = pd.read_csv(themes_csv_path, quotechar='"')
themes_df

Unnamed: 0,id,theme
0,1000001,Humanity and the world around us
1,1000001,Crude humor and satire
2,1000001,Moving relationship stories
3,1000001,Emotional and captivating fantasy storytelling
4,1000001,Surreal and thought-provoking visions of life ...
...,...,...
125636,1835643,Noir and dark crime dramas
125637,1835643,Intriguing and suspenseful murder mysteries
125638,1849827,Faith and religion
125639,1849827,Faith and spiritual journeys


In [3]:
# (rows, columns) of the frame as loaded from themes.csv.
themes_df.shape

(125641, 2)

In [4]:
# Column dtypes as inferred by read_csv, before any conversion.
themes_df.dtypes

id        int64
theme    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [5]:
# Convert the generic object column to pandas' dedicated string dtype,
# then display the dtypes to confirm the conversion.
themes_df = themes_df.astype({'theme': 'string'})
themes_df.dtypes

id                int64
theme    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN VALUES

In [6]:
# List the column names currently present in the frame.
print(themes_df.columns.tolist())

['id', 'theme']


In [7]:
# Rename 'id' to 'movie_id' so the column name says what the identifier
# refers to. Reassigning the result (instead of inplace=True) avoids mutating
# the frame object that earlier cells already displayed.
themes_df = themes_df.rename(columns={'id': 'movie_id'})
print(list(themes_df.columns.values))

['movie_id', 'theme']


In [8]:
# Print every distinct theme label, in order of first appearance.
distinct_themes = themes_df['theme'].drop_duplicates()
print(distinct_themes.tolist())

['Humanity and the world around us', 'Crude humor and satire', 'Moving relationship stories', 'Emotional and captivating fantasy storytelling', 'Surreal and thought-provoking visions of life and death', 'Quirky and endearing relationships', 'Amusing jokes and witty satire', 'Laugh-out-loud relationship entanglements', 'Intense violence and sexual transgression', 'Twisted dark psychological thriller', 'Heartbreaking and moving family drama', 'Enduring stories of family and marital drama', 'Touching and sentimental family stories', 'Intense political and terrorist thrillers', 'Powerful stories of heartbreak and suffering', 'Dreamlike, quirky, and surreal storytelling', 'Challenging or sexual themes & twists', 'Graphic violence and brutal revenge', 'Song and dance', 'Dazzling vocal performances and musicals', 'Captivating relationships and charming romance', 'Dance rhythms and catchy tunes', 'Emotional life of renowned artists', 'Charming romances and delightful chemistry', 'Politics and 

## ARE THERE NA VALUES IN THE DATASET?

In [9]:
# Count missing values per column; both counts are printed for inspection.
not_available_ids = themes_df['movie_id'].isna().sum()
not_available_themes = themes_df['theme'].isna().sum()
print('not available ids: {}'.format(not_available_ids))
# The label says "themes" because this count comes from the 'theme' column.
print('not available themes: {}'.format(not_available_themes))

not available ids: 0
not available names: 0


## ARE THERE ANY DUPLICATES?

In [10]:
# Flag rows that are exact repeats of an earlier row (all columns compared),
# then report how many were flagged.
duplicate_row_mask = themes_df.duplicated()
num_duplicated_themes = duplicate_row_mask.sum()
print(f'There are a total of {num_duplicated_themes} duplicates.')

There are a total of 0 duplicates.


## SAVING THE CLEANED DATASET

In [11]:
# Write the cleaned frame without the positional index column.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise a fresh checkout fails at this cell.
cleaned_themes_path = Path('data_cleaned/themes_cleaned.csv')
cleaned_themes_path.parent.mkdir(parents=True, exist_ok=True)
themes_df.to_csv(cleaned_themes_path, index=False)

In [12]:
# Rebind the name to None so the notebook no longer references the DataFrame
# (presumably to let its memory be reclaimed once the file is saved).
themes_df = None