# LANGUAGES.CSV CLEANING

In [1]:
from pathlib import Path
from traceback import print_tb

import pandas as pd


In [2]:
data_directory = 'data_csv/'

# Load the raw languages table and, in the same step, swap every
# non-breaking space ('\xa0') found in the text cells for a regular space.
languages_df = (
    pd.read_csv(f'{data_directory}languages.csv')
    .replace('\xa0', ' ', regex=True)
)
languages_df

Unnamed: 0,id,type,language
0,1000001,Language,English
1,1000002,Primary language,Korean
2,1000002,Spoken language,English
3,1000002,Spoken language,German
4,1000002,Spoken language,Korean
...,...,...,...
1038757,1941593,Language,Chinese
1038758,1941594,Language,English
1038759,1941595,Language,English
1038760,1941596,Language,Chinese


In [3]:
# Show the dtype pandas assigned to each column of the loaded frame.
languages_df.dtypes

id           int64
type        object
language    object
dtype: object

## CHANGING TYPES OF OBJECT COLUMNS

In [4]:
# Move both text columns from the generic `object` dtype to pandas'
# dedicated string dtype in a single call, then display the result.
languages_df = languages_df.astype({'type': 'string', 'language': 'string'})
languages_df.dtypes

id                   int64
type        string[python]
language    string[python]
dtype: object

## NOW WE TRY TO UNDERSTAND THE COLUMN VALUES

In [5]:
# Distinct values of the `language` column, sorted in ascending order.
list_of_languages = sorted(languages_df['language'].unique())
print(list_of_languages)

['Abkhaz', 'Afar', 'Afrikaans', 'Akan', 'Albanian', 'Amharic', 'Arabic', 'Aragonese', 'Armenian', 'Assamese', 'Avaric', 'Avestan', 'Aymara', 'Azerbaijani', 'Bambara', 'Bashkir', 'Basque', 'Belarusian', 'Bengali, Bangla', 'Bislama', 'Bosnian', 'Breton', 'Bulgarian', 'Burmese', 'Cantonese', 'Catalan', 'Chamorro', 'Chechen', 'Chichewa, Chewa, Nyanja', 'Chinese', 'Chuvash', 'Cornish', 'Corsican', 'Cree', 'Croatian', 'Czech', 'Danish', 'Divehi, Dhivehi, Maldivian', 'Dutch', 'Dzongkha', 'Eastern Punjabi, Eastern Panjabi', 'English', 'Esperanto', 'Estonian', 'Ewe', 'Faroese', 'Fijian', 'Finnish', 'French', 'Fula, Fulah, Pulaar, Pular', 'Galician', 'Ganda', 'Georgian', 'German', 'Gibberish', 'Greek (modern)', 'Guaraní', 'Gujarati', 'Haitian, Haitian Creole', 'Hausa', 'Hebrew (modern)', 'Herero', 'Hindi', 'Hiri Motu', 'Hungarian', 'Icelandic', 'Ido', 'Igbo', 'Indonesian', 'Interlingua', 'Interlingue', 'Inuktitut', 'Inupiaq', 'Irish', 'Italian', 'Japanese', 'Javanese', 'Kalaallisut, Greenlandic'

How many languages are there in the dataset?

In [6]:
# Count the distinct values of `language`; dropna=False keeps a missing
# value in the tally, matching what len(unique()) would report.
print("There are {} languages".format(languages_df['language'].nunique(dropna=False)))

There are 192 languages


In [7]:
# Distinct values of the `type` column, in order of first appearance.
list_of_types = languages_df['type'].drop_duplicates().tolist()
print(list_of_types)

['Language', 'Primary language', 'Spoken language']


In [8]:
# Print the current column names as a plain Python list.
print(languages_df.columns.tolist())

['id', 'type', 'language']


In [9]:
# Rename `id` to `movie_id` (reassigning instead of mutating in place),
# then print the resulting column names.
languages_df = languages_df.rename(columns={'id': 'movie_id'})
print(languages_df.columns.tolist())

['movie_id', 'type', 'language']


## ARE THERE NA VALUES IN THE DATASET?

In [10]:
# Count missing entries for every column in one pass, then pick out the
# per-column totals that are reported below.
na_counts = languages_df.isna().sum()
not_available_movies = na_counts['movie_id']
not_available_types = na_counts['type']
not_available_languages = na_counts['language']

print(
    'Not available ids: ', not_available_movies,
    '\nNot available types: ', not_available_types,
    '\nNot available languages: ', not_available_languages,
)

Not available ids:  0 
Not available types:  0 
Not available languages:  0


In [11]:
# Write the cleaned frame to disk without the positional index.
# `to_csv` does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on this cell.
cleaned_csv_path = Path('data_cleaned/languages_cleaned.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
languages_df.to_csv(cleaned_csv_path, index=False)

In [12]:
# Rebind the name to None so the notebook no longer references the frame
# (presumably so its memory can be reclaimed now that the CSV is written).
languages_df = None