In [None]:
import os
import pandas as pd
import numpy as np
from ast import literal_eval

## Basic functions to manipulate a dataframe

In [None]:
def path_to_files(path=None):
    """
    Print and return the names of the entries in a directory.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Directory to list. When omitted the user is prompted for it,
        which is the original interactive behaviour.

    Returns
    -------
    list of str
        Names of the files and sub-directories found, sorted so the
        listing is identical on every run.
    """
    if path is None:
        path = input('Enter a path to the files: ')

    # The context manager closes the scandir iterator (and the OS handle
    # behind it) even if the listing is interrupted.
    with os.scandir(path) as entries:
        # Same filter as before: entries that are neither a file nor a
        # directory are skipped.
        names = sorted(
            entry.name for entry in entries
            if entry.is_dir() or entry.is_file()
        )

    for name in names:
        print(name)

    return names

In [None]:
def open_dataframe(file_path, file_name):
    """
    Load a CSV file into a dataframe.

    The location is built as '<file_path>/<file_name>'. After a successful
    read, pandas is told to display every column (a global display option,
    so it affects all dataframes shown afterwards).

    Returns the loaded dataframe.
    """
    csv_location = '/'.join((file_path, file_name))
    loaded = pd.read_csv(csv_location)

    # None means "no limit" for the number of displayed columns.
    pd.set_option('display.max_columns', None)
    return loaded

In [None]:
def inspect_dataframe(dataframe):
    '''
    Print a summary of the dataframe followed by a preview of its first rows.

    DataFrame.info() writes its report to stdout itself and returns None,
    so it is called on its own; wrapping it in print() would add a stray
    "None" line to the output.
    '''
    dataframe.info()
    print()  # blank line between the summary and the preview
    print(dataframe.head())

In [None]:
def list_like_columns(dataframe):
    '''
    Return the names of the columns that look like they hold lists.

    A column is reported when at least one of its cells, converted to
    text, is exactly '[]'. A list column that never contains an empty
    list is therefore not detected.

    Returns a list of column names in dataframe order.
    '''
    # Convert to text once; doing it inside the loop would re-convert the
    # whole dataframe for every single column.
    as_text = dataframe.astype(str)

    return [
        column for column in dataframe
        if (as_text[column] == '[]').any()
    ]

In [None]:
def to_proper_list(dataframe, list_like_columns):
    '''
    Parse stringified lists in the given columns into real Python lists.

    The dataframe is modified in place and also returned.

    Only string cells are parsed. Cells that are not strings (missing
    values, or lists left by an earlier call) are kept as they are, so
    running the same notebook cell twice no longer fails.

    Raises ValueError or SyntaxError (from ast.literal_eval) when a string
    cell is not a valid Python literal.
    '''
    def _parse(cell):
        # literal_eval only accepts source text; anything else is passed through.
        return literal_eval(cell) if isinstance(cell, str) else cell

    for column in list_like_columns:
        dataframe[column] = dataframe[column].apply(_parse)

    return dataframe

In [None]:
def copy_and_explode(dataframe, *columns, name=None, output_dir='../data/output'):
    '''
    Explode list-valued columns into one row per element and save the result.

    Works on a copy; the dataframe passed in is not modified. For every
    requested column the values are exploded, commas are stripped from the
    resulting values, and the column is exploded once more. Duplicate rows
    are then dropped and the result is written to
    '<output_dir>/<name>_2NF.csv' without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source data.
    *columns : str
        Names of the columns to explode.
    name : str, optional
        Base name of the output file. When omitted the user is prompted
        for it, which is the original interactive behaviour.
    output_dir : str, optional
        Folder the CSV file is written to.

    Returns
    -------
    pandas.DataFrame
        The exploded, de-duplicated dataframe that was saved.
    '''
    exploded = dataframe.copy()

    for col in columns:
        exploded = exploded.explode(col)
        # Assign the result back. replace(..., inplace=True) on a selected
        # column acts on a temporary Series and is not guaranteed to reach
        # the dataframe (it never does with Copy-on-Write enabled).
        exploded[col] = exploded[col].replace(',', '', regex=True)
        # Second pass flattens values nested one level deeper; it leaves
        # already-scalar values untouched.
        exploded = exploded.explode(col)

    exploded = exploded.drop_duplicates()

    if name is None:
        name = input('Enter a name for a normalized dataset: ')
    exploded.to_csv(f'{output_dir}/{name}_2NF.csv', sep=',', encoding='utf-8', index=False, header=True)

    return exploded

In [None]:
def save_to_csv(dataframe, path_to_save=None, name_of_file=None):
    '''
    Save a dataframe as '<path_to_save>/<name_of_file>.csv'.

    The file is written comma-separated, UTF-8 encoded, with a header row
    and without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to write.
    path_to_save : str, optional
        Destination folder. Prompted for when omitted.
    name_of_file : str, optional
        File name without the '.csv' extension. Prompted for when omitted.
    '''
    if path_to_save is None:
        path_to_save = input('Path to save location: ')
    if name_of_file is None:
        name_of_file = input('File name: ')

    # index/header take booleans; the former None / 'true' values only
    # worked because of their truthiness.
    dataframe.to_csv(f'{path_to_save}/{name_of_file}.csv', sep=',', encoding='utf-8', index=False, header=True)

## Finding the longest value in each column of a given dataframe

In [None]:
def longest_value(dataframe):
    '''
    Print, for every column, the length of its longest value and the
    index label of the row that holds it.

    Every value is measured by the length of its string form, whatever
    the column dtype. A column without any measurable value (for example
    an empty one) is reported with length 0 and row None.
    '''
    for col_name in dataframe:
        # Compute the per-row text lengths once and reuse them.
        lengths = dataframe[col_name].astype(str).str.len()

        if lengths.notna().any():
            list_len = lengths.max()
            # A Series has a single axis, so idxmax takes no axis=1.
            row_pos = lengths.idxmax()
        else:
            list_len = 0
            row_pos = None

        print('Column:', col_name)
        print('Value length:', list_len)
        print('Row possition:', row_pos, '\n')

## Files in ../data/input location

In [None]:
# List the contents of the input folder; path_to_files() asks for the directory to scan.
input_files = path_to_files()

## DF1: Best Movie by Year Netflix.csv

In [None]:
# Load the first dataset from the input folder.
df = open_dataframe('../data/input', 'Best Movie by Year Netflix.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df_info = inspect_dataframe(df)

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.

df.duplicated().value_counts()

In [None]:
# Drop duplicate rows, keeping the first occurrence of each.

df = df.drop_duplicates()

In [None]:
# Show every value of the MAIN_PRODUCTION column.
df['MAIN_PRODUCTION'].values

In [None]:
# Report, for every column, the length of its longest value (measured as text)
# and the row that holds it.

df_val_len = longest_value(df)

In [None]:
# Save the cleaned dataframe. save_to_csv() asks for the destination folder
# (intended: '../data/output') and the file name.

df_csv = save_to_csv(df)

## DF2: Best Movies Netflix.csv

In [None]:
# Load the second dataset from the input folder.
df2 = open_dataframe('../data/input', 'Best Movies Netflix.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df2_info = inspect_dataframe(df2)

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.
df2.duplicated().value_counts()

In [None]:
# Show the distinct values of the MAIN_PRODUCTION column.
df2['MAIN_PRODUCTION'].unique()

In [None]:
# Report the longest value (measured as text) and its row for every column.
df2_val_len = longest_value(df2)

In [None]:
# Save the dataframe; save_to_csv() asks for the destination folder and file name.
df2_csv = save_to_csv(df2)

## DF3: Best Show by Year Netflix.csv

In [None]:
# Load the third dataset from the input folder.
df3 = open_dataframe('../data/input', 'Best Show by Year Netflix.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df3_info = inspect_dataframe(df3)

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.
df3.duplicated().value_counts()

In [None]:
# Show every value of the MAIN_PRODUCTION column.
df3['MAIN_PRODUCTION'].values

In [None]:
# Report the longest value (measured as text) and its row for every column.
df3_val_len = longest_value(df3)

In [None]:
# Save the dataframe; save_to_csv() asks for the destination folder and file name.
df3_csv = save_to_csv(df3)

## DF4: Best Shows Netflix.csv

In [None]:
# Load the fourth dataset from the input folder.
df4 = open_dataframe('../data/input', 'Best Shows Netflix.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df4_info = inspect_dataframe(df4)

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.
df4.duplicated().value_counts()

In [None]:
# Show every value of the MAIN_PRODUCTION column.
df4['MAIN_PRODUCTION'].values

In [None]:
# Report the longest value (measured as text) and its row for every column.
df4_val_len = longest_value(df4)

In [None]:
# Save the dataframe; save_to_csv() asks for the destination folder and file name.
df4_csv = save_to_csv(df4)

## DF5: raw_credits.csv

In [None]:
# Load the raw credits dataset from the input folder.
df5 = open_dataframe('../data/input', 'raw_credits.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df5_info = inspect_dataframe(df5)

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.
df5.duplicated().value_counts()

In [None]:
# Frequency of each 'character' value before splitting.
df5['character'].value_counts()

In [None]:
# Split each 'character' value on "/" and give every piece its own row;
# the other columns (and the row's index label) are repeated for each piece.
df5['character'] = df5['character'].str.split("/")
df5 = df5.explode('character')

In [None]:
# Frequency of each 'character' value after splitting.
df5['character'].value_counts()

In [None]:
# Renumber the rows and keep the previous index labels as a column.
# NOTE(review): 'level_0' is dropped further down, which suggests the data
# already has an 'index' column and the old labels land in 'level_0' — confirm.
df5 = df5.reset_index()
df5.head()

In [None]:
# Check how often each 'index' value occurs; counts above 1 are repeated values.
df5['index'].value_counts()

In [None]:
# Replace the existing 'index' column with the current row labels
# and keep it as the first column of the dataframe.

df5.drop(columns='index', inplace=True)
df5.insert(0, 'index', df5.index)

In [None]:
# Preview the dataframe with the rebuilt 'index' column.
df5.head()

In [None]:
# Check that every 'index' value now occurs only once.
df5['index'].value_counts()

In [None]:
# Remove the 'level_0' column (presumably created by the earlier reset_index() — confirm).
# Modifies df5 in place, so running this cell a second time raises KeyError.
df5.drop('level_0', axis=1, inplace=True)

In [None]:
# Summarise the columns, dtypes and non-null counts after the clean-up.
df5.info()

In [None]:
# Report the longest value (measured as text) and its row for every column.
df5_val_len = longest_value(df5)

In [None]:
# Save the dataframe; save_to_csv() asks for the destination folder and file name.

df5_csv = save_to_csv(df5)

## DF6: raw_titles.csv

In [None]:
# Load the raw titles dataset from the input folder.
df6 = open_dataframe('../data/input', 'raw_titles.csv')

In [None]:
# Print the dataframe summary (info) and a preview of the first rows.
df6_info = inspect_dataframe(df6)

In [None]:
# List the column names.
df6.columns

In [None]:
# Count duplicate rows: True marks a row that repeats an earlier one.
df6.duplicated().value_counts()

In [None]:
# Show the distinct values of the 'production_countries' column.
df6['production_countries'].unique()

In [None]:
# Collect the columns that hold stringified lists
# (list_like_columns() detects them by the presence of a '[]' cell).

df6_cols = list_like_columns(df6)
print(df6_cols)

In [None]:
# Parse the stringified lists into real Python lists.
# to_proper_list() modifies df6 in place and returns it, so df6_list and df6
# refer to the same dataframe.

df6_list = to_proper_list(df6, df6_cols)

In [None]:
# Preview the dataframe after the list columns were parsed.
df6_list.head()

In [None]:
# Explode 'genres' and 'production_countries' into one row per value.
# copy_and_explode() asks for a name and writes '../data/output/<name>_2NF.csv'.

df6_2nf = copy_and_explode(df6_list, 'genres', 'production_countries')

In [None]:
# Reload the file written by copy_and_explode(). That function always appends
# the upper-case suffix '_2NF.csv', so the name has to match its case exactly
# on case-sensitive file systems. Assumes 'raw_titles' was entered at the prompt.
df6_2 = open_dataframe('../data/output', 'raw_titles_2NF.csv')

In [None]:
# Preview the exploded dataset.
df6_2.head()

In [None]:
# Frequency of each genre value.
df6_2['genres'].value_counts()

In [None]:
# Show the rows whose genre is 'documentation'.
df6_2[df6_2.loc[:, 'genres'] == 'documentation']

In [None]:
# Replace the genre label 'documentation' with 'documentary'.
# The replacement is limited to the 'genres' column: a frame-wide replace()
# would also rewrite any other cell (a title, for instance) equal to 'documentation'.

df6_2 = df6_2.assign(genres=df6_2['genres'].replace({'documentation': 'documentary'}))
df6_2[df6_2.loc[:, 'genres'] == 'documentary']

In [None]:
# Show the rows whose production country is spelled out as 'Lebanon'.
df6_2[df6_2.loc[:, 'production_countries'] == 'Lebanon']

In [None]:
# Replace the country name 'Lebanon' with the code 'LB'.
# The replacement is limited to 'production_countries' so that cells in other
# columns equal to 'Lebanon' (a title, for instance) are left untouched.

df6_2 = df6_2.assign(production_countries=df6_2['production_countries'].replace({'Lebanon': 'LB'}))
df6_2[df6_2.loc[:, 'production_countries'] == 'LB']

In [None]:
# Replace the existing 'index' column with the current row labels
# and keep it as the first column of the dataframe.

df6_2.drop(columns='index', inplace=True)
df6_2.insert(0, 'index', df6_2.index)

In [None]:
# Preview the dataframe with the rebuilt 'index' column.
df6_2.head()

In [None]:
# Check that every 'index' value occurs only once.
df6_2['index'].value_counts()

In [None]:
# Report the longest value (measured as text) and its row for every column.
df6_val_len = longest_value(df6_2)

In [None]:
# Save the dataframe; save_to_csv() asks for the destination folder and file name.

df6_csv = save_to_csv(df6_2)

## Files in ../data/output location

In [None]:
# List the contents of the output folder; path_to_files() asks for the directory to scan.
output_files = path_to_files()