In [2]:
import os
import json
import pandas as pd
import numpy as np
from ast import literal_eval

from dateutil.tz import tz

## Basic functions to manipulate a dataframe

In [30]:
def get_files_in_directory():
    """
    Sets a path to JSON file.
    :returns a file name in a set path
    """
    path_to_files = '../data/input'
    files_in_path = os.scandir(path_to_files)
    
    list_of_files = []
    for file in files_in_path:
        if file.is_dir() or file.is_file():
            list_of_files.append(file.name)
            return list_of_files    # <- don't forget to align this statement back with for!!!!!

In [92]:
def create_dataframe(file_json):
    """
    Creates a pandas dataframe from JSON file.
    Sets the maximum available columns to be shown.
    Requires name of the file.
    """
    path_to_files = '../data/input/'
    with open(path_to_files + file_json) as jfile:
        json_data = json.load(jfile)
        df = pd.DataFrame(pd.json_normalize(json_data))
        pd.set_option('display.max_columns', None)
    return df

In [40]:
# FOR LATER!!!

def change_column_names(dataframe):
    """
    Changes the column names of the dataframe to their new column names.
    :param dataframe: a pandas dataframe to change column names
    :return: dataframe with new column names
    """
    new_names = {"dt": "date", 
             "id": "country_id", 
             "cord.lon": "longitude",
             "cord.lat": "latitude",
             "main.temp": "main_temp",
             "main.feels_like": "main_feels_like",
             "main.temp_min": "main_temp_min",
             "main.temp_max": "main_temp_max",
             "main.pressure": "pressure",
             "main.humidity": "humidity",
             "wind.speed": "wind_speed",
             "wind.deg": "wind_deg",
             "clouds.all": "clouds",
             "sys.type": "sys_type",
             "sys.id": "sys_id",
             "sys.country": "country",
             "sys.sunrise": "sunrise",
             "sys.sunset": "sunset"}
    dataframe.replace(new_names, inplace=True)
    return dataframe

In [34]:
# to flatten 'weather' column
# pd.json_normalize(data,record_path=['weather'])


<class 'pandas.core.series.Series'>


In [7]:
def inspect_dataframe(dataframe):
    """
    Returns information about the dataset 
    and shows a preview of the data in it.
    """
    print(dataframe.info(), '\n')
    print(dataframe.head())

In [18]:
def list_like_columns(dataframe):
    """
    Creating a list where columns have list-like values.
    """
    list_like_cols = []

    for row in dataframe:
        col_name = row
        list_like = dataframe[dataframe.astype(str)[row] == '[]'].index
        if len(list_like) != 0:
            list_like_cols.append(col_name)
    
    return list_like_cols

In [12]:
def to_proper_list(dataframe, list_like_columns):
    """
    Converting values of list-like columns to proper list type.
    """
    for row in list_like_columns:
        dataframe[row] = dataframe[row].apply(literal_eval)
    
    return dataframe

In [9]:
def copy_and_explode(dataframe, *columns):
    """
    Normalizing to the 2NF.
    Creating new dataframes containing str-list columns.
    Exploding list values of columns.
    Recreating the index column if the original dataset had one
    to keep unique index in the dataframe.
    Saving new dataframe to a CSV file.
    """
    df_copy = dataframe.copy()
    col_list = [*columns]
    

    for col in col_list:
        if isinstance(col_list, (list, pd.core.series.Series, np.ndarray)):
            df_copy = df_copy.explode(col)
            df_copy[col].replace(',', '', regex=True, inplace=True)
            df_copy = df_copy.explode(col)
        else:
            df_copy
    
    df_copy = df_copy.drop_duplicates()
       
    new_df_name = input('Enter a name for a normalized dataset: ')
    df_copy.to_csv(f'../data/output/{new_df_name}_2NF.csv', sep=',', encoding='utf-8', index=None, header='true')

## Finding the longest value in each column of a given dataframe

In [101]:
def longest_value(dataframe):
    """
    Checking for the longest string in each column of a given CSV file.
    """
    for row in dataframe:
        col_name = row
        row_pos = dataframe[row].astype(str).str.len().idxmax(axis=1)

        if dataframe[row].dtype == float or int:
            list_len = dataframe[row].astype(str).str.len().max()

        else:
            list_len = max(list(map(len, dataframe[row].values)))

        print('Column:', col_name)
        print('Value length:', list_len,)
        print('Row position:', row_pos, '\n')

## Files in ../data/input location

In [95]:
cities = get_files_in_directory()
print(cities)

['Istanbul, Turkey_response.json']


In [96]:
for city in cities:
    city_df = create_dataframe(city)
city_df.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,main.temp,main.feels_like,main.temp_min,main.temp_max,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1707141172,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,1707109898,1707146679


In [97]:
# Changing datetime format
city_df['dt'] = pd.to_datetime(city_df['dt'], unit='s', errors='coerce')
city_df['sys.sunrise'] = pd.to_datetime(city_df['sys.sunrise'], unit='s', errors='coerce')
city_df['sys.sunset'] = pd.to_datetime(city_df['sys.sunset'], unit='s', errors='coerce')
city_df.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,main.temp,main.feels_like,main.temp_min,main.temp_max,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,2024-02-05 13:52:52,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,2024-02-05 05:11:38,2024-02-05 15:24:39


In [102]:
city_df_val_len = longest_value(city_df)

ValueError: No axis named 1 for object type Series

In [98]:
city_df2 = change_column_names(city_df)
city_df2.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,main.temp,main.feels_like,main.temp_min,main.temp_max,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,2024-02-05 13:52:52,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,2024-02-05 05:11:38,2024-02-05 15:24:39


In [99]:
city_inspect = inspect_dataframe(city_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   weather          1 non-null      object        
 1   base             1 non-null      object        
 2   visibility       1 non-null      int64         
 3   dt               1 non-null      datetime64[ns]
 4   timezone         1 non-null      int64         
 5   id               1 non-null      int64         
 6   name             1 non-null      object        
 7   cod              1 non-null      int64         
 8   coord.lon        1 non-null      float64       
 9   coord.lat        1 non-null      float64       
 10  main.temp        1 non-null      float64       
 11  main.feels_like  1 non-null      float64       
 12  main.temp_min    1 non-null      float64       
 13  main.temp_max    1 non-null      float64       
 14  main.pressure    1 non-null      int64        