In [3]:
import os
import json
import pandas as pd
import numpy as np
from ast import literal_eval

from dateutil.tz import tz
from sqlalchemy import engine

## Main functions to manipulate a dataframe

### Helper functions for testing the data manipulation steps

In [4]:
def get_files_in_directory(path_to_files: str = '../data/input') -> list:
    """
    Lists the names of all entries found in the input directory.

    :param path_to_files: directory to scan; defaults to '../data/input'.
    :return: sorted list of entry names (regular files and sub-directories);
             an empty list when the directory contains no entries.
    """
    # The context manager closes the scandir iterator (and its OS handle)
    # as soon as the listing has been collected.
    with os.scandir(path_to_files) as entries:
        list_of_files = [
            entry.name
            for entry in entries
            if entry.is_dir() or entry.is_file()
        ]

    # Return only after every entry has been visited. Sorting makes the result
    # reproducible, because os.scandir yields entries in arbitrary order.
    return sorted(list_of_files)

In [29]:
def create_dataframe(file_json: str, path_to_files: str = '../data/input/') -> pd.DataFrame:
    """
    Creates a pandas dataframe from a JSON file.

    Nested JSON objects are flattened into dot-separated column names by
    ``pd.json_normalize`` (e.g. ``{"coord": {"lon": 1}}`` -> ``coord.lon``).

    Side effect: sets the pandas option 'display.max_columns' to None so that
    every column is shown when a dataframe is displayed.

    :param file_json: name of the JSON file inside ``path_to_files``.
    :param path_to_files: directory holding the file; defaults to '../data/input/'.
    :return: dataframe built from the file's JSON content.
    """
    # Read explicitly as UTF-8 so non-ASCII text (e.g. city names) does not
    # depend on the platform's default encoding.
    with open(os.path.join(path_to_files, file_json), encoding='utf-8') as jfile:
        json_data = json.load(jfile)

    pd.set_option('display.max_columns', None)
    # json_normalize already returns a DataFrame; no extra wrapping is needed.
    return pd.json_normalize(json_data)

In [ ]:
# FLATTEN JSON FILE!!!
def flatten_json_file(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for flattening nested JSON data held in a dataframe.

    NOTE: not implemented yet. The body currently returns the supplied
    dataframe as-is (the same object, not a copy); no flattening happens.

    :param dataframe: dataframe intended to be flattened
    :return: the input dataframe, unmodified
    """
    
    return dataframe

In [27]:
def change_column_names(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Renames the known source columns to their new names.

    The rename is applied in place, so the dataframe passed in is modified and
    the very same object is returned. Columns that are not listed below keep
    their current name.

    :param dataframe: a pandas dataframe whose columns should be renamed
    :return: the same dataframe, now carrying the new column names
    """
    # (current name, new name) pairs
    rename_pairs = (
        ("dt", "date"),
        ("id", "country_id"),
        ("coord.lon", "longitude"),
        ("coord.lat", "latitude"),
        ("main.temp", "main_temp"),
        ("main.feels_like", "main_feels_like"),
        ("main.temp_min", "main_temp_min"),
        ("main.temp_max", "main_temp_max"),
        ("main.pressure", "pressure"),
        ("main.humidity", "humidity"),
        ("wind.speed", "wind_speed"),
        ("wind.deg", "wind_deg"),
        ("clouds.all", "clouds"),
        ("sys.type", "sys_type"),
        ("sys.id", "sys_id"),
        ("sys.country", "country"),
        ("sys.sunrise", "sunrise"),
        ("sys.sunset", "sunset"),
    )
    dataframe.rename(columns=dict(rename_pairs), inplace=True)
    return dataframe

In [34]:
# to flatten 'weather' column
# pd.json_normalize(data,record_path=['weather'])


<class 'pandas.core.series.Series'>


In [35]:
def datetime_format_change(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder for formatting the dataframe's datetime values as ISO 8601.

    NOTE: not implemented yet. The body currently returns the supplied
    dataframe as-is (the same object, not a copy); no values are reformatted.

    :param dataframe: dataframe whose datetime values are meant to be formatted.
    :return: the input dataframe, unmodified.
    """
    
    return dataframe

In [6]:
def inspect_dataframe(dataframe: pd.DataFrame):
    """
    Prints a summary of the dataframe followed by a preview of its first rows.

    :param dataframe: dataframe to inspect.
    :return: None; the report is written to standard output.
    """
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so passing its result to print() would emit a stray "None".
    dataframe.info()
    print()  # blank line between the summary and the preview
    print(dataframe.head())

In [ ]:
def load_to_database(dataframe: pd.DataFrame, table_name: str, engine) -> None:
    """
    Appends the rows of a dataframe to a database table via ``DataFrame.to_sql``.

    :param dataframe: dataframe whose rows are written.
    :param table_name: name of the target table; rows are appended if it already exists.
    :param engine: database connection/engine object handed to ``to_sql`` as ``con``.
                   Inside this function the name shadows the module-level
                   ``engine`` imported from sqlalchemy.
    :return: None
    """
    dataframe.to_sql(name=table_name, con=engine, if_exists='append')

In [ ]:
def change_dataframe_col_pos(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Placeholder; presumably meant to reorder the dataframe's columns
    (inferred from the function name only -- TODO confirm intent).

    NOTE: not implemented yet. The body returns the supplied dataframe as-is.

    :param dataframe: dataframe to process.
    :return: the input dataframe, unmodified.
    """
    
    return dataframe

In [18]:
def list_like_columns(dataframe: pd.DataFrame) -> list:
    """
    Finds the columns that hold list-like values.

    A column is reported when at least one of its values has the string form
    '[]', i.e. an empty list or the literal text '[]'.

    :param dataframe: dataframe to scan.
    :return: list of matching column names, in column order.
    """
    list_like_cols = []

    for col_name in dataframe.columns:
        # Stringify one column at a time; converting the whole frame on every
        # iteration repeats the same work for each column.
        has_empty_list = dataframe[col_name].astype(str).eq('[]').any()
        if has_empty_list:
            list_like_cols.append(col_name)

    return list_like_cols

In [12]:
def to_proper_list(dataframe: pd.DataFrame, list_like_columns) -> pd.DataFrame:
    """
    Converts the string values of list-like columns into real Python objects.

    The given columns are overwritten on the dataframe passed in, and that same
    dataframe is returned.

    :param dataframe: dataframe holding the columns to convert.
    :param list_like_columns: iterable of column names whose values are
                              string representations such as "['a', 'b']".
    :return: the same dataframe with the listed columns converted.
    :raises ValueError, SyntaxError: if a string value is not a valid Python literal.
    """
    def _parse(value):
        # Only strings need parsing. Values that are already lists (e.g. when
        # the cell is re-run) or missing are passed through instead of making
        # literal_eval raise.
        return literal_eval(value) if isinstance(value, str) else value

    for col_name in list_like_columns:
        dataframe[col_name] = dataframe[col_name].apply(_parse)

    return dataframe

In [9]:
def copy_and_explode(dataframe, *columns, new_df_name=None, output_dir='../data/output'):
    """
    Normalizes list-valued columns to 2NF and saves the result as a CSV file.

    Works on a copy of the dataframe; the input is left untouched. For each
    given column the list values are exploded into one row per element, commas
    are stripped from string elements, and the index is rebuilt so that it
    stays unique. Duplicate rows are dropped before writing
    '<output_dir>/<new_df_name>_2NF.csv' (no index column, with header).

    :param dataframe: source dataframe.
    :param columns: names of the list-valued columns to explode.
    :param new_df_name: base name of the output file; when None the name is
                        asked for interactively via input().
    :param output_dir: directory for the CSV file; defaults to '../data/output'.
    :return: None
    """
    df_copy = dataframe.copy()

    for col in columns:
        # One row per list element; rebuild the index so labels stay unique.
        df_copy = df_copy.explode(col).reset_index(drop=True)
        # Assign the result back: replace(inplace=True) on the df_copy[col]
        # selection is chained assignment and is not guaranteed to update
        # df_copy itself.
        df_copy[col] = df_copy[col].replace(',', '', regex=True)
        # Second pass flattens values that were nested one level deeper.
        df_copy = df_copy.explode(col).reset_index(drop=True)

    df_copy = df_copy.drop_duplicates()

    if new_df_name is None:
        new_df_name = input('Enter a name for a normalized dataset: ')
    df_copy.to_csv(
        os.path.join(output_dir, f'{new_df_name}_2NF.csv'),
        sep=',',
        encoding='utf-8',
        index=False,
        header=True,
    )

## Finding the longest value in each column of a given dataframe

In [31]:
def longest_value(dataframe: pd.DataFrame):
    """
    Prints, for every column, the length of its longest value.

    The length is measured on the string form of each value, so it is defined
    for every dtype (numbers, datetimes, text).

    :param dataframe: dataframe to measure.
    :return: None; the report is written to standard output.
    """
    for col_name in dataframe:
        # The former `dtype == float or int` test was always truthy (`int` is
        # a truthy object), so this string-based measurement is the only path
        # that ever ran; it is kept as the single path.
        list_len = dataframe[col_name].astype(str).str.len().max()

        print('Column:', col_name)
        print('Value length:', list_len, '\n')

## Files in the `../data/input` directory

In [33]:
# Collect the entry names returned by get_files_in_directory() and show them.
cities = get_files_in_directory()
print(cities)

['Istanbul, Turkey_response.json']


In [34]:
# Load every listed file and stack the results into a single dataframe.
# Re-assigning city_df inside the loop would keep only the last file.
city_frames = []
for city in cities:
    city_frames.append(create_dataframe(city))
city_df = pd.concat(city_frames, ignore_index=True)
city_df.head()

Unnamed: 0,weather,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,main.temp,main.feels_like,main.temp_min,main.temp_max,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset
0,"[{'id': 800, 'main': 'Clear', 'description': '...",stations,10000,1707141172,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,1707109898,1707146679


In [35]:
# FLATTENING THE JSON FILE
# Each 'weather' cell is a list of dicts: spread the list positions into
# columns, then expand the first entry's keys into their own columns.
city_df_weather_flat = city_df['weather'].apply(pd.Series)
# Prefix the keys instead of assigning names by position, so the result does
# not depend on the order or number of keys in the payload.
city_df_weather_flat_0 = city_df_weather_flat[0].apply(pd.Series).add_prefix('weather_')

# Replace the nested column with its flattened counterpart.
city_df_new = pd.concat(
    [city_df.drop(columns=['weather']), city_df_weather_flat_0],
    axis=1,
)

city_df_new.head()

Unnamed: 0,base,visibility,dt,timezone,id,name,cod,coord.lon,coord.lat,main.temp,main.feels_like,main.temp_min,main.temp_max,main.pressure,main.humidity,wind.speed,wind.deg,clouds.all,sys.type,sys.id,sys.country,sys.sunrise,sys.sunset,weather_id,weather_main,weather_description,weather_icon
0,stations,10000,1707141172,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,1707109898,1707146679,800,Clear,clear sky,01d


In [36]:
# change_column_names renames in place and returns the same object, so
# city_df2 and city_df_new refer to one and the same dataframe from here on.
city_df2 = change_column_names(city_df_new)
city_df2.head()

Unnamed: 0,base,visibility,date,timezone,country_id,name,cod,longitude,latitude,main_temp,main_feels_like,main_temp_min,main_temp_max,pressure,humidity,wind_speed,wind_deg,clouds,sys_type,sys_id,country,sunrise,sunset,weather_id,weather_main,weather_description,weather_icon
0,stations,10000,1707141172,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,1707109898,1707146679,800,Clear,clear sky,01d


In [37]:
# Convert the epoch-second columns to pandas datetimes; values that cannot be
# parsed become NaT (errors='coerce').
for epoch_col in ('date', 'sunrise', 'sunset'):
    city_df_new[epoch_col] = pd.to_datetime(city_df_new[epoch_col], unit='s', errors='coerce')
city_df_new.head()

Unnamed: 0,base,visibility,date,timezone,country_id,name,cod,longitude,latitude,main_temp,main_feels_like,main_temp_min,main_temp_max,pressure,humidity,wind_speed,wind_deg,clouds,sys_type,sys_id,country,sunrise,sunset,weather_id,weather_main,weather_description,weather_icon
0,stations,10000,2024-02-05 13:52:52,10800,745044,Istanbul,200,28.9497,41.0138,13.04,12.38,13.04,14.45,1015,76,4.12,240,0,1,6970,TR,2024-02-05 05:11:38,2024-02-05 15:24:39,800,Clear,clear sky,01d


In [38]:
# longest_value prints its report and has no return statement, so
# city_df_val_len is None.
city_df_val_len = longest_value(city_df_new)

Column: base
Value length: 8 

Column: visibility
Value length: 5 

Column: date
Value length: 19 

Column: timezone
Value length: 5 

Column: country_id
Value length: 6 

Column: name
Value length: 8 

Column: cod
Value length: 3 

Column: longitude
Value length: 7 

Column: latitude
Value length: 7 

Column: main_temp
Value length: 5 

Column: main_feels_like
Value length: 5 

Column: main_temp_min
Value length: 5 

Column: main_temp_max
Value length: 5 

Column: pressure
Value length: 4 

Column: humidity
Value length: 2 

Column: wind_speed
Value length: 4 

Column: wind_deg
Value length: 3 

Column: clouds
Value length: 1 

Column: sys_type
Value length: 1 

Column: sys_id
Value length: 4 

Column: country
Value length: 2 

Column: sunrise
Value length: 19 

Column: sunset
Value length: 19 

Column: weather_id
Value length: 3 

Column: weather_main
Value length: 5 

Column: weather_description
Value length: 9 

Column: weather_icon
Value length: 3 


In [39]:
# inspect_dataframe prints its report and has no return statement, so
# city_inspect is None.
city_inspect = inspect_dataframe(city_df_new)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   base                 1 non-null      object        
 1   visibility           1 non-null      int64         
 2   date                 1 non-null      datetime64[ns]
 3   timezone             1 non-null      int64         
 4   country_id           1 non-null      int64         
 5   name                 1 non-null      object        
 6   cod                  1 non-null      int64         
 7   longitude            1 non-null      float64       
 8   latitude             1 non-null      float64       
 9   main_temp            1 non-null      float64       
 10  main_feels_like      1 non-null      float64       
 11  main_temp_min        1 non-null      float64       
 12  main_temp_max        1 non-null      float64       
 13  pressure             1 non-null      in