In [2]:
import os
import sqlite3
import psycopg2
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import boxcox
import matplotlib.pyplot as plt
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine, inspect, MetaData, Table

EXTRACT & LOAD RAW DATA TO DATABASE

In [3]:
def extract_data(folder_path):
    """
    Extract CSV data from the specified folder.
    
    arg:
    folder_path: Path to the folder containing CSV files.
    return: data_frames: list of pandas dataframes containing data from each of the csv files.
    
    """
    
    data_frames = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            df['file_name'] = file_name # add a column to store the file name           
            data_frames.append(df)
            
    return data_frames

In [4]:
folder_path = r"C:\Users\admin\Desktop\Projects\Project FitBit\Fitabase Dataset"
data_frames = extract_data(folder_path)

In [5]:
USERNAME = 'fitbase'
PASSWORD = 'fitbase'
SERVER = 'localhost'
DATABASE = 'projectfitbase'

In [None]:
# Define your PostgreSQL database connection string
conn_str = f'postgresql://{USERNAME}:{PASSWORD}@{SERVER}/{DATABASE}'

# Establish a connection to the PostgreSQL database
engine = create_engine(conn_str)

Inspector = inspect(engine)

# Create a sessionmaker to handle transactions
Session = sessionmaker(bind=engine)

# Loop through each DataFrame in data_frames list
for df in data_frames:
    # Extract the file name from the DataFrame
    table_name = df['file_name'].iloc[0].split('.')[0]

    # Begin a transaction
    with Session.begin() as session:
        # Check if the table already exists in the database
        table_exists = Inspector.has_table(table_name)

        if not table_exists:
            # If the table does not exist, create it
            df.head(0).to_sql(table_name, engine, index=False)  # Create empty table structure

        # Load the DataFrame into the database table
        df.to_sql(table_name, engine, if_exists='append', index=False)  # Append data to the table

print("ETL process completed successfully.")

EXTRACT, TRANSFORM & LOAD FROM DATABASE TO WAREHOUSE

# Transformation

<b>1. Handle Duplicates:</b>

Are there duplicate rows in the dataset that need to be removed?

How will the removal of duplicates impact the analysis or modeling tasks?

In [6]:
def handle_duplicates(data_frames):
    """
    Remove duplicates from each DataFrame in the list.
    
    Parameters:
        data_frames (list): List of DataFrames.
        
    Returns:
        list: List of DataFrames with duplicates removed.
    """
    cleansed_data_frames = []
    for df in data_frames:
        cleansed_data_frames.append(df.drop_duplicates())
    return cleansed_data_frames

<b>2. Handle Missing Values:</b>

Are there columns with over 90% missing values and rows with 100% missing values in the dataset?

How will the removal of duplicates impact the analysis or modeling tasks?

In [7]:
def handle_missing_values(data_frames, column_threshold=0.9):
    """
    Handle missing values in a list of DataFrames by dropping columns with over 90% missing values
    and rows that are 100% null.

    Parameters:
        data_frames (list): List of pandas DataFrames.
        column_threshold (float): Threshold for dropping columns based on the proportion of missing values.
    
    Returns:
        cleaned_data_frames (list): List of DataFrames with missing values handled.
    """
    cleaned_data_frames = []

    for df in data_frames:

        # Make a copy of the DataFrame
        df = df.copy()
        
        # Drop columns with over 90% missing values
        df.dropna(axis=1, thresh=int(column_threshold * len(df)), inplace=True)
        
        # Drop rows that are 100% null
        df.dropna(axis=0, how='all', inplace=True)
        
        # Reset index
        df.reset_index(drop=True, inplace=True)

        cleaned_data_frames.append(df)

    return cleaned_data_frames

<b>3. Data Type:</b> 

What are the data types of the values? (Numeric, categorical, or other?)

If numeric, are they continuous or discrete?

In [8]:
def classify_datatypes(dataframes):
    """
    Classify columns in a list of DataFrames into different data types: numeric, datetime, date, and categorical.

    Parameters:
        dataframes (list): A list of pandas DataFrames.

    Returns:
        numeric_columns_all (list): List of columns classified as numeric across all DataFrames.
        datetime_columns_all (list): List of columns classified as datetime across all DataFrames.
        date_columns_all (list): List of columns classified as date across all DataFrames.
        categorical_columns_all (list): List of columns classified as categorical across all DataFrames.
    """
    numeric_columns_all = []
    datetime_columns_all = []
    date_columns_all = []
    categorical_columns_all = []

    for df in dataframes:
        numeric_columns, datetime_columns, date_columns, categorical_columns = [], [], [], []

        for col in df.columns:
            if df[col].dtype in ['float64', 'int64']:
                numeric_columns.append(col)
            elif df[col].astype(str).str.match(r'\d+/\d+/\d+ \d+:\d+:\d+ [AP]M').all():
                datetime_columns.append(col)
            elif df[col].astype(str).str.match(r'\d+/\d+/\d+').all():
                date_columns.append(col)
            else:
                categorical_columns.append(col)

        numeric_columns_all.extend(numeric_columns)
        datetime_columns_all.extend(datetime_columns)
        date_columns_all.extend(date_columns)
        categorical_columns_all.extend(categorical_columns)

    return numeric_columns_all, datetime_columns_all, date_columns_all, categorical_columns_all, data_frames

<b>4. Data Context:</b>

What do the values represent? (Measurements, counts, percentages, etc.)

What is the context or domain of the data?

In [123]:
def classify_numeric_columns(data_frames, numeric_columns_all, threshold=0.06):
    """
    Classify numeric columns in a list of DataFrames into count columns and measurement columns.
    
    Parameters:
        data_frames (list): List of DataFrames.
        numeric_columns_all (list): List of all numeric columns across all DataFrames.
        threshold (float): Threshold to differentiate count columns from measurement columns based on the proportion of unique values.
                           Columns with a proportion of unique values less than or equal to the threshold are classified as count columns.
    
    Returns:
        count_columns_all (list): List of all count columns across all DataFrames.
        measurement_columns_all (list): List of all measurement columns across all DataFrames.
    """

    count_columns_all = []
    measurement_columns_all = []

    for df in data_frames:
        count_columns, measurement_columns = [], []

        for col in numeric_columns_all:
            if col in df.columns:  # Check if column exists in DataFrame
                unique_ratio = df[col].nunique() / len(df[col])
                print(f'unique_ratio: {unique_ratio} for {col}')
                
                if unique_ratio <= 0.06:
                    count_columns.append(col)
                    print(f'{col} from unique_ratio')
                    print(f'{count_columns} from count_columns')
                
                elif col.endswith('Id'):
                    count_columns.append(col)
                    print(f'{col} from endswith Id')
                    print(f'{count_columns} from count_columns')

                else:
                    measurement_columns.append(col)
                         
    count_columns_all.extend(count_columns)
    measurement_columns_all.extend(measurement_columns)

    return count_columns_all, measurement_columns_all, data_frames


In [124]:
#chain the functions

# 1. clean duplicates
cleansed_data_frames_duplicates = handle_duplicates(data_frames)

# 2. clean missing values using the cleaned data frames from duplicates
cleansed_data_frames_missing_values = handle_missing_values(cleansed_data_frames_duplicates)

# 3. Unpack the returned tuple and classify datatypes using cleansed data_frames from missing values
numeric_columns_all, datetime_columns_all, date_columns_all, categorical_columns_all, cleansed_data_frames_datatypes = classify_datatypes(cleansed_data_frames_missing_values)

# 4. Categorize numeric columns into count and measurements using unpacked cleansed data_frames from classify_datatypes function
count_columns_all, measurement_columns_all, cleansed_data_frames_classify_numbers = classify_numeric_columns(cleansed_data_frames_datatypes, numeric_columns_all)

unique_ratio: 0.035106382978723406 for Id
Id from unique_ratio
['Id'] from count_columns
unique_ratio: 0.8957446808510638 for TotalSteps
unique_ratio: 0.6542553191489362 for TotalDistance
unique_ratio: 0.652127659574468 for TrackerDistance
unique_ratio: 0.02021276595744681 for LoggedActivitiesDistance
LoggedActivitiesDistance from unique_ratio
['Id', 'LoggedActivitiesDistance'] from count_columns
unique_ratio: 0.35425531914893615 for VeryActiveDistance
unique_ratio: 0.22446808510638297 for ModeratelyActiveDistance
unique_ratio: 0.5223404255319148 for LightActiveDistance
unique_ratio: 0.009574468085106383 for SedentaryActiveDistance
SedentaryActiveDistance from unique_ratio
['Id', 'LoggedActivitiesDistance', 'SedentaryActiveDistance'] from count_columns
unique_ratio: 0.12978723404255318 for VeryActiveMinutes
unique_ratio: 0.08617021276595745 for FairlyActiveMinutes
unique_ratio: 0.35638297872340424 for LightlyActiveMinutes
unique_ratio: 0.5840425531914893 for SedentaryMinutes
unique_rat

unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 5.636846941084481e-06 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Value', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_

unique_ratio: 0.0029760557642692256 for Calories
Calories from unique_ratio
['Id', 'Calories'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Calories', 'Id'] from count_columns
unique_ratio: 0.0029760557642692256 for Calories
Calories from unique_ratio
['Id', 'Calories', 'Id', 'Calories'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Calories', 'Id', 'Calories', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Calories', 'Id', 'Calories', 'Id', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Calories', 'Id', 'Calories', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Calories', 'Id', 'Calories', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0029760557642692256 for Calories
Calories from unique_ratio
['Id', 'Calories', 'Id', 

unique_ratio: 0.05765765765765766 for Calories56
Calories56 from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Calories00', 'Calories01', 'Calories02', 'Calories03', 'Calories04', 'Calories05', 'Calories06', 'Calories07', 'Calories08', 'Calories09', 'Calories10', 'Calories11', 'Calories12', 'Calories13', 'Calories14', 'Calories15', 'Calories16', 'Calories17', 'Calories18', 'Calories19', 'Calories20', 'Calories21', 'Calories22', 'Calories23', 'Calories24', 'Calories25', 'Calories26', 'Calories27', 'Calories28', 'Calories29', 'Calories30', 'Calories31', 'Calories32', 'Calories33', 'Calories34', 'Calories35', 'Calories36', 'Calories37', 'Calories38', 'Calories39', 'Calories40', 'Calories41', 'Calories42', 'Calories43', 'Calories44', 'Calories45', 'Calories46', 'Calories47', 'Calories48', 'Calories49', 'Calories50', 'Calories51', 'Calories52', 'Calories53', 'Calories54', 'Calories55', 'Calories56'] from count_columns
unique_ratio: 0.05853545853545854 for Calori

unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Intensity', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.0015246015246015245 fo

unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id'] from count_columns
unique_ratio: 0.00010259659922449041 for METs
METs from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'METs'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'METs', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'METs', 'Id', 'Id'] from count_columns
unique_ratio: 2.4894763047118996e-05 for Id
Id from unique_ratio
['Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', 'Id', '

In [125]:
total_columns = 0
for df in cleansed_data_frames_classify_numbers:
    total_columns += len(df.columns)
print(total_columns)

277


In [126]:
len(count_columns_all)

4302

In [127]:
len(measurement_columns_all)

4302

In [112]:
len(numeric_columns_all)

239

In [34]:
print(len(datetime_columns_all))
print(len(date_columns_all))
print(len(categorical_columns_all))

14
4
19


In [76]:
count_columns_all

['Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'Id',
 'LogId']

In [84]:
numeric_columns_all

['Id',
 'TotalSteps',
 'TotalDistance',
 'TrackerDistance',
 'LoggedActivitiesDistance',
 'VeryActiveDistance',
 'ModeratelyActiveDistance',
 'LightActiveDistance',
 'SedentaryActiveDistance',
 'VeryActiveMinutes',
 'FairlyActiveMinutes',
 'LightlyActiveMinutes',
 'SedentaryMinutes',
 'Calories',
 'Id',
 'Calories',
 'Id',
 'SedentaryMinutes',
 'LightlyActiveMinutes',
 'FairlyActiveMinutes',
 'VeryActiveMinutes',
 'SedentaryActiveDistance',
 'LightActiveDistance',
 'ModeratelyActiveDistance',
 'VeryActiveDistance',
 'Id',
 'StepTotal',
 'Id',
 'Value',
 'Id',
 'Calories',
 'Id',
 'TotalIntensity',
 'AverageIntensity',
 'Id',
 'StepTotal',
 'Id',
 'Calories',
 'Id',
 'Calories00',
 'Calories01',
 'Calories02',
 'Calories03',
 'Calories04',
 'Calories05',
 'Calories06',
 'Calories07',
 'Calories08',
 'Calories09',
 'Calories10',
 'Calories11',
 'Calories12',
 'Calories13',
 'Calories14',
 'Calories15',
 'Calories16',
 'Calories17',
 'Calories18',
 'Calories19',
 'Calories20',
 'Calories2

In [46]:
measurement_columns_all

['TotalSteps',
 'TotalDistance',
 'TrackerDistance',
 'LoggedActivitiesDistance',
 'VeryActiveDistance',
 'ModeratelyActiveDistance',
 'LightActiveDistance',
 'SedentaryActiveDistance',
 'VeryActiveMinutes',
 'FairlyActiveMinutes',
 'LightlyActiveMinutes',
 'SedentaryMinutes',
 'Calories',
 'Calories',
 'SedentaryMinutes',
 'LightlyActiveMinutes',
 'FairlyActiveMinutes',
 'VeryActiveMinutes',
 'SedentaryActiveDistance',
 'LightActiveDistance',
 'ModeratelyActiveDistance',
 'VeryActiveDistance',
 'StepTotal',
 'Value',
 'Calories',
 'TotalIntensity',
 'AverageIntensity',
 'StepTotal',
 'Calories',
 'Calories00',
 'Calories01',
 'Calories02',
 'Calories03',
 'Calories04',
 'Calories05',
 'Calories06',
 'Calories07',
 'Calories08',
 'Calories09',
 'Calories10',
 'Calories11',
 'Calories12',
 'Calories13',
 'Calories14',
 'Calories15',
 'Calories16',
 'Calories17',
 'Calories18',
 'Calories19',
 'Calories20',
 'Calories21',
 'Calories22',
 'Calories23',
 'Calories24',
 'Calories25',
 'Calo

<b>4. Analysis Requirements:</b>

What are the requirements of your analysis or modeling tasks?

Are there specific algorithms or analyses that require certain transformations or preprocessing steps?

<b>1. Data Distribution:</b> 
    
Are the values within a reasonable range for the context of the project?

Are there any outliers that might skew the analysis?

Is the data distribution skewed or heavily tailed?

Does the data exhibit high kurtosis?

In [None]:
# Assuming data_frames is your list of DataFrames with 'file_name' column added

for df in data_frames:
    # Exclude 'Id' column from numerical columns
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    numerical_columns = numerical_columns.drop('Id')

    # Visualize data distribution
    for col in numerical_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(data=df, x=col, kde=True)
        plt.title(f'Data Distribution for {col} ({df["file_name"].iloc[0]})')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.show()

    # Calculate summary statistics
    summary_stats = df[numerical_columns].describe()
    #print(f'Summary Statistics for {df["file_name"].iloc[0]}:\n{summary_stats}')

    # Identify outliers
    for col in numerical_columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if not outliers.empty:
            print(f'Outliers in {col}')

            # Visualize outliers
            plt.figure(figsize=(8, 6))
            sns.boxplot(data=df[col])
            plt.title(f'Boxplot for {col} ({df["file_name"].iloc[0]})')
            plt.xlabel(col)
            plt.show()

    # Assess skewness and kurtosis
    skewness = df[numerical_columns].skew()
    kurtosis = df[numerical_columns].kurtosis()
    print(f'Skewness for {df["file_name"].iloc[0]}:\n{skewness}')
    print(f'Kurtosis for {df["file_name"].iloc[0]}:\n{kurtosis}')


Skewness:

Skewness measures the asymmetry of the distribution of values in a dataset.

*For a perfectly symmetric distribution, the skewness should be close to 0.

*Positive skewness indicates a longer right tail, so if you're aiming for symmetry, you'd want to reduce positive skewness towards 0.

*Negative skewness indicates a longer left tail, so if you're aiming for symmetry, you'd want to reduce negative skewness towards 0.


Kurtosis:

*Kurtosis measures the tailedness or peakedness of the distribution of values in a dataset.

*For a normal distribution, the kurtosis is 3.

*Kurtosis greater than 3 indicates heavier tails and a more peaked distribution (leptokurtic).

*Kurtosis less than 3 indicates lighter tails and a flatter distribution (platykurtic).

*In the output, for example, the skewness value for the StepTotal column is approximately 4.832214, indicating that the distribution is right-skewed.. The kurtosis value for the StepTotal column is approximately 34.200632, indicating a heavily tailed distribution with many values in the tails.

Based on these values, you can infer that the StepTotal column has a right-skewed distribution (positive skewness) and a heavily tailed distribution (high kurtosis). This information provides insights into the shape and characteristics of the distribution of values in the dataset.

Transformations that can be applied to correct skewness and kurtosis:

For Skewness:

*Right-skewed data (positive skewness): square root, logarithm, or reciprocal to reduce the skewness towards 0.

*Left-skewed data (negative skewness), you can apply transformations such as square or cube to reduce the skewness towards 0.

For Kurtosis:

*To reduce excessive kurtosis: square root, logarithm, or reciprocal to flatten the tails and reduce the peakiness of the distribution.

*If the distribution is too flat(low kurtosis): squaring or cubing to increase kurtosis.

<b>Transformation 1: transform skewed, kutotic and outliered datapoint</b>

In [None]:
# Loop through each DataFrame in data_frames list
for df in data_frames:
    # Check if column exists in the DataFrame
    for col in df.columns:
        if col == 'skewness' or col == 'kurtosis':
            # Apply transformations to correct skewness and kurtosis
            df[f'{col}Transformed'] = np.sqrt(df[col])  # Square root transformation for skewness
            df.drop(columns=col, inplace=True)
            
            # Check skewness and kurtosis after transformation
            skewness_after = df[f'{col}Transformed'].skew()
            kurtosis_after = df[f'{col}Transformed'].kurtosis()

            print(f'Skewness after transformation: {skewness_after}')
            print(f'Kurtosis after transformation: {kurtosis_after}')
            
            # Apply Box-Cox transformation
            if col == 'outlier':
                df[f'{col}Transformed'], lambda_value = boxcox(df[col] + 1)  # Adding 1 to handle zero values
                df.drop(columns=col, inplace=True)

<b>5. Normalization or Scaling:</b>

Do the values need to be normalized or scaled to ensure they are on a comparable scale?

<b>6. Relationships Between Variables:</b>

Are there nonlinear relationships between variables that need to be captured?

Do interaction terms or polynomial features need to be included?

<b>7. Model Assumptions:</b>

Do the statistical or machine learning models used have specific assumptions about the data distribution or relationships between variables?

Do transformations need to be applied to meet these assumptions?

<b>8. Interpretability:<b>

How will the transformations affect the interpretability of the data?

Will the transformations alter the original scale or meaning of the variables?

<b>9. Handling Zeros or Negative Values:</b>

How will zeros or negative values be handled in the data?

Are there transformations that are not applicable due to the presence of zeros or negative values?

<b>12. Data encoding:</b>
    
Do boolean values need to be encoded to 1 for True and 0 for False?

How will data encoding of boolean/or categorical values impact data interpretation and analysis?

Do you need to add comments to indicate the meaning of True (1) and False (0) in the respective columns?

I encoded the boolean values in the 'boolean_column' as integers, where True is represented as 1 and False as 0."

In [None]:
import pandas as pd

# Assuming df is your DataFrame and 'boolean_column' is the boolean column
df['boolean_column'] = df['boolean_column'].astype(int)

# Alternatively, you can use a mapping function
# df['boolean_column'] = df['boolean_column'].map({True: 1, False: 0})

<b>Datetime Decomposition:</b>

This term describes the process of breaking down a datetime variable into its constituent parts, such as date and time components.

Do datetime values need to be split into separate date and time columns?

How will splitting datetime values impact subsequent analysis or modeling tasks?

<b>Row Identification:</b>

This term describes the process of assigning a unique identifier to each row in a dataset.

Is it necessary to add a unique identifier to each row in the dataset?

How will the addition of a unique ID affect data manipulation and analysis?

"I performed row identification by adding a unique ID to each row in the DataFrame."

In [None]:
    # Initialize serial number
    serial_number = 1
    
    def generate_unique_id(file_name):
        """
        Generate a unique ID based on the file name and auto-incrementing serial number.
        
        Args:
        - file_name (str): Name of the file.
        
        Returns:
        - str: Unique ID.
        """
        nonlocal serial_number  # Use the serial_number variable from the outer function
        initials = ''.join(word[0].upper() for word in file_name.split('_'))
        unique_id = f'{initials}{serial_number}'
        serial_number += 1  # Increment serial number
        return unique_id

In [None]:
min(data_frames[6]['AverageIntensity'])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming outliers DataFrame contains the outlier observations
# Assuming the column with outliers is 'Column_with_outliers'
# Replace 'Column_with_outliers' with the actual column name

# Plot a boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(data=data_frames[6]['AverageIntensity'])
plt.title('Boxplot of Column with Outliers')
plt.xlabel('Column Name')
plt.show()


In [None]:
# Plot a boxplot
plt.figure(figsize=(8, 6))
sns.histplot(data=data_frames[6]['AverageIntensity'])
plt.title('Histplot of Column with Outliers')
plt.xlabel('Column Name')
plt.show()

In [None]:
from scipy.stats import boxcox

# Apply Box-Cox transformation
transformed_data, lambda_value = boxcox(df['AverageIntensity'] + 1)  # Adding 1 to handle zero values
df['AverageIntensityTransformed'] = transformed_data

In [None]:
data_frames[6]

In [None]:
# Plot a boxplot
plt.figure(figsize=(8, 6))
sns.histplot(data=data_frames[6]['AverageIntensityTransformed'])
plt.title('Histplot of Column with Outliers')
plt.xlabel('Column Name')
plt.show()

In [None]:
# Plot a boxplot
plt.figure(figsize=(8, 6))
sns.boxplot(data=data_frames[6]['AverageIntensityTransformed'])
plt.title('Histplot of Column with Outliers')
plt.xlabel('Column Name')
plt.show()

In [None]:
data_frames[17].info()

In [None]:
data_frames[17]['Fat'].isnull().sum()/len(data_frames[17]['Fat'])*100

In [None]:
if df[col].iloc[1] like "4/12/2016 12:00:00 AM" => split to date, time and am/pm column and convert to appropriate datatype

if df[col].iloc[1] like "4/12/2016" => convert to appropriate datatype (date)

In [None]:
SOURCE_USERNAME = 'fitbase'
SOURCE_PASSWORD = 'fitbase'
SOURCE_SERVER = 'localhost'
SOURCE_DATABASE = 'projectfitbase'

DEST_USERNAME = 'fitbase'
DEST_PASSWORD = 'fitbase'
DEST_SERVER = 'localhost'
DEST_DATABASE = 'projectfitbasetransformed'

In [None]:
# Source database connection string
source_conn_str = f'postgresql://{SOURCE_USERNAME}:{SOURCE_PASSWORD}@{SOURCE_SERVER}/{SOURCE_DATABASE}'

# Destination database connection string
dest_conn_str = f'postgresql://{DEST_USERNAME}:{DEST_PASSWORD}@{DEST_SERVER}/{DEST_DATABASE}'

# Establish connections to source and destination databases
source_engine = create_engine(source_conn_str)
dest_engine = create_engine(dest_conn_str)

# Reflect metadata from source database
metadata = MetaData()
metadata.reflect(bind=source_engine)

# List to store extracted dataframes
extracted_datax = []

# Loop through each table in the source database
for table_name, table in metadata.tables.items():
    # Extract data from the current table
    with source_engine.connect() as source_conn:
        extract_query = table.select()
        extracted_data = source_conn.execute(extract_query).fetchall()
        
        # Convert extracted data to DataFrame
        df = pd.DataFrame(extracted_data, columns=table.columns.keys())
        
        # Append DataFrame to the list
        extracted_datax.append(df)

    # Transform data as needed
    #transformed_data = transform_data(extracted_data)

    # Load transformed data into the destination database
    #with dest_engine.connect() as dest_conn:
        #dest_conn.execute(table.insert(), transformed_data)

print("ETL process completed successfully.")
