In [45]:
import pandas as pd

In [46]:
file_path = '03_Library Systembook.csv'
df = pd.read_csv(file_path)
print(df.head())

    Id                                     Books Book checkout Book Returned  \
0  1.0                       Catcher in the Rye   "20/02/2023"    25/02/2023   
1  2.0          Lord of the rings the two towers  "24/03/2023"    21/03/2023   
2  3.0  Lord of the rings the return of the kind  "29/03/2023"    25/03/2023   
3  4.0                                The hobbit  "02/04/2023"    25/03/2023   
4  5.0                                     Dune   "02/04/2023"    25/03/2023   

  Days allowed to borrow  Customer ID  
0                2 weeks          1.0  
1                2 weeks          2.0  
2                2 weeks          3.0  
3                2 weeks          4.0  
4                2 weeks          5.0  


In [47]:
print(df.dtypes)

Id                        float64
Books                      object
Book checkout              object
Book Returned              object
Days allowed to borrow     object
Customer ID               float64
dtype: object


In [49]:
#can strip the extra quotes using str.replace() or str.strip() before converting:
df['Book checkout'] = df['Book checkout'].str.replace('"', '', regex=False)


In [50]:
print(df)

      Id                                     Books Book checkout  \
0    1.0                       Catcher in the Rye     20/02/2023   
1    2.0          Lord of the rings the two towers    24/03/2023   
2    3.0  Lord of the rings the return of the kind    29/03/2023   
3    4.0                                The hobbit    02/04/2023   
4    5.0                                     Dune     02/04/2023   
..   ...                                       ...           ...   
109  NaN                                       NaN           NaN   
110  NaN                                       NaN           NaN   
111  NaN                                       NaN           NaN   
112  NaN                                       NaN           NaN   
113  NaN                                       NaN           NaN   

    Book Returned Days allowed to borrow  Customer ID  
0      25/02/2023                2 weeks          1.0  
1      21/03/2023                2 weeks          2.0  
2      25/03/20

In [51]:
import pandas as pd
from datetime import datetime
import calendar

# Global list to store logs of fixed dates (to be printed later)
fixed_rows = []

def fix_invalid_date(date_str):
    # Skip NaN or invalid types
    if pd.isna(date_str) or not isinstance(date_str, str):
        return pd.NaT

    try:
        # Try to parse the date normally
        return datetime.strptime(date_str, "%d/%m/%Y")
    except ValueError:
        # Handle invalid dates like 32/05/2023
        try:
            day, month, year = map(int, date_str.split('/'))
            # Get the last valid day for the given month/year
            last_day = calendar.monthrange(year, month)[1]
            # Use last valid day if day is invalid
            if day > last_day:
                # Log the original and corrected date
                fixed_rows.append((date_str, f"{last_day}/{month}/{year}"))
                day = last_day
            return datetime(year, month, day)
        except Exception:
            # If still invalid, return NaT
            return pd.NaT

# Function to clean and fix the date column in the DataFrame
def clean_and_fix_dates(df, column_name):
    # Apply the date fixing function to the column
    df[column_name] = df[column_name].apply(fix_invalid_date)
    
    # Print log of fixed rows
    if fixed_rows:
        print("The following dates were fixed:")
        for old_date, new_date in fixed_rows:
            print(f"Fixed: {old_date} -> {new_date}")
    else:
        print("No invalid dates were found.")
    
    return df

# Example usage:
# df = pd.read_csv('your_data.csv')  # Load your dataframe
# df = clean_and_fix_dates(df, 'book checkout')

# To check the cleaned DataFrame and the logs
# print(df.head())


# Example usage:
# df = pd.read_csv('your_data.csv')  # Load your dataframe
# df = clean_and_fix_dates(df, 'book checkout')

# To check the cleaned DataFrame and the logs
# print(df.head())


In [40]:
print(df.dtypes)

Id                        float64
Books                      object
Book checkout              object
Book Returned              object
Days allowed to borrow     object
Customer ID               float64
dtype: object


Explanation of Changes:

pd.isna(date_str): This checks if the value is NaN (not a number). If the date is missing or invalid (e.g., NaN), we return NaT (Not a Time).

not isinstance(date_str, str): This checks if the value is a string before attempting to parse it. If it's not a string (for example, a float), we simply skip parsing and return NaT.

NaT: This ensures that invalid or missing dates are handled gracefully.

How it Works:

The function will now handle NaN or missing values in the book checkout column and will not attempt to parse them.

If the value is valid, it will be converted to a datetime.

Invalid values like "32/05/2023" will be fixed to "31/05/2023", and these fixes will be logged.

If the column contains non-date values or invalid types (like float), they will be skipped and marked as NaT.

In [52]:
df = clean_and_fix_dates(df, 'Book checkout')
# To check the cleaned DataFrame and the logs
print(df.head())

The following dates were fixed:
Fixed: 32/05/2023 -> 31/5/2023
    Id                                     Books Book checkout Book Returned  \
0  1.0                       Catcher in the Rye     2023-02-20    25/02/2023   
1  2.0          Lord of the rings the two towers    2023-03-24    21/03/2023   
2  3.0  Lord of the rings the return of the kind    2023-03-29    25/03/2023   
3  4.0                                The hobbit    2023-04-02    25/03/2023   
4  5.0                                     Dune     2023-04-02    25/03/2023   

  Days allowed to borrow  Customer ID  
0                2 weeks          1.0  
1                2 weeks          2.0  
2                2 weeks          3.0  
3                2 weeks          4.0  
4                2 weeks          5.0  


In [54]:
#can strip the extra quotes using str.replace() or str.strip() before converting:
df['Book Returned'] = df['Book Returned'].str.replace('"', '', regex=False)

In [55]:
df = clean_and_fix_dates(df, 'Book Returned')
# To check the cleaned DataFrame and the logs
print(df.head())

The following dates were fixed:
Fixed: 32/05/2023 -> 31/5/2023
    Id                                     Books Book checkout Book Returned  \
0  1.0                       Catcher in the Rye     2023-02-20    2023-02-25   
1  2.0          Lord of the rings the two towers    2023-03-24    2023-03-21   
2  3.0  Lord of the rings the return of the kind    2023-03-29    2023-03-25   
3  4.0                                The hobbit    2023-04-02    2023-03-25   
4  5.0                                     Dune     2023-04-02    2023-03-25   

  Days allowed to borrow  Customer ID  
0                2 weeks          1.0  
1                2 weeks          2.0  
2                2 weeks          3.0  
3                2 weeks          4.0  
4                2 weeks          5.0  


In [56]:
# Function to validate that checkout date > returned date
def validate_checkout_return_dates(df, checkout_col, returned_col):
    # Convert both columns to datetime if not already
    df[checkout_col] = pd.to_datetime(df[checkout_col], errors='coerce')
    df[returned_col] = pd.to_datetime(df[returned_col], errors='coerce')

    # Create DataFrame to store rows where validation fails (checkout <= returned)
    invalid_dates = df[df[checkout_col] <= df[returned_col]]
    return invalid_dates

# Main function to clean and validate data
def clean_and_validate_data(df, checkout_col, returned_col, customer_id_col):
    # Step 1: Clean and fix the dates in the checkout column
    # df[checkout_col] = df[checkout_col].apply(fix_invalid_date)
    # df[returned_col] = df[returned_col].apply(fix_invalid_date)

    # Step 2: Perform the checkout > returned date validation
    invalid_dates = validate_checkout_return_dates(df, checkout_col, returned_col)

    # Step 3: Identify rows with NaN Customer ID
    df_to_be_checked = df[df[customer_id_col].isna()]

    # Step 4: Remove rows with NaN Customer ID from the main DataFrame
    df_cleaned = df.dropna(subset=[customer_id_col])

    # Step 5: Return cleaned data, rows with invalid dates, and rows with missing customer IDs
    return df_cleaned, invalid_dates, df_to_be_checked

# Example Usage:
# df = pd.read_csv('your_data.csv')  # Load your dataframe

# Specify column names
checkout_col = 'Book checkout'
returned_col = 'Book Returned'
customer_id_col = 'Customer ID'

# Apply the cleaning and validation function
df_cleaned, invalid_dates, df_to_be_checked = clean_and_validate_data(df, checkout_col, returned_col, customer_id_col)

# Check the results
print("Cleaned DataFrame:\n", df_cleaned.head())
print("\nRows with Invalid Dates (checkout <= returned):\n", invalid_dates.head())
print("\nRows with Missing Customer ID (NaN):\n", df_to_be_checked.head())

Cleaned DataFrame:
     Id                                     Books Book checkout Book Returned  \
0  1.0                       Catcher in the Rye     2023-02-20    2023-02-25   
1  2.0          Lord of the rings the two towers    2023-03-24    2023-03-21   
2  3.0  Lord of the rings the return of the kind    2023-03-29    2023-03-25   
3  4.0                                The hobbit    2023-04-02    2023-03-25   
4  5.0                                     Dune     2023-04-02    2023-03-25   

  Days allowed to borrow  Customer ID  
0                2 weeks          1.0  
1                2 weeks          2.0  
2                2 weeks          3.0  
3                2 weeks          4.0  
4                2 weeks          5.0  

Rows with Invalid Dates (checkout <= returned):
       Id                Books Book checkout Book Returned  \
0    1.0  Catcher in the Rye     2023-02-20    2023-02-25   
5    6.0         Little Women    2023-04-02    2023-05-01   
8    9.0             Catch

In [57]:
file_path2 = '03_Library SystemCustomers.csv'

# Read the CSV file into a pandas DataFrame
df2 = pd.read_csv(file_path2)

# Display the first 5 rows of the DataFrame
print(df2.head())

   Customer ID   Customer Name
0          1.0        Jane Doe
1          2.0      John Smith
2          3.0      Dan Reeves
3          NaN             NaN
4          5.0  William Holden


In [58]:
def drop_nan_customer_id(df):
    """
    Drop rows where 'Customer ID' is NaN from the DataFrame.
    
    Parameters:
    - df: The DataFrame to clean.

    Returns:
    - A new DataFrame with rows containing NaN in 'Customer ID' removed.
    """
    # Drop rows where 'Customer ID' is NaN
    cleaned_df = df.dropna(subset=['Customer ID'])
    
    return cleaned_df

In [59]:
# Clean the data by removing rows where Customer ID is NaN
df2_cleaned = drop_nan_customer_id(df2)

# Check the result
print(df2_cleaned.head())

   Customer ID   Customer Name
0          1.0        Jane Doe
1          2.0      John Smith
2          3.0      Dan Reeves
4          5.0  William Holden
5          6.0   Jaztyn Forest


In [64]:
import pandas as pd
from sqlalchemy import create_engine
import pyodbc

# Connection details
server = 'localhost'  # SQL Server address (use localhost if it's on the same machine)
database = 'master'  # Connect to the master database first to create the new database

# Define the connection string
connection_string = (
    'mssql+pyodbc:///?'
    'Driver=ODBC Driver 17 for SQL Server;'
    f'Server={server};'
    f'Database={database};'
    'Trusted_Connection=yes;'
)

# Function to create a new database
def create_database(engine, db_name):
    try:
        with engine.connect() as conn:
            conn.execute(f"IF NOT EXISTS (SELECT * FROM sys.databases WHERE name = '{db_name}') "
                         f"BEGIN CREATE DATABASE {db_name} END")
        print(f"Database '{db_name}' is ready.")
    except Exception as e:
        print(f"Error creating database: {e}")

# Function to upload DataFrames to SQL Server
def upload_dataframe_to_sql(df, table_name, engine):
    try:
        df.to_sql(table_name, con=engine, if_exists='replace', index=False)
        print(f"DataFrame uploaded successfully to '{table_name}' table.")
    except Exception as e:
        print(f"Error uploading DataFrame to {table_name}: {e}")

# Try to connect to the SQL Server and create the 'tempdb_qa' database
try:
    engine = create_engine(connection_string)
    
    # Check if the connection is successful by executing a simple query
    with engine.connect() as conn:
        result = conn.execute("SELECT 1")
        print("Successfully connected to the SQL Server.")
    
    # Create the 'tempdb_qa' database if it doesn't exist
    create_database(engine, 'tempdb_qa')

    # Now, let's connect to the 'tempdb_qa' database
    connection_string = (
        'mssql+pyodbc:///?'
        'Driver=ODBC Driver 17 for SQL Server;'
        f'Server={server};'
        f'Database=tempdb_qa;'
        'Trusted_Connection=yes;'
    )
    engine = create_engine(connection_string)
    
    # Sample DataFrames (df and df2) - replace these with your actual DataFrames
    df = pd.DataFrame({
        'Id': [1, 2, 3],
        'Books': ['Book A', 'Book B', 'Book C'],
        'Book checkout': ['01/01/2022', '02/01/2022', '03/01/2022'],
        'Book Returned': ['05/01/2022', '06/01/2022', '07/01/2022']
    })

    df2 = pd.DataFrame({
        'Customer ID': [101, 102, None],
        'Customer Name': ['John Doe', 'Jane Doe', 'Alice Smith']
    })

    # Upload DataFrames to the SQL Server
    upload_dataframe_to_sql(df, 'df_table', engine)
    upload_dataframe_to_sql(df2, 'df2_table', engine)

except Exception as e:
    print(f"Error connecting to the SQL Server: {e}")


Error connecting to the SQL Server: (pyodbc.InterfaceError) ('IM002', '[IM002] [Microsoft][ODBC Driver Manager] Data source name not found and no default driver specified (0) (SQLDriverConnect)')
(Background on this error at: https://sqlalche.me/e/20/rvf5)


  engine = create_engine(connection_string)


In [61]:
import pyodbc
print(pyodbc.drivers())


['SQL Server', 'SQL Server Native Client RDA 11.0', 'ODBC Driver 17 for SQL Server', 'Microsoft Access Driver (*.mdb, *.accdb)', 'Microsoft Excel Driver (*.xls, *.xlsx, *.xlsm, *.xlsb)', 'Microsoft Access Text Driver (*.txt, *.csv)']


In [None]:
# Upload df DataFrame to SQL Server
df.to_sql('df_table', con=engine, if_exists='replace', index=False)

# Upload df2 DataFrame to SQL Server
df2.to_sql('df2_table', con=engine, if_exists='replace', index=False)

print("DataFrames uploaded successfully to tempdb_qa database!")