In [3]:
# Import necessary libraries for data analysis
import pandas as pd
import numpy as np

In [4]:
# Column labels applied to the spreadsheet, grouped by the kind of field they describe.
col_names = (
    # Leading placeholder column, then the "T-" date parts, the case code and number
    ["empty", "T-Day", "T-Month", "T-Year", "Code", "No"]
    # "R-" date parts
    + ["R-Day", "R-Month", "R-Year"]
    # Originating-court fields and the case type
    + ["Origin Court", "Origin Code", "Origin No", "Origin Year", "Specific Case Type"]
    # Seven judge slots: "Judge 1" ... "Judge 7"
    + [f"Judge {slot}" for slot in range(1, 8)]
    # Hearing purpose, outcome and adjournment reason
    + ["Case coming for", "Case OutCome", "Reason of adjournment"]
    # "N-" date parts
    + ["N-Day", "N-Month", "N-Year"]
    # "P-" and "D-" columns
    + ["P-M", "P-F", "P-Org.", "D-M", "D-F", "D-Org."]
    # Remaining descriptive columns
    + ["Legal Rep", "Witness-P", "Witness-D", "Crim-Custody", "Other Details"]
)
# Show how many labels were defined
len(col_names)

38

In [None]:
# Read the Excel workbook into a DataFrame, labelling its columns with `col_names`.
# NOTE(review): with pandas' default header handling the first sheet row is presumably
# consumed as a header and replaced by `col_names` — confirm against the workbook layout.
# The path is relative, so the notebook must be run from the directory containing "Data/".
sample_data = pd.read_excel("Data/Sample Data.xlsx", names = col_names)

In [None]:
# Discard the rows labelled 0-3, which do not hold data.
# Caution: once the index has been reset, re-running this cell removes four more rows.
sample_data = sample_data.drop(index=[0, 1, 2, 3])

In [None]:
# Renumber the rows from 0 and discard the old index labels
sample_data = sample_data.reset_index(drop=True)

In [None]:
# Remove the placeholder column labelled "empty"
sample_data = sample_data.drop(columns=["empty"])

In [None]:
# Show the (rows, columns) dimensions of the DataFrame after the clean-up steps above
sample_data.shape

In [None]:
# Summarise every column's data type and non-null count to spot sparsely populated fields
sample_data.info()

In [None]:
# Report and remove rows that are exact copies of an earlier row.
def duplicated_data(data):
    """Drop fully duplicated rows from ``data`` and return the same DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very object that was passed in. When duplicates were present, only
        the first occurrence of each row is kept and the index is renumbered
        from 0; otherwise the frame is returned untouched.

    A short status message is printed in both cases.
    """
    # True for every row that repeats a row seen earlier (all columns compared).
    repeat_mask = data.duplicated(subset=None, keep="first")

    # Guard clause: nothing to remove, so hand the frame back as it is.
    if not repeat_mask.any():
        print("No duplicates found.")
        return data

    # The count stored under the True key is the number of repeated rows.
    num_duplicates = repeat_mask.value_counts()[True]
    print(f"{num_duplicates} duplicates found.")

    # Keep the first occurrence of each row, mutating the caller's frame.
    data.drop_duplicates(subset=None, keep="first", inplace=True)
    print("Duplicates removed.")

    # Close the gaps left in the index by the removed rows.
    data.reset_index(drop=True, inplace=True)
    return data

# Remove duplicated rows from `sample_data`. The function edits the frame in place and
# returns that same object, so the reassignment simply keeps the name bound to it.
sample_data = duplicated_data(sample_data)


In [None]:
def missing_values(data):
    """Summarise the missing values of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` that contains at least one missing
        value, indexed by column name, with two columns:

        * ``"Missing Values"`` - number of missing entries;
        * ``"Percentage(%)"`` - share of missing entries, rounded to three
          decimal places.

        Rows are ordered by missing count, largest first. Columns with the
        same count keep the order they have in ``data``.
    """
    # Count the nulls once; both report columns derive from this Series.
    null_counts = data.isnull().sum()

    # Guard the denominator so a frame without rows gives 0% instead of NaN.
    row_count = max(len(data), 1)
    percentage_miss = np.round(null_counts / row_count * 100, 3)

    # Both Series share the same index, so the report is aligned by construction.
    missing = pd.DataFrame({"Missing Values": null_counts, "Percentage(%)": percentage_miss})

    # Filter on the exact count, not the rounded percentage: in a large frame a
    # handful of nulls rounds to 0.000% and would otherwise vanish from the report.
    missing = missing[missing["Missing Values"] > 0]

    # Sort after assembling the frame so the ordering cannot be lost during
    # index alignment; a stable sort keeps ties in their original column order.
    return missing.sort_values("Missing Values", ascending=False, kind="mergesort")
# Build the missing-value report for the current state of `sample_data` and display it
missing_data = missing_values(sample_data)
missing_data

In [None]:
# Drop the columns of a DataFrame that contain no values at all.
def remove_columns_with_100_percent_missing(data):
    """Remove every column of ``data`` whose entries are all missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without its entirely empty
        columns. A frame with no rows is returned unchanged.
    """
    # With zero rows every column is trivially "all missing"; keep the schema.
    if len(data) == 0:
        return data

    # Test the values directly. The percentage reported by `missing_values` is
    # rounded to three decimals, so in a large frame a column that still holds
    # a few values can read as 100.0 and would be dropped by mistake.
    all_missing = data.isnull().all()
    columns_to_remove = data.columns[all_missing.to_numpy()]

    # Remove the empty columns from the caller's frame.
    data.drop(columns=columns_to_remove, inplace=True)

    return data

# Strip the columns of `sample_data` that are entirely missing. The function edits the
# frame in place and returns it, so the reassignment keeps the name bound to the same object.
sample_data = remove_columns_with_100_percent_missing(sample_data)


In [None]:
# Recompute and display the missing-value report now that the fully empty columns are gone
missing_data = missing_values(sample_data)
missing_data

In [8]:
def _date_part_as_text(part):
    """Render one day/month/year column as text that a date format can parse."""
    # pandas stores whole numbers as floats when a column has missing entries,
    # so 5 becomes "5.0" once stringified; strip that suffix so "%d", "%m" and
    # "%Y" can match. Text values such as "Jan" pass through unchanged.
    return part.astype(str).str.replace(r"\.0$", "", regex=True)


def datetime(data, new_col, day, month, year, date_format, index):
    """Combine separate day, month and year columns into one datetime column.

    Note: this function shares its name with the standard-library ``datetime``
    module and shadows it if both are present in the same namespace.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the three date-part columns. It is modified in place.
    new_col : str
        Name of the datetime column to create.
    day, month, year : str
        Names of the columns holding the date parts. Each may contain text or
        numbers; the three source columns are dropped afterwards.
    date_format : str
        ``strftime``-style format describing the text ``"<day>-<month>-<year>"``,
        for example ``"%d-%b-%Y"``.
    index : int
        Column position at which ``new_col`` is inserted. The position is
        applied before the three source columns are removed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. Rows whose parts do not form a
        date matching ``date_format`` hold ``NaT`` in ``new_col``.
    """
    # Join the three parts as "<day>-<month>-<year>", treating each part as text
    # so that numeric months work as well as month names.
    data[new_col] = (
        _date_part_as_text(data[day])
        + "-"
        + _date_part_as_text(data[month])
        + "-"
        + _date_part_as_text(data[year])
    )

    # Parse with the caller's format; anything unparseable becomes NaT.
    data[new_col] = pd.to_datetime(data[new_col], format=date_format, errors="coerce")

    # The new column was appended at the end; move it to the requested position.
    data.insert(index, new_col, data.pop(new_col))

    # The separate parts are redundant once the combined column exists.
    data.drop(columns=[day, month, year], inplace=True)

    return data


In [None]:
# Merge T-Day / T-Month / T-Year into a single "Today's Date" datetime column, inserted at
# position 0. The format expects day, abbreviated month name and four-digit year.
sample_data = datetime(sample_data, "Today's Date", 'T-Day', 'T-Month', 'T-Year', "%d-%b-%Y", 0)

In [None]:
# Merge R-Day / R-Month / R-Year into a single "Registration Date" datetime column,
# inserted at position 3 (counted before the three source columns are dropped).
sample_data = datetime(sample_data, "Registration Date", "R-Day", "R-Month", "R-Year", "%d-%b-%Y", 3)

In [None]:
# Merge N-Day / N-Month / N-Year into a single "Next Court Date" datetime column,
# inserted at position 9 (counted before the three source columns are dropped).
sample_data = datetime(sample_data, "Next Court Date", "N-Day", "N-Month", "N-Year", "%d-%b-%Y", 9)

In [None]:
# List every column's dtype to confirm the three combined date columns are datetimes
sample_data.dtypes

In [None]:
# Tally how often each distinct value occurs in the "Case coming for" column
sample_data["Case coming for"].value_counts()