In [None]:
import pandas as pd
import numpy as np
from ScoringPy import Processing
from pathlib import Path


In [None]:
# Resolve the raw-data directory relative to the notebook's working directory.
current_path = Path.cwd()

# Build the prefix with pathlib and render it with forward slashes: a literal
# "\\" is only a path separator on Windows, whereas "/" is accepted on both
# Windows and POSIX systems. The trailing "/" is kept on purpose because the
# loaders append the file name directly, e.g. f"{RowData_path}applications.feather".
RowData_path = f"{(current_path.parent / 'Data' / 'RowData').as_posix()}/"


In [None]:
def process_applications(data=None):
    """Add derived application-level columns to the applications table.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Applications table to process. When omitted, ``applications.feather``
        is read from ``RowData_path``. The frame must contain ``goods_price``,
        ``loan_amount`` and ``ext_source_score_1`` .. ``ext_source_score_3``.
        A frame passed in by the caller is copied and never modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus:

        * ``complicity_amount``     - ``goods_price - loan_amount``
        * ``complicity_percentage`` - ``complicity_amount / loan_amount * 100``
        * ``ext_source_count``      - number of non-null external source scores
    """
    if data is None:
        # Load the applications data from the Feather file located at RowData_path
        data = pd.read_feather(f"{RowData_path}applications.feather")
    else:
        # Work on a copy so the caller's frame is left untouched
        data = data.copy()

    # Difference between the goods price and the loan amount
    data["complicity_amount"] = data["goods_price"] - data["loan_amount"]

    # The same difference expressed as a percentage of the loan amount.
    # NOTE: a loan_amount of 0 is not special-cased, so such rows end up as inf/NaN.
    data["complicity_percentage"] = (data["complicity_amount"] / data["loan_amount"]) * 100

    # Number of external source scores that are present (non-null) on each row
    ext_source_columns = ["ext_source_score_1", "ext_source_score_2", "ext_source_score_3"]
    data['ext_source_count'] = data[ext_source_columns].notnull().sum(axis=1)

    return data

def process_contacts(data=None):
    """Add a per-row count of the available contact methods.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Contacts table to process. When omitted, ``contacts.feather`` is read
        from ``RowData_path``. The frame must contain the ``mobile``,
        ``emp_phone``, ``work_phone`` and ``phone`` columns. A frame passed in
        by the caller is copied and never modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``contact_mobile_count``, the row-wise sum of
        the four contact columns.
    """
    if data is None:
        # Load the contacts data from the Feather file located at RowData_path
        data = pd.read_feather(f"{RowData_path}contacts.feather")
    else:
        # Work on a copy so the caller's frame is left untouched
        data = data.copy()

    # Vectorised row-wise sum; equivalent to summing each row in a Python
    # lambda but without calling back into Python once per row.
    contact_columns = ["mobile", "emp_phone", "work_phone", "phone"]
    data['contact_mobile_count'] = data[contact_columns].sum(axis=1)

    return data

def process_creditinfo(data=None):
    """Aggregate the credit-history table into one feature row per ``loan_id``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Credit-history table to aggregate. When omitted, ``creditinfo.feather``
        is read from ``RowData_path``. The frame must contain ``loan_id``,
        ``credit_status``, ``credit_type``, ``overdue_days``,
        ``days_since_credit``, ``days_since_end`` and the amount columns listed
        in ``calculate_loan_stats``. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``loan_id`` (in order of first appearance) with,
        in this column order:

        1. ``sum_overdue_days_<status>`` / ``max_overdue_days_<status>`` for
           every ``credit_status`` value plus the synthetic ``Total``;
        2. ``days_since_first/last_loan_landed``,
           ``avg_days_between_landing_loans``,
           ``days_since_first/last_loan_ended``;
        3. number of records per ``credit_status`` (``*_loans_count``) and
           ``Total_loans_count``;
        4. number of records per ``credit_type`` (columns named after the raw
           type values) and ``total_count``;
        5. ``<column>_avg`` / ``<column>_max`` / ``<column>_min`` for every
           amount column.
    """
    # Bind the frame before the helpers are defined: they all close over `data`.
    if data is None:
        data = pd.read_feather(f"{RowData_path}creditinfo.feather")

    def overdue_days():
        """Sum and max of ``overdue_days`` per loan, per credit status and overall."""
        overdue_stats = {'sum_overdue_days': 'sum', 'max_overdue_days': 'max'}

        # Statistics per (loan_id, credit_status) pair
        loan_summary = data.groupby(['loan_id', 'credit_status'])['overdue_days'].agg(
            **overdue_stats
        ).reset_index()

        # The same statistics over all records of a loan, labelled 'Total'
        total_summary = data.groupby('loan_id')['overdue_days'].agg(
            **overdue_stats
        ).reset_index()
        total_summary['credit_status'] = 'Total'

        final_summary = (
            pd.concat([loan_summary, total_summary], axis=0)
            .sort_values(by=['loan_id', 'credit_status'])
            .reset_index(drop=True)
        )

        # Pivot so that every credit_status value (and 'Total') becomes a column
        pivoted_data = final_summary.pivot(index='loan_id', columns='credit_status',
                                           values=['sum_overdue_days', 'max_overdue_days'])

        # Flatten the (stat, status) column MultiIndex into "<stat>_<status>"
        pivoted_data.columns = [f"{stat}_{status}" for stat, status in pivoted_data.columns]

        return pivoted_data.reset_index()

    def loan_timings():
        """Min / max / average spacing of ``days_since_credit`` and min / max of ``days_since_end``."""
        # Built-in 'min' / 'max' replace Python lambdas that computed the same values.
        # NOTE(review): the "first"/"last" naming presumably assumes that smaller
        # days_since_credit values mean older loans - confirm the sign convention.
        landed = data.groupby('loan_id')['days_since_credit'].agg(
            days_since_first_loan_landed='min',
            days_since_last_loan_landed='max',
            # Mean gap between consecutive values once they are sorted
            avg_days_between_landing_loans=lambda x: x.sort_values().diff().mean()
        ).reset_index()

        # Only records with a known end date take part; loans without any such
        # record get NaN through the left merge below.
        ended = data[data["days_since_end"].notna()].groupby('loan_id')['days_since_end'].agg(
            days_since_first_loan_ended='min',
            days_since_last_loan_ended='max'
        ).reset_index()

        return pd.merge(landed, ended, on='loan_id', how='left')

    def count_records_by(category):
        """Count records per ``loan_id`` and ``category`` value, plus a ``total_count`` column."""
        counts = data.groupby(['loan_id', category]).size().reset_index(name='count')

        # Per-loan total over all category values, indexed by loan_id
        totals = counts.groupby('loan_id')['count'].sum()

        # One column per category value
        pivoted_data = counts.pivot(index='loan_id', columns=category, values='count').reset_index()

        # Attach the totals by loan_id rather than by row position, so the result
        # does not depend on both frames happening to share the same sort order.
        pivoted_data['total_count'] = pivoted_data['loan_id'].map(totals)

        # A category value that never occurs for a loan counts as 0
        return pivoted_data.fillna(0)

    def historical_loans_count():
        """Number of records per ``credit_status`` for every loan."""
        return count_records_by('credit_status').rename(columns={
            'Active': 'Active_loans_count',
            'Bad debt': 'Bad_loans_count',
            'Closed': 'Closed_loans_count',
            'Sold': 'Sold_loans_count',
            "total_count": "Total_loans_count",
        })

    def historical_loans_types():
        """Number of records per ``credit_type`` for every loan."""
        return count_records_by('credit_type')

    def smallest_positive_or_min(values):
        """Smallest strictly positive value, or the plain minimum when none is positive."""
        positive = values[values > 0]
        return values.min() if positive.empty else positive.min()

    def calculate_loan_stats():
        """Average, maximum and (positive) minimum of every amount column per loan."""
        columns = ['credit_amount', 'debt_amount', 'credit_limit', 'overdue_amount',
                   'annuity_amount', 'prolong_count', 'max_overdue_amount']

        stats_result = data[['loan_id']].drop_duplicates()  # Start with unique loan IDs

        for column in columns:
            # Name the outputs "<column>_avg/_max/_min" directly in the aggregation
            stats = data.groupby('loan_id')[column].agg(**{
                f'{column}_avg': 'mean',
                f'{column}_max': 'max',
                f'{column}_min': smallest_positive_or_min,
            }).reset_index()

            stats_result = stats_result.merge(stats, on='loan_id', how='left')

        return stats_result

    # Start from the distinct loan IDs and left-merge every feature group in a
    # fixed order, which also fixes the column order of the result.
    result = data[['loan_id']].drop_duplicates()

    feature_builders = (
        overdue_days,            # Step 1: overdue-day statistics per credit status
        loan_timings,            # Step 2: loan timing statistics
        historical_loans_count,  # Step 3: record counts by credit status
        historical_loans_types,  # Step 4: record counts by credit type
        calculate_loan_stats,    # Step 5: avg / max / min of the amount columns
    )
    for build_features in feature_builders:
        result = result.merge(build_features(), on='loan_id', how='left')

    return result

def process_documentation(data=None):
    """Add a per-row count of the provided documents.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Documentation table to process. When omitted, ``documentations.feather``
        is read from ``RowData_path``. The frame must contain the flag columns
        ``doc_1`` .. ``doc_20``. A frame passed in by the caller is copied and
        never modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``contact_mobile_count``, the row-wise sum of
        ``doc_1`` .. ``doc_20``.
    """
    if data is None:
        # Load the documentation data from the Feather file located at RowData_path
        data = pd.read_feather(f"{RowData_path}documentations.feather")
    else:
        # Work on a copy so the caller's frame is left untouched
        data = data.copy()

    # Vectorised row-wise sum over the doc flags; equivalent to summing each
    # row in a Python lambda but without one Python call per row.
    # NOTE(review): the output name "contact_mobile_count" looks copy-pasted from
    # process_contacts(); it is kept so existing consumers keep working, but it
    # will clash if both results are ever merged - consider renaming.
    doc_columns = [f"doc_{i}" for i in range(1, 21)]
    data['contact_mobile_count'] = data[doc_columns].sum(axis=1)

    return data


In [144]:
# Read the raw documentation table into `data` for ad-hoc inspection.
data = pd.read_feather(path=f"{RowData_path}documentations.feather")

In [145]:
# Show the documentation row(s) recorded for loan 100002.
data.loc[data["loan_id"] == 100002]

Unnamed: 0,loan_id,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,...,doc_11,doc_12,doc_13,doc_14,doc_15,doc_16,doc_17,doc_18,doc_19,doc_20
0,100002,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [148]:
# Load the documentation data from the Feather file located at RowData_path
data = pd.read_feather(f"{RowData_path}documentations.feather")

# Count the flagged documents on each row with a vectorised row-wise sum
# (same result as summing every row in a Python lambda, without the per-row call).
# NOTE(review): "contact_mobile_count" mirrors the column name used by
# process_documentation(); the name looks copy-pasted from the contacts step.
data['contact_mobile_count'] = data[[f"doc_{i}" for i in range(1, 21)]].sum(axis=1)

data

Unnamed: 0,loan_id,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,...,doc_12,doc_13,doc_14,doc_15,doc_16,doc_17,doc_18,doc_19,doc_20,contact_mobile_count
0,100002,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
1,100003,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
2,100004,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,0
3,100006,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
4,100007,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,1
307507,456252,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
307508,456253,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1
307509,456254,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1


In [None]:




















# Read the credit-card balance table and show the rows recorded for loan 214804.
data = pd.read_feather(path=f"{RowData_path}credit_card_balance.feather")
data.loc[data["loan_id"] == 214804]
