In [64]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data_url = 'loan_applic.csv'  # Replace this with your dataset URL or local path
df = pd.read_csv(data_url)

# Split the data into features and target variable
X = df.drop(columns=['Application Number', 'Outcome',

                    ])  # Exclude non-predictive columns
X = X.replace(',', '', regex=True)
#X = X.fillna('Missing')
# Encode categorical labels
label_encoder = LabelEncoder()
df['Outcome'] = label_encoder.fit_transform(df['Outcome'])
y = df['Outcome']  # Assuming 'Outcome' is the target variable for regression

# Train the model
cat_features = [
                     'Application: Remittance Frequency',
                     'Primary Contact Gender',
                     'Application: Close Date',
                     'crime_record',
                     'Has Website',
                     'Industry',
                     'Office Space',
                     'Position',
                     'Shipping State',
                     'Type',
]
# Replace NaN values with 'Missing' in specified columns
for col in cat_features:
    X[col] = X[col].fillna('Missing')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize CatBoostRegressor
model = CatBoostRegressor(iterations=1000,  # Number of trees (boosting iterations)
                          learning_rate=0.1,  # Step size shrinkage used in update to prevent overfitting
                          depth=6,  # Depth of trees
                          loss_function='RMSE',  # Loss function for regression
                          random_state=42)



model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100,
          cat_features=cat_features
         )

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

0:	learn: 0.4412956	test: 0.4396796	best: 0.4396796 (0)	total: 9.44ms	remaining: 9.43s
100:	learn: 0.3971960	test: 0.4219656	best: 0.4219124 (93)	total: 781ms	remaining: 6.95s
200:	learn: 0.3758538	test: 0.4213942	best: 0.4209813 (151)	total: 1.53s	remaining: 6.08s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.420981329
bestIteration = 151

Shrink model to first 152 iterations.
Mean Squared Error: 0.17722527935623264


In [12]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data_url = 'loan_applic.csv'  # Replace this with your dataset URL or local path
df = pd.read_csv(data_url)

In [25]:
nan_counts = df.isna().sum()

# Calculate percentage of NaN values in each column
total_values = df.shape[0]
nan_percentages = (nan_counts / total_values) * 100

# Identify columns with more than 50% NaN values
columns_with_nan = nan_percentages[nan_percentages > 50]

# Print columns with more than 50% NaN values
if not columns_with_nan.empty:
    print("Columns with more than 50% NaN values:")
    print(columns_with_nan)
else:
    print("No columns have more than 50% NaN values.")

Columns with more than 50% NaN values:
crime_record    62.45023
dtype: float64


In [26]:
df=df.drop(columns='crime_record')

In [28]:
# Count unique values in each column
unique_counts = df.apply(pd.Series.nunique)

# Identify constant columns (columns with only one unique value)
constant_columns = unique_counts[unique_counts == 1]

# Print constant columns
if not constant_columns.empty:
    print("Constant columns:")
    print(constant_columns)
else:
    print("No constant columns found.")

Constant columns:
Application: BPA Broker Negotiation    1
dtype: int64


In [30]:
df=df.drop(columns='Application: BPA Broker Negotiation')

In [38]:
features_to_remove = set()

# Iterate through high correlation pairs and add features to remove set
for pair in high_corr_pairs:
    features_to_remove.add(pair[0])  # Add first feature from pair
    features_to_remove.add(pair[1])  # Add second feature from pair

In [44]:
len(features_to_remove)

16

In [46]:
len(numeric_columns)

29

In [47]:
df = df.drop(columns=features_to_remove)

In [37]:
import pandas as pd
import numpy as np

numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Create a new DataFrame with only numeric columns
df = df[numeric_columns]

# Assuming df is your DataFrame containing the dataset
# You can modify the code based on your actual DataFrame

# Calculate Pearson correlation matrix
pearson_corr = df.corr(method='pearson')

# Calculate Spearman correlation matrix
spearman_corr = df.corr(method='spearman')

# Identify highly correlated features
threshold = 0.8  # You can adjust this threshold as needed
high_corr_pairs = []

# Check Pearson correlation
for i in range(len(pearson_corr.columns)):
    for j in range(i):
        if abs(pearson_corr.iloc[i, j]) > threshold:
            high_corr_pairs.append((pearson_corr.columns[i], pearson_corr.columns[j], pearson_corr.iloc[i, j]))

# Check Spearman correlation
for i in range(len(spearman_corr.columns)):
    for j in range(i):
        if abs(spearman_corr.iloc[i, j]) > threshold:
            high_corr_pairs.append((spearman_corr.columns[i], spearman_corr.columns[j], spearman_corr.iloc[i, j]))

# Print highly correlated feature pairs
if high_corr_pairs:
    print("Highly correlated feature pairs:")
    for pair in high_corr_pairs:
        print(pair)
else:
    print("No highly correlated features found.")


Highly correlated feature pairs:
('Amount', 'Application: Funded Amount', 0.9991724186821161)
('Brokers submitted All Time', 'Applications received All Time', 0.858126909603403)
('Brokers submitted last 1 month', 'Applications received by last 1 Month', 0.8672610185664392)
('Brokers submitted last 3 months', 'Applications received by last 3 Months', 0.8601228745235698)
('Brokers submitted last 6 months', 'Applications received by last 6 Months', 0.8431886450007653)
('Brokers submitted last 6 months', 'Brokers submitted last 3 months', 0.803122453898034)
('Factor Rate', 'Application: Buy Rate', 0.8328359698916049)
('Months', 'Days', 0.9883291763807092)
('Satisfactory', 'Number of Trade Lines', 0.9510773184354311)
('Amount', 'Application: Funded Amount', 0.9997013282678524)
('Brokers submitted All Time', 'Applications received All Time', 0.8140475861758882)
('Brokers submitted last 1 month', 'Applications received by last 1 Month', 0.8116814592838457)
('Factor Rate', 'Application: Buy Ra

In [8]:
df = df.drop(columns=['Application Number', 'Outcome'])

In [10]:
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame containing the dataset
# You can modify the code based on your actual DataFrame

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each column
for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column contains categorical data
        df[col] = label_encoder.fit_transform(df[col].astype(str))

# Now all categorical columns are converted to numerical values


In [18]:
df['Application Number'].unique().shape

(9795,)

In [19]:
df['Application Number'].shape

(9795,)

In [23]:
df=df.drop(columns='Application Number')

In [24]:
df

Unnamed: 0,Application: Buy Rate,Application: Funded Amount,Application: BPA Broker Negotiation,Application: Origination Fee,Application: Remittance Frequency,Primary Contact Gender,Application: Close Date,customer Age,Amount,crime_record,...,Satisfactory,Shipping State,Sum of Monthly Personal Debt,Time In Business Actual,Type,Volume - 4 Months Ago,Volume - 6 Months Ago,Volume - Three Months Ago,Yearly Total Sales,Outcome
0,1.250,9000.0,0,2.0,Daily,,1/3/2017,35.0,9000.0,,...,40.0,NJ,4792.0,24.0,New Deal,0.00,0,11605.73,128688.72,Neg
1,1.250,9500.0,0,2.0,Daily,Male,1/3/2017,47.0,9500.0,,...,24.0,OH,2867.0,17.7,New Deal,35205.42,29288.31,30209.32,351830.88,Neg
2,1.250,6000.0,0,2.0,Daily,Male,1/3/2017,58.0,6000.0,,...,15.0,NY,1331.0,17.5,New Deal,6299.58,4799.65,9281.73,132113.4,Neg
3,1.250,15000.0,0,2.0,Daily,Male,1/3/2017,62.0,15000.0,,...,8.0,MN,16491.0,5.1,New Deal,0.00,0,44114.52,448156.8,Pos
4,1.250,14000.0,0,2.0,Daily,Female,1/3/2017,64.0,14000.0,traffic,...,22.0,PA,1373.0,1.5,New Deal,0.00,0,16168.17,205465.32,Pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9790,1.270,200000.0,0,3.0,Daily,Male,12/31/2018,51.0,200000.0,traffic,...,28.0,TX,2400.0,19.8,Paid Renewal,,,152267.44,4913317.56,Pos
9791,1.280,250000.0,0,3.0,Daily,Male,12/31/2018,54.0,250000.0,,...,15.0,LA,7184.0,28.0,Potential Dupe,,,1034011.52,10866517.08,Pos
9792,1.310,36000.0,0,3.0,Daily,Male,12/31/2018,46.0,36000.0,,...,15.0,PA,1865.0,5.6,Paid Renewal,,,,507148.68,Pos
9793,1.375,125000.0,0,3.0,Daily,Male,12/31/2018,54.0,125000.0,traffic,...,13.0,CA,5138.0,18.6,New Deal,207219.75,,134922.09,2118071.88,Pos


In [48]:
df

Unnamed: 0,Application: Origination Fee,customer Age,Average Daily Negatives,Avg Number of Monthly Deposits,Credit Score,Daily Bank Balance v/s Daily Payment,Inquiry Count,Open Bankruptcy,Public Records,Sales to Payment,Sum of Monthly Personal Debt,Time In Business Actual,Volume - 4 Months Ago
0,2.0,35.0,0.00,3.33,741.0,30.9225,24.0,0.0,0.0,18.00,4792.0,24.0,0.00
1,2.0,47.0,0.25,36.50,624.0,24.7196,18.0,0.0,0.0,18.00,2867.0,17.7,35205.42
2,2.0,58.0,0.67,46.67,590.0,10.1844,51.0,0.0,0.0,6.50,1331.0,17.5,6299.58
3,2.0,62.0,0.67,53.33,650.0,11.3280,1.0,0.0,3.0,0.00,16491.0,5.1,0.00
4,2.0,64.0,0.67,7.00,504.0,15.1043,13.0,0.0,0.0,0.00,1373.0,1.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9790,3.0,51.0,0.82,6.55,741.0,55.0047,4.0,0.0,0.0,0.00,2400.0,19.8,
9791,3.0,54.0,4.00,22.00,728.0,91.1508,12.0,0.0,0.0,0.00,7184.0,28.0,
9792,3.0,46.0,0.33,51.33,726.0,18.1586,0.0,0.0,0.0,0.00,1865.0,5.6,
9793,3.0,54.0,1.00,20.00,642.0,8.9881,18.0,1.0,1.0,0.00,5138.0,18.6,207219.75


In [53]:
X = df
#X = X.fillna('Missing')
# Encode categorical labels
label_encoder = LabelEncoder()
df2['Outcome'] = label_encoder.fit_transform(df2['Outcome'])
y = df2['Outcome']  # Assuming 'Outcome' is the target variable for regression

# Train the model
cat_features = [
                     'Application: Close Date',
                     'crime_record',
                     'Has Website',
                     'Industry',
                     'Office Space',
                     'Position',
                     'Shipping State',
                     'Type',
]
# Replace NaN values with 'Missing' in specified columns
for col in cat_features:
    X[col] = X[col].fillna('Missing')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: 'Application: Close Date'

In [50]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data_url = 'loan_applic.csv'  # Replace this with your dataset URL or local path
df2 = pd.read_csv(data_url)
