<a href="https://colab.research.google.com/github/san-0108/student-management/blob/main/AAIT_Retirement_income.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TEAM AAIT-Retirement Savings Estimator
Team members:
Saahil Shaikh, Sachit Desai, Sanjana Sharma, Sehaj Saluja

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from ydata_profiling import ProfileReport


In [None]:
# Load the training, validation and testing splits from the project's GitHub repo.
df, df_validation, df_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/SaahilShaikh17/AAIT-Retirement/main/nedgroup_training_data.csv",
        "https://raw.githubusercontent.com/SaahilShaikh17/AAIT-Retirement/main/nedgroup_validation_data.csv",
        "https://raw.githubusercontent.com/SaahilShaikh17/AAIT-Retirement/main/nedgroup_testing_data.csv",
    )
)

In [None]:
# Show every column when a DataFrame is displayed (no truncation with "...").
pd.options.display.max_columns = None


In [None]:
# Preview the first rows of the training data as loaded.
df.head()


In [None]:
# (rows, columns) of the training data as loaded.
df.shape

In [None]:
# 'Unnamed: 0' is removed from all three splits.
# NOTE(review): presumably a saved row index from the CSV export — confirm.
for frame in (df, df_validation, df_test):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Preview the training data again after dropping the 'Unnamed: 0' column.
df.head()

In [None]:
# Column dtypes and non-null counts for the training data.
df.info()

In [None]:
# Summary statistics for the training data.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of distinct values in each column (low counts suggest categorical variables).
df.nunique()

Based on the unique-value counts above, the following columns are categorical: Gender, Financially Support Partner, financially support children, years_supporting_child, Years_supporting_Someone_else, Has_emergency_savings, Confidence_Level, spouse_Gender.

So we have 8 categorical variables.


## Data Issues

In [None]:
# Report how many fully duplicated rows each split contains
# (order: training, validation, testing).
for frame in (df, df_validation, df_test):
    print(frame.duplicated().sum())

In [None]:
# Remove duplicated rows from the training and validation splits.
# df_test is not deduplicated here.
for frame in (df, df_validation):
    frame.drop_duplicates(inplace=True)

In [None]:

# Re-count duplicated rows per split (training, validation, testing)
# to confirm the effect of the previous cell.
for frame in (df, df_validation, df_test):
    print(frame.duplicated().sum())

In [None]:
# Training data shape after duplicate rows were dropped.
df.shape

## EDA



# Data Preprocessing


<h2>Handling Missing Values</h2>

In [None]:
# Missing values per column before any imputation.
df.isnull().sum()

In [None]:
# Dtypes and non-null counts before any imputation.
df.info()

In [None]:
# Fill the missing spouse fields in all three splits with explicit sentinels.
#
# The result of fillna() is assigned back to the column instead of calling
# fillna(inplace=True) on `frame[col]`: the in-place form acts on an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves the
# frame unchanged once Copy-on-Write is enabled.
for frame in (df, df_test, df_validation):
    # Replace missing values with 'NA' in SPOUSE_GENDER
    frame['SPOUSE_GENDER'] = frame['SPOUSE_GENDER'].fillna('NA')

    # Insert 0 for null values in spouse retirement age, then store as integers
    frame['SPOUSE_RETIREMENT_AGE'] = frame['SPOUSE_RETIREMENT_AGE'].fillna(0).astype('int64')

In [None]:
# Spot-check the first 15 spouse retirement ages after nulls were replaced with 0.
df['SPOUSE_RETIREMENT_AGE'].head(15)

In [None]:
# Remaining nulls in the three spouse columns (date of birth is not handled yet).
df[['SPOUSE_GENDER','SPOUSE_RETIREMENT_AGE','SPOUSE_DATE_OF_BIRTH']].isnull().sum()

In [None]:
# Peek at the spouse date-of-birth values before they are parsed.
df['SPOUSE_DATE_OF_BIRTH'].head(15)

In [None]:
# Parse 'SPOUSE_DATE_OF_BIRTH' into pandas datetimes in every split.
# errors='coerce' turns unparseable entries into NaT instead of raising.
for frame in (df, df_test, df_validation):
    frame['SPOUSE_DATE_OF_BIRTH'] = pd.to_datetime(
        frame['SPOUSE_DATE_OF_BIRTH'],
        errors='coerce',
    )

# Show the parsed column for the training data
df['SPOUSE_DATE_OF_BIRTH'].head(15)

Converting all DOB to year of birth

In [None]:
# Reduce the spouse date of birth to the birth year in all three splits.
# The nullable 'Int64' dtype is used so missing dates (NaT) survive the
# conversion; they are then replaced with 0.
#
# fillna() is chained and assigned back rather than called with inplace=True
# on `frame[col]`, which warns on pandas 2.x and is a no-op under
# Copy-on-Write.
for frame in (df, df_test, df_validation):
    frame['SPOUSE_DATE_OF_BIRTH'] = (
        frame['SPOUSE_DATE_OF_BIRTH'].dt.year.astype('Int64').fillna(0)
    )

In [None]:
# Spouse birth year after conversion (0 stands for a missing date).
df['SPOUSE_DATE_OF_BIRTH'].head()

In [None]:
# Null counts for the three spouse columns after imputation (expected to be zero).
df[['SPOUSE_GENDER','SPOUSE_RETIREMENT_AGE','SPOUSE_DATE_OF_BIRTH']].isnull().sum()

<h2>Standardisation</h2>

In [None]:
# Extract numerical columns for standardization.
# NOTE(review): columns are chosen by dtype only, so a numeric target column
# is standardised as well and downstream metrics are then in standardised
# units — confirm this is intended.
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Create a StandardScaler instance
scaler = StandardScaler()

# Learn mean/variance from the training data only, then apply that same
# transformation to the test and validation data. Re-fitting the scaler on each
# split would scale every split with different statistics, so identical raw
# values would map to different features and held-out scores would be biased.
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
df_test[numerical_columns] = scaler.transform(df_test[numerical_columns])
df_validation[numerical_columns] = scaler.transform(df_validation[numerical_columns])

In [None]:
# Names of the object-typed (string) columns; this index is reused later when
# the categorical columns are label encoded.
objs = df.select_dtypes(include=["object"]).columns
print(objs)

Of the columns listed above:

- GENDER and SPOUSE_GENDER should be one-hot encoded.

- FINANCIALLY_SUPPORT_PARTNER, FINANCIALLY_SUPPORT_CHILDREN, HAS_EMERGENCY_SAVINGS, CRITICAL_ILLNESS and SPOUSE_DATE_OF_BIRTH should be label encoded.

<h2>Encoding of categorical variables</h2>

In [None]:
# One-hot-encoding of the two gender columns.
df = pd.get_dummies(df, columns=['GENDER', 'SPOUSE_GENDER'])

# get_dummies only creates indicator columns for categories present in the
# frame it is given, so a category absent from a split would leave that split
# with a different column set than the training data. Align validation and
# test to the training columns (same names, same order); an indicator column
# missing from a split is added and filled with 0.
df_validation = pd.get_dummies(
    df_validation, columns=['GENDER', 'SPOUSE_GENDER']
).reindex(columns=df.columns, fill_value=0)
df_test = pd.get_dummies(
    df_test, columns=['GENDER', 'SPOUSE_GENDER']
).reindex(columns=df.columns, fill_value=0)

In [None]:
# Inspect the test frame's columns and dtypes after one-hot encoding.
df_test.info()

In [None]:
# Label Encoding
encode = LabelEncoder()

# `objs` was collected before one-hot encoding; the slice skips its first and
# last entries.
# NOTE(review): presumably those are GENDER and SPOUSE_GENDER (already
# one-hot encoded above) — confirm against the printed `objs`.
for col in objs[1:-1]:
    # Fit once on the values of all three splits so a category gets the same
    # integer code everywhere. Fitting separately per split assigns codes by
    # that split's own sorted values, so the same category could be encoded
    # differently in train, validation and test.
    encode.fit(pd.concat([df[col], df_validation[col], df_test[col]]).astype(str))
    df[col] = encode.transform(df[col].astype(str))
    df_validation[col] = encode.transform(df_validation[col].astype(str))
    df_test[col] = encode.transform(df_test[col].astype(str))

df_test.head()

In [None]:
# Inspect the test frame's dtypes after label encoding.
df_test.info()

## Feature Selection

## Model Training

In [None]:
# Separate features (X_*) from the target column (Y_*) for each split.
X_train, Y_train = df.drop(columns=['TARGET_MONTHLY_INCOME']), df['TARGET_MONTHLY_INCOME']
X_test, Y_test = df_test.drop(columns=['TARGET_MONTHLY_INCOME']), df_test['TARGET_MONTHLY_INCOME']
X_val, Y_val = (
    df_validation.drop(columns=['TARGET_MONTHLY_INCOME']),
    df_validation['TARGET_MONTHLY_INCOME'],
)
print(Y_train.shape)

Our target variable is TARGET_MONTHLY_INCOME, which is the targeted monthly income the customer should have in order to reach their expected retirement fund value.

### Decision Trees

In [None]:
# Feature subset used for the decision-tree model below.
final_cols = [
    'RETIREMENT_AGE',
    'RETIREMENT_FUND_VALUE',
    'DEPT_VALUE',
    'SPARE_CASH_VALUE',
    'OTHER_MONTHLY_SUPPORTING_VALUE',
    'CRITICAL_ILLNESS',
    'SPOUSE_GENDER_Female',
    'SPOUSE_GENDER_Male',
    'SPOUSE_GENDER_NA',
    'SPOUSE_RETIREMENT_AGE',
    'SPOUSE_DATE_OF_BIRTH',
    'INTERNATIONAL_CASH_UNIT_TRUST',
    'SA_EQUITY_LAP',
    'SA_BOND_LAP',
    'SA_CASH_LAP',
    'INTERNATIONAL_CASH_LAP',
    'LA_EAC_PA_INCL_VAT',
    'UNIT_TRUST_EAC_PA_INCL_VAT',
]

In [None]:
target_variable = 'TARGET_MONTHLY_INCOME'

# Split the training data 80:20. `test_size` is the fraction that is HELD OUT,
# so 0.2 keeps 80% for fitting (0.8 would train on only 20% of the rows).
x_train, x_test, y_train, y_test = train_test_split(df[final_cols], df[target_variable], test_size=0.2, random_state=42)

# Initialize the Decision Tree Regressor (fixed seed for reproducibility)
decision_tree = DecisionTreeRegressor(random_state=42)

# Train the Decision Tree Regressor on the 80% training portion
decision_tree.fit(x_train, y_train)

# Make predictions on the 20% hold-out portion
y_pred = decision_tree.predict(x_test)

# Evaluate the model on the hold-out portion
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on hold-out split: {mse}')

### Random Forest Regressor

In [None]:
# Baseline random forest with default hyper-parameters, fitted on the full
# training split; the seed makes the result reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=Y_train)

In [None]:
# Score the fitted random forest on the validation split.
Y_pred = model.predict(X_val)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(Y_val, Y_pred))
mae = mean_absolute_error(Y_val, Y_pred)
r2 = r2_score(Y_val, Y_pred)
print("The Root mean squared error is: ",rmse)
print("The Mean absolute error is: ",mae)
print("The R-2 Score is: ",r2)

In [None]:
# Y_pred was produced from X_val, so it must be compared against Y_val
# (comparing with Y_test pairs predictions with unrelated rows).
print("First 10 Predicted vs Real Values:")
for i in range(min(10, len(Y_pred))):  # guard against fewer than 10 rows
    print(f"Predicted: {Y_pred[i]:.2f}, Actual: {Y_val.iloc[i]:.2f}")


In [None]:
# K-fold cross-validation of a default random forest on the training split.

# Define the number of folds (f)
f = 5

# Initialize KFold object (shuffled, seeded so the folds are reproducible)
kf = KFold(n_splits=f, shuffle=True, random_state=42)

# Initialize empty lists to store performance metrics
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X_train):
    # Split the training data into this fold's training and validation parts
    X_fold_train = X_train.iloc[train_index]
    Y_fold_train = Y_train.iloc[train_index]

    X_fold_val = X_train.iloc[val_index]
    Y_fold_val = Y_train.iloc[val_index]

    # Train a fresh model on the training fold. A separate name is used so the
    # `model` fitted on the full training split is not overwritten.
    fold_model = RandomForestRegressor(random_state=42)
    fold_model.fit(X_fold_train, Y_fold_train)

    # Make predictions on the validation fold
    predictions_fold_val = fold_model.predict(X_fold_val)

    # Calculate metrics for the validation fold.
    # RMSE = sqrt(MSE); `squared=False` was removed in scikit-learn 1.6.
    rmse_fold = np.sqrt(mean_squared_error(Y_fold_val, predictions_fold_val))
    mae_fold = mean_absolute_error(Y_fold_val, predictions_fold_val)
    r2_fold = r2_score(Y_fold_val, predictions_fold_val)

    # Append metrics to the lists
    rmse_scores.append(rmse_fold)
    mae_scores.append(mae_fold)
    r2_scores.append(r2_fold)

# Calculate average metrics across all folds
average_rmse = np.mean(rmse_scores)
average_mae = np.mean(mae_scores)
average_r2 = np.mean(r2_scores)

print('Average RMSE:', average_rmse)
print('Average MAE:', average_mae)
print('Average R2:', average_r2)


In [None]:
# K-fold cross-validation carried out entirely inside the testing split,
# reusing the `kf` splitter defined earlier.
# NOTE(review): the models here are trained on folds of the test data, so these
# scores do not measure how the model fitted on the training split generalises
# — confirm this is the intended experiment.

# Initialize empty lists to store performance metrics for testing set
test_rmse_scores = []
test_mae_scores = []
test_r2_scores = []

# Perform K-fold cross-validation on the testing set
for train_index, val_index in kf.split(X_test):
    # Split testing data into training and validation sets
    X_fold_train_test = X_test.iloc[train_index]
    Y_fold_train_test = Y_test.iloc[train_index]

    X_fold_val_test = X_test.iloc[val_index]
    Y_fold_val_test = Y_test.iloc[val_index]

    # Train the model on the training fold of testing set
    model_test = RandomForestRegressor(random_state=42)
    model_test.fit(X_fold_train_test, Y_fold_train_test)

    # Make predictions on the validation fold of testing set
    predictions_fold_val_test = model_test.predict(X_fold_val_test)

    # Calculate metrics for the validation fold of testing set.
    # RMSE = sqrt(MSE); `squared=False` was removed in scikit-learn 1.6.
    rmse_fold_test = np.sqrt(mean_squared_error(Y_fold_val_test, predictions_fold_val_test))
    mae_fold_test = mean_absolute_error(Y_fold_val_test, predictions_fold_val_test)
    r2_fold_test = r2_score(Y_fold_val_test, predictions_fold_val_test)

    # Append metrics to the lists
    test_rmse_scores.append(rmse_fold_test)
    test_mae_scores.append(mae_fold_test)
    test_r2_scores.append(r2_fold_test)

# Calculate average metrics across all folds for testing set
average_test_rmse = np.mean(test_rmse_scores)
average_test_mae = np.mean(test_mae_scores)
average_test_r2 = np.mean(test_r2_scores)

print('Average Test RMSE:', average_test_rmse)
print('Average Test MAE:', average_test_mae)
print('Average Test R2:', average_test_r2)
print(test_r2_scores)

In [None]:
# Show the hyper-parameters of the current `model` estimator as a dict.
model.get_params()

In [None]:
# Shape of the training frame after preprocessing and encoding.
df.shape

In [None]:
# Random forest capped at depth 25, with bootstrap sampling and a fixed seed.
tree = RandomForestRegressor(
    random_state=42,
    bootstrap=True,
    max_depth=25,
)

In [None]:
# Selected feature names, and the training frame restricted to them.
final_cols = [
    'RETIREMENT_AGE',
    'RETIREMENT_FUND_VALUE',
    'DEPT_VALUE',
    'SPARE_CASH_VALUE',
    'OTHER_MONTHLY_SUPPORTING_VALUE',
    'CRITICAL_ILLNESS',
    'SPOUSE_GENDER_Female',
    'SPOUSE_GENDER_Male',
    'SPOUSE_GENDER_NA',
    'SPOUSE_RETIREMENT_AGE',
    'SPOUSE_DATE_OF_BIRTH',
    'INTERNATIONAL_CASH_UNIT_TRUST',
    'SA_EQUITY_LAP',
    'SA_BOND_LAP',
    'SA_CASH_LAP',
    'INTERNATIONAL_CASH_LAP',
    'LA_EAC_PA_INCL_VAT',
    'UNIT_TRUST_EAC_PA_INCL_VAT',
]
df_selected = df.loc[:, final_cols]

In [None]:
# Features are every column except the target; hold out 20% of the rows.
x = df.drop(columns='TARGET_MONTHLY_INCOME')
y = df['TARGET_MONTHLY_INCOME']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
print(y_train.shape)