In [19]:
import numpy as np
import pandas as pd
from scipy import stats
import random

# Define ranges for each feature

age_range = [18, 80]
height_range = [140, 210]
weight_range = [40, 150]
income_range = [10000, 200000]
savings_range = [0, 500000]
debt_range = [0, 200000]

# Generate random data for each feature

age = np.random.randint(age_range[0], age_range[1]+1, size=1000)
height = np.random.randint(height_range[0], height_range[1]+1, size=1000)
weight = np.random.randint(weight_range[0], weight_range[1]+1, size=1000)
income = np.random.randint(income_range[0], income_range[1]+1, size=1000)
savings = np.random.randint(savings_range[0], savings_range[1]+1, size=1000)
debt = np.random.randint(debt_range[0], debt_range[1]+1, size=1000)

# Combine all features into a single dataframe

data = pd.DataFrame({'Age': age, 'Height': height, 'Weight': weight,
                     'Income': income, 'Savings': savings, 'Debt': debt})

# Print the first few rows of the data

print("Before Normalization")
print(data.head())

# Normalize the data using min-max normalization

age_norm = (age - age_range[0]) / (age_range[1] - age_range[0])
height_norm = (height - height_range[0]) / (height_range[1] - height_range[0])
weight_norm = (weight - weight_range[0]) / (weight_range[1] - weight_range[0])
income_norm = (income - income_range[0]) / (income_range[1] - income_range[0])
savings_norm = (savings - savings_range[0]) / (savings_range[1] - savings_range[0])
debt_norm = (debt - debt_range[0]) / (debt_range[1] - debt_range[0])

# Combine all normalized features into a single dataframe

data = pd.DataFrame({'Age': age_norm, 'Height': height_norm, 'Weight': weight_norm,
                     'Income': income_norm, 'Savings': savings_norm, 'Debt': debt_norm})
print("After Normalization")
print(data.head())

# Add some outliers to the data

for i in range(20):
    # Choose a random feature
    feature = random.choice(data.columns)
    # Choose a random row
    row = random.randint(0, len(data)-1)
    # Generate an extreme value that is outside the normal range for that feature
    extreme_value = random.uniform(1.5, 3) * data[feature].max()
    # Replace the value in the chosen row with the extreme value
    data.at[row, feature] = extreme_value
    
# Remove outliers using the z-score (same code as before)
z = np.abs(stats.zscore(data))
data_no_outliers = data[(z < 3).all(axis=1)]

# Print the number of rows before and after removing outliers
print(f"Number of rows before removing outliers: {len(data)}")
print(f"Number of rows after removing outliers: {len(data_no_outliers)}")
print(data_no_outliers.head())


#After Generating, Normalizing & filtering outliers, our data is ready for processing

Before Normalization
   Age  Height  Weight  Income  Savings    Debt
0   40     200      78   46024   136991  115523
1   62     193     143  148660   282685    2757
2   70     207     142  104976   178812  132024
3   19     153     135   83153   284193  124267
4   40     191     133  197929   111890  172794
After Normalization
        Age    Height    Weight    Income   Savings      Debt
0  0.354839  0.857143  0.345455  0.189600  0.273982  0.577615
1  0.709677  0.757143  0.936364  0.729789  0.565370  0.013785
2  0.838710  0.957143  0.927273  0.499874  0.357624  0.660120
3  0.016129  0.185714  0.863636  0.385016  0.568386  0.621335
4  0.354839  0.728571  0.845455  0.989100  0.223780  0.863970
Number of rows before removing outliers: 1000
Number of rows after removing outliers: 986
        Age    Height    Weight    Income   Savings      Debt
0  0.354839  0.857143  0.345455  0.189600  0.273982  0.577615
1  0.709677  0.757143  0.936364  0.729789  0.565370  0.013785
2  0.838710  0.957143  

In [25]:
from sklearn.model_selection import train_test_split

# Split data into features (X) and target variable (y)
X = data_no_outliers.drop('Income', axis=1)
Y = data_no_outliers['Income']

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("Training Set:")
print(X_train.head())
print(Y_train.head())

print("Testing Set:")
print(X_test.head())
print(Y_test.head())


Training Set:
          Age    Height    Weight   Savings      Debt
771  0.241935  0.028571  0.454545  0.898302  0.853845
337  0.403226  0.671429  0.945455  0.717228  0.051105
901  0.403226  0.485714  0.690909  0.533814  0.677650
534  0.161290  0.300000  0.090909  0.802218  0.385085
472  0.564516  0.128571  0.909091  0.869202  0.165930
771    0.094326
337    0.324984
901    0.143995
534    0.432884
472    0.068942
Name: Income, dtype: float64
Testing Set:
          Age    Height    Weight   Savings      Debt
621  0.080645  0.157143  0.836364  0.276248  0.648300
455  0.870968  0.114286  0.772727  0.636368  0.228195
740  1.000000  0.642857  0.509091  0.725618  0.553655
440  0.403226  0.257143  0.200000  0.756976  0.313015
278  0.822581  0.914286  0.063636  0.678100  0.041565
621    0.749279
455    0.253795
740    0.611411
440    0.458689
278    0.032405
Name: Income, dtype: float64
