# Income Prediction Using Decision Tree Regression

## Loading The Main Libraries

In [225]:
import numpy as np
import pandas as pd

## Data Collection

In [226]:
# Location of the raw income dataset (Colab upload directory)
DATA_PATH = '/content/data.csv'
data = pd.read_csv(DATA_PATH)

In [227]:
# Preview the first rows of the raw dataset
data.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210


## EDA

In [228]:
# Summarise the columns: dtype and non-null count for each
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Age                             10000 non-null  int64 
 1   Education_Level                 10000 non-null  object
 2   Occupation                      10000 non-null  object
 3   Number_of_Dependents            10000 non-null  int64 
 4   Location                        10000 non-null  object
 5   Work_Experience                 10000 non-null  int64 
 6   Marital_Status                  10000 non-null  object
 7   Employment_Status               10000 non-null  object
 8   Household_Size                  10000 non-null  int64 
 9   Homeownership_Status            10000 non-null  object
 10  Type_of_Housing                 10000 non-null  object
 11  Gender                          10000 non-null  object
 12  Primary_Mode_of_Transportation  10000 non-null 

In [229]:
# Count the missing values in every column
data.isna().sum()

Unnamed: 0,0
Age,0
Education_Level,0
Occupation,0
Number_of_Dependents,0
Location,0
Work_Experience,0
Marital_Status,0
Employment_Status,0
Household_Size,0
Homeownership_Status,0


In [230]:
# Number of distinct values per column (helps tell categorical from continuous features)
data.nunique()

Unnamed: 0,0
Age,53
Education_Level,4
Occupation,5
Number_of_Dependents,6
Location,3
Work_Experience,51
Marital_Status,3
Employment_Status,3
Household_Size,7
Homeownership_Status,2


In [231]:
# Print the frequency table of every feature column (the Income target is excluded)
feature_names = data.drop(columns='Income').columns
for feature in feature_names:
    print(data[feature].value_counts())

Age
43    223
66    216
62    215
40    212
64    209
34    209
52    207
45    205
38    203
35    200
49    200
30    200
46    199
42    197
19    196
29    194
21    194
39    194
56    192
68    192
57    191
59    191
32    191
54    191
37    190
41    189
51    189
22    188
61    188
25    188
20    188
33    187
53    186
23    185
36    183
47    183
55    183
28    182
70    182
50    182
69    178
26    177
18    176
65    175
31    175
67    174
27    174
58    173
63    169
60    162
24    161
44    159
48    153
Name: count, dtype: int64
Education_Level
Bachelor's     4058
High School    2959
Master's       2482
Doctorate       501
Name: count, dtype: int64
Occupation
Healthcare    3035
Technology    2407
Finance       1525
Others        1521
Education     1512
Name: count, dtype: int64
Number_of_Dependents
5    1745
3    1712
1    1651
0    1642
4    1629
2    1621
Name: count, dtype: int64
Location
Urban       7037
Suburban    1951
Rural       1012
Name: count, dtype:

## Data Preprocessing

### Handling the missing values

      - No missing values

### Encoding the categorical variables

In [232]:
# Names of all object-dtype (text) columns; these are the ones that need encoding
categorical_cols = data.select_dtypes(include=['object']).columns
display(categorical_cols)

Index(['Education_Level', 'Occupation', 'Location', 'Marital_Status',
       'Employment_Status', 'Homeownership_Status', 'Type_of_Housing',
       'Gender', 'Primary_Mode_of_Transportation'],
      dtype='object')

      ▶ categorical_cols = [ Occupation  ,  Location  ,  Marital_Status  , Employment_Status  ,  Homeownership_Status  ,

      Type_of_Housing  ,  Gender  ,  Primary_Mode_of_Transportation  ,  Education_Level  ]
      

      ▶ Encode the Education_Level using Ordinal Encoding  because it's an ordinal variable


      ▶ Encode the remaining categorical variables using the OneHot Encoding method

      ▶ First, we will encode Education_Level using Ordinal Encoding because it's an ordinal variable

In [233]:
# Frequency of each education label before encoding
data['Education_Level'].value_counts()

Unnamed: 0_level_0,count
Education_Level,Unnamed: 1_level_1
Bachelor's,4058
High School,2959
Master's,2482
Doctorate,501


In [234]:
# First ten raw labels, kept for comparison with the encoded values later
data['Education_Level'].head(10)

Unnamed: 0,Education_Level
0,Master's
1,High School
2,Bachelor's
3,High School
4,Bachelor's
5,High School
6,Master's
7,Master's
8,Master's
9,Bachelor's


In [235]:
# Last ten raw labels, kept for comparison with the encoded values later
data['Education_Level'].tail(10)

Unnamed: 0,Education_Level
9990,Bachelor's
9991,Bachelor's
9992,Master's
9993,Bachelor's
9994,High School
9995,High School
9996,Master's
9997,Doctorate
9998,High School
9999,High School


In [236]:
# Encode Education_Level with an ordinal mapping because its categories have a natural order
# (High School < Bachelor's < Master's < Doctorate).

EducationLevelMap = {
    "High School": 0,
    "Bachelor's": 1,
    "Master's": 2,
    "Doctorate": 3
}

education_raw = data['Education_Level']

# Only map while the column still holds the raw labels. Re-running this cell on an
# already-encoded column would otherwise turn every value into NaN.
if not pd.api.types.is_numeric_dtype(education_raw):
    education_encoded = education_raw.map(EducationLevelMap)

    # .map() silently yields NaN for labels missing from the dictionary; fail loudly instead.
    unexpected_labels = education_raw[education_encoded.isna() & education_raw.notna()].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(f"Unmapped Education_Level values: {list(unexpected_labels)}")

    data['Education_Level'] = education_encoded

In [237]:
# First ten values after ordinal encoding
data['Education_Level'].head(10)

Unnamed: 0,Education_Level
0,2
1,0
2,1
3,0
4,1
5,0
6,2
7,2
8,2
9,1


In [238]:
# Last ten values after ordinal encoding
data['Education_Level'].tail(10)

Unnamed: 0,Education_Level
9990,1
9991,1
9992,2
9993,1
9994,0
9995,0
9996,2
9997,3
9998,0
9999,0


      ▶ Encoding the remaining categorical variables using OneHot Encoding

      ▶ No need to drop any dummy column after OneHot Encoding, because Decision Tree Regression

       isn't affected by multicollinearity

In [239]:
# Re-select the columns that are still object dtype; these are one-hot encoded in the next cell.
# NOTE(review): assumes the ordinal-encoding cell has already run, so Education_Level is no longer object.
categorical_cols = data.select_dtypes( include = 'object' ).columns

In [240]:
# Expand each remaining categorical column into one 0/1 integer indicator column per category
data = pd.get_dummies( data , columns = categorical_cols , dtype = int )      # OneHot Encoding for the remaining categorical columns

In [241]:
# Preview the dataset after encoding
data.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Income,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,...,Homeownership_Status_Rent,Type_of_Housing_Apartment,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking
0,56,2,5,21,7,72510,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,69,0,0,4,7,75462,0,1,0,0,...,0,1,0,0,0,1,1,0,0,0
2,46,1,1,1,7,71748,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
3,32,0,2,32,1,74520,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
4,60,1,3,15,4,640210,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1


In [242]:
# Re-inspect the column dtypes after encoding
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 31 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   Age                                            10000 non-null  int64
 1   Education_Level                                10000 non-null  int64
 2   Number_of_Dependents                           10000 non-null  int64
 3   Work_Experience                                10000 non-null  int64
 4   Household_Size                                 10000 non-null  int64
 5   Income                                         10000 non-null  int64
 6   Occupation_Education                           10000 non-null  int64
 7   Occupation_Finance                             10000 non-null  int64
 8   Occupation_Healthcare                          10000 non-null  int64
 9   Occupation_Others                              10000 non-null  int64
 10 

.

### Splitting the Data into input & output

In [243]:
# Separate the prediction target from the input features
target_column = 'Income'
y = data[target_column]
x = data.drop(columns=[target_column])

In [244]:
# Preview the feature matrix
x.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,Occupation_Technology,...,Homeownership_Status_Rent,Type_of_Housing_Apartment,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking
0,56,2,5,21,7,0,0,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,69,0,0,4,7,0,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0
2,46,1,1,1,7,0,0,0,0,1,...,0,0,1,0,1,0,0,1,0,0
3,32,0,2,32,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
4,60,1,3,15,4,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,1


In [245]:
# Preview the target values
y.head()

Unnamed: 0,Income
0,72510
1,75462
2,71748
3,74520
4,640210


.

### Splitting the data into training and testing

In [246]:
from sklearn.model_selection import train_test_split

In [247]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [248]:
# Preview the training features
x_train.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,Occupation_Technology,...,Homeownership_Status_Rent,Type_of_Housing_Apartment,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking
7389,42,0,1,10,2,0,0,0,1,0,...,1,1,0,0,0,1,1,0,0,0
9275,19,1,0,21,2,0,0,1,0,0,...,1,1,0,0,0,1,0,0,1,0
2995,49,2,3,10,4,0,0,0,1,0,...,1,0,0,1,0,1,0,0,1,0
5316,68,2,4,5,5,0,0,0,1,0,...,1,0,1,0,0,1,1,0,0,0
356,39,1,4,37,6,0,0,0,1,0,...,0,1,0,0,1,0,1,0,0,0


In [249]:
# Preview the training targets
y_train.head()

Unnamed: 0,Income
7389,71692
9275,279059
2995,280479
5316,2280108
356,2771585


In [250]:
# Preview the test features
x_test.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,Occupation_Technology,...,Homeownership_Status_Rent,Type_of_Housing_Apartment,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking
9394,69,0,3,34,5,1,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
898,31,1,1,47,2,0,1,0,0,0,...,0,1,0,0,1,0,0,1,0,0
2398,19,0,0,42,3,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
5906,55,2,4,29,6,0,0,1,0,0,...,0,0,0,1,0,1,1,0,0,0
2343,67,0,0,49,1,1,0,0,0,0,...,0,0,1,0,1,0,1,0,0,0


In [251]:
# Preview the test targets
y_test.head()

Unnamed: 0,Income
9394,74532
898,65380
2398,67508
5906,67726
2343,67574


In [252]:
# Convert the four splits to plain NumPy arrays.
# np.asarray accepts pandas objects and ndarrays alike, so re-running this cell is harmless;
# the previous `.values` access raised AttributeError on a second run because the names
# already referred to ndarrays.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

In [255]:
# Show the training features in their array form
x_train

array([[42,  0,  1, ...,  0,  0,  0],
       [19,  1,  0, ...,  0,  1,  0],
       [49,  2,  3, ...,  0,  1,  0],
       ...,
       [54,  1,  4, ...,  0,  1,  0],
       [37,  1,  1, ...,  0,  1,  0],
       [26,  1,  3, ...,  0,  0,  1]])

## Feature Selection

      ▶ Decision Tree Regression is not affected by multicollinearity, and if a feature is unimportant the tree simply will not

      split on it, so it is effectively ignored

## Model Training

In [181]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [258]:
# Base estimator handed to the grid search; the seed is fixed at 0
dtree = DecisionTreeRegressor( random_state = 0 )

In [262]:
# Hyperparameter search space for the decision tree.
# max_depth: every depth from 2 to 8, then even depths up to 16, plus None (unlimited depth).
param_grid = dict(
    max_depth=[*range(2, 9), *range(10, 17, 2), None],
    min_samples_leaf=list(range(1, 9)),
    min_samples_split=list(range(2, 11)),
)

In [268]:
# Evaluate every combination in param_grid with 5-fold cross-validation.
# The scorer is the negated MSE, so a higher score means a lower error.
getBestTree = GridSearchCV(
    estimator=dtree,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,      # use all available CPU cores
    verbose=2,
)

getBestTree.fit(x_train, y_train)

print("GridSearchCV completed successfully and best hyperparameters have been identified.")
# best_params_ holds the full winning combination, not only max_depth
print("Best hyperparameters found by GridSearchCV:", getBestTree.best_params_)
# best_score_ is the negated MSE; flip the sign to report a positive error
print("Best average CV mean squared error:", -1 * getBestTree.best_score_, '\n')


Fitting 5 folds for each of 864 candidates, totalling 4320 fits
GridSearchCV completed successfully and best hyperparameters have been identified.
Best Found Max Depth From GridSearchCV Is: {'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 2}
Best Average CV score: 3276736359897.236 



In [265]:
# Take the estimator that GridSearchCV exposes for the best parameter combination
regressor =  getBestTree.best_estimator_                # our final best trained model

In [269]:
# Predict income for the held-out test features
predictedIncome = regressor.predict( x_test )

### Model Evaluation

In [273]:
from sklearn.metrics import mean_squared_error , r2_score

### Evaluate Training results

In [281]:
# Predictions on the same rows the model was trained on
predicted_training_income = regressor.predict(x_train)

# Re-attach the feature names to the raw array, then append true and rounded predicted income
training_data_results = pd.DataFrame(x_train, columns=x.columns).assign(
    True_Income=y_train,
    Predicted_Income=np.round(predicted_training_income),
)

training_data_results.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,Occupation_Technology,...,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking,True_Income,Predicted_Income
0,42,0,1,10,2,0,0,0,1,0,...,0,0,0,1,1,0,0,0,71692,980007.0
1,19,1,0,21,2,0,0,1,0,0,...,0,0,0,1,0,0,1,0,279059,1078697.0
2,49,2,3,10,4,0,0,0,1,0,...,0,1,0,1,0,0,1,0,280479,753307.0
3,68,2,4,5,5,0,0,0,1,0,...,1,0,0,1,1,0,0,0,2280108,753307.0
4,39,1,4,37,6,0,0,0,1,0,...,0,0,1,0,1,0,0,0,2771585,1165311.0


In [283]:
# Error and goodness-of-fit on the training data
training_mse = mean_squared_error(y_train, predicted_training_income)
training_r2 = r2_score(y_train, predicted_training_income)

print(f"Training Mean Squared Error: {training_mse:.2f}")
# R² is scaled to a percentage, so print the unit (matches the test-metrics cell)
print(f"Training R-squared: {training_r2*100:.2f}%")

Training Mean Squared Error: 3161872442014.46
Training R-squared: 5.46


### Evaluating the test results

In [282]:
# Re-attach the feature names to the raw test array, then append true and rounded predicted income
test_data_results = pd.DataFrame(x_test, columns=x.columns).assign(
    True_Income=y_test,
    Predicted_Income=np.round(predictedIncome),
)

test_data_results.head()

Unnamed: 0,Age,Education_Level,Number_of_Dependents,Work_Experience,Household_Size,Occupation_Education,Occupation_Finance,Occupation_Healthcare,Occupation_Others,Occupation_Technology,...,Type_of_Housing_Single-family home,Type_of_Housing_Townhouse,Gender_Female,Gender_Male,Primary_Mode_of_Transportation_Biking,Primary_Mode_of_Transportation_Car,Primary_Mode_of_Transportation_Public transit,Primary_Mode_of_Transportation_Walking,True_Income,Predicted_Income
0,69,0,3,34,5,1,0,0,0,0,...,0,1,0,1,0,1,0,0,74532,386100.0
1,31,1,1,47,2,0,1,0,0,0,...,0,0,1,0,0,1,0,0,65380,882102.0
2,19,0,0,42,3,0,0,1,0,0,...,0,0,0,1,0,1,0,0,67508,1165311.0
3,55,2,4,29,6,0,0,1,0,0,...,0,1,0,1,1,0,0,0,67726,435386.0
4,67,0,0,49,1,1,0,0,0,0,...,1,0,1,0,1,0,0,0,67574,386100.0


In [None]:
# Error and goodness-of-fit on the held-out test data
testingMse = mean_squared_error(y_true=y_test, y_pred=predictedIncome)
testingR2 = r2_score(y_true=y_test, y_pred=predictedIncome)

print(f"Testing Mean Squared Error: {testingMse:.2f}")
print(f"Testing R-squared: {testingR2*100:.2f}%")

Testing Mean Squared Error: 3147598795689.65
Testing R-squared: 1.68
