In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

from sklearn.model_selection import cross_val_score

In [3]:
# Read the income-vs-education CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
csv_path = 'incomvseducation.csv'
df = pd.read_csv(csv_path)

In [4]:
# List the column labels of the loaded dataset
print(df.columns)

Index(['_id', 'YEAR', 'Geography', 'Type of work', 'Wages', 'Education level',
       'Age group', 'Both Sexes', 'Male', 'Female'],
      dtype='object')


In [5]:
# Show the dataset dimensions as (rows, columns)
print(df.shape)

(425040, 10)


In [6]:
# Preview the first five rows to see what each column contains
print(df.head())

   _id  YEAR Geography              Type of work            Wages  \
0    1  1997    Canada  Both full- and part-time  Total employees   
1    2  1997    Canada  Both full- and part-time  Total employees   
2    3  1997    Canada  Both full- and part-time  Total employees   
3    4  1997    Canada  Both full- and part-time  Total employees   
4    5  1997    Canada  Both full- and part-time  Total employees   

               Education level           Age group  Both Sexes    Male  Female  
0  Total, all education levels  15 years and over      11364.5  5954.5  5410.0  
1  Total, all education levels         15-24 years      1877.8   983.1   894.7  
2  Total, all education levels         20-34 years      4274.9  2244.3  2030.6  
3  Total, all education levels   25 years and over      9486.7  4971.4  4515.3  
4  Total, all education levels         25-34 years      3047.9  1602.2  1445.7  


In [7]:
# Preview the last five rows to check how the file ends
print(df.tail())

           _id  YEAR         Geography   Type of work  \
425035  425036  2019  British Columbia     Part-time    
425036  425037  2019  British Columbia     Part-time    
425037  425038  2019  British Columbia     Part-time    
425038  425039  2019  British Columbia     Part-time    
425039  425040  2019  British Columbia     Part-time    

                          Wages      Education level          Age group  \
425035  Median weekly wage rate  No PSE  (0,1,2,3,4)  25 years and over   
425036  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-34 years   
425037  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-54 years   
425038  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-64 years   
425039  Median weekly wage rate  No PSE  (0,1,2,3,4)  55 years and over   

        Both Sexes   Male  Female  
425035      331.25  320.0   336.0  
425036      308.00  300.0   316.8  
425037      346.25  320.0   360.0  
425038      346.20  336.0   348.3  
425039      300.00  324.0 

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
print(df.describe())

                 _id           YEAR     Both Sexes           Male  \
count  425040.000000  425040.000000  425040.000000  425040.000000   
mean   212520.500000    2008.000000     281.968049     271.540792   
std    122698.623546       6.633257     480.788912     415.794503   
min         1.000000    1997.000000       0.000000       0.000000   
25%    106260.750000    2002.000000      14.960000      11.780000   
50%    212520.500000    2008.000000      28.850000      24.425000   
75%    318780.250000    2014.000000     482.790000     488.922500   
max    425040.000000    2019.000000   16153.000000    8166.900000   

              Female  
count  425040.000000  
mean      224.783368  
std       335.897431  
min         0.000000  
25%        12.200000  
50%        23.630000  
75%       391.785000  
max      7986.100000  


In [9]:
# Column dtypes, non-null counts and memory usage.
# df.info() writes its summary to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425040 entries, 0 to 425039
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   _id              425040 non-null  int64  
 1   YEAR             425040 non-null  int64  
 2   Geography        425040 non-null  object 
 3   Type of work     425040 non-null  object 
 4   Wages            425040 non-null  object 
 5   Education level  425040 non-null  object 
 6   Age group        425040 non-null  object 
 7   Both Sexes       425040 non-null  float64
 8   Male             425040 non-null  float64
 9   Female           425040 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 32.4+ MB
None


In [10]:
# Flag every row that exactly repeats an earlier row, then tally the flags
is_duplicate = df.duplicated()
duplicate_count = is_duplicate.sum()

# Report how many repeated rows were found
print("Duplicate Rows:", duplicate_count)

# Remove the repeated rows from df in place
df.drop_duplicates(inplace=True)

Duplicate Rows: 0


In [11]:
# Count the missing cells in each column and print the counts under a header
missing_per_column = df.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

# Drop, in place, every row that has at least one missing value
df.dropna(inplace=True)


Missing Values:
_id                0
YEAR               0
Geography          0
Type of work       0
Wages              0
Education level    0
Age group          0
Both Sexes         0
Male               0
Female             0
dtype: int64


In [12]:
# Replace every object-dtype column of df with integer codes, in place.
# One fitted LabelEncoder is kept per column (keyed by column name) so the
# same string-to-code mapping can be applied to other data later.
categorical_columns = df.select_dtypes(include=['object']).columns
encoders = {col: LabelEncoder() for col in categorical_columns}
for col, encoder in encoders.items():
    df[col] = encoder.fit_transform(df[col])

In [13]:
# Predictor matrix: every column except `_id` and the target.
feature_columns = [
    'YEAR',
    'Geography',
    'Type of work',
    'Education level',
    'Age group',
    'Both Sexes',
    'Male',
    'Female',
]
X = df[feature_columns]

# Target column.
# NOTE(review): `Wages` was an object column (values such as 'Total employees')
# before it was label-encoded, so `y` holds category codes rather than wage
# amounts -- confirm that regressing on these codes is intended.
y = df['Wages']

In [14]:
# Standardize every predictor column (the label-encoded ones included):
# learn the per-column scaling from X, then apply it to X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [36]:
# Split the data into training and testing sets (80% / 20%, fixed seed).
# The split is made on the unscaled predictors so the scaler can be fitted on
# the training rows only; fitting it on the full dataset lets test-set
# statistics (mean / standard deviation) leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Refit the scaler on the training split, then apply that same transform to
# both splits. `scaler` is rebound so later cells reuse the leak-free scaler.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train Linear Regression model

In [37]:
# Model 1: Linear Regression
# Fit on the training split, predict the held-out split, and report R-squared.
linear_reg_model = LinearRegression().fit(X_train, y_train)
linear_y_pred = linear_reg_model.predict(X_test)
linear_r2 = r2_score(y_true=y_test, y_pred=linear_y_pred)
print(f"Linear Regression R-squared: {linear_r2}")

Linear Regression R-squared: 0.027807336966215956


In [20]:
# K-Fold Cross-Validation
# K = 10
# Use a separate estimator object for cross-validation. cross_val_score fits
# clones of the estimator it receives and leaves that object unfitted, so
# rebinding `linear_reg_model` here would replace the model already fitted on
# the training split and make later `linear_reg_model.predict(...)` calls fail
# when the notebook is run top to bottom.
linear_reg_cv_model = LinearRegression()
linear_reg_scores = cross_val_score(linear_reg_cv_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
# The scorer returns negated MSE per fold; flip the sign and take the square
# root to obtain one RMSE value per fold.
linear_reg_rmse_scores = (-linear_reg_scores)**0.5
print("Linear Regression RMSE Scores:")
print(linear_reg_rmse_scores)
print(f"Average RMSE: {linear_reg_rmse_scores.mean()}")

Linear Regression RMSE Scores:
[1.38989528 1.39707074 1.39844314 1.39685623 1.40101558 1.39529841
 1.38978335 1.39970537 1.39220097 1.39460189]
Average RMSE: 1.3954870961601642


# Train Decision Tree Regression model

In [21]:
# Model 2: Decision Tree Regression
# Fit on the training split (seeded for repeatability), predict the held-out
# split, and report R-squared.
tree_reg_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
tree_y_pred = tree_reg_model.predict(X_test)
tree_r2 = r2_score(y_true=y_test, y_pred=tree_y_pred)
print(f"Decision Tree Regression R-squared: {tree_r2}")

Decision Tree Regression R-squared: 0.6365067737816661


In [17]:
# K-Fold Cross-Validation
# K = 10
# Use a separate estimator object for cross-validation. cross_val_score fits
# clones of the estimator it receives and leaves that object unfitted, so
# rebinding `tree_reg_model` here would replace the model already fitted on
# the training split and make later `tree_reg_model.predict(...)` calls fail
# when the notebook is run top to bottom.
tree_reg_cv_model = DecisionTreeRegressor(random_state=42)
tree_reg_scores = cross_val_score(tree_reg_cv_model, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
# The scorer returns negated MSE per fold; flip the sign and take the square
# root to obtain one RMSE value per fold.
tree_reg_rmse_scores = (-tree_reg_scores)**0.5
print("\nDecision Tree Regression RMSE Scores:")
print(tree_reg_rmse_scores)
print(f"Average RMSE: {tree_reg_rmse_scores.mean()}")


Decision Tree Regression RMSE Scores:
[0.85728957 0.86926538 0.8643623  0.87834337 0.87543592 0.87001322
 0.86087702 0.86689057 0.88122651 0.87757107]
Average RMSE: 0.8701274919374402


In [22]:
# Side-by-side hold-out R-squared for both models, one line per model
print(
    "Model Comparison for R-squared:",
    f"Linear Regression R-squared: {linear_r2}",
    f"Decision Tree Regression R-squared: {tree_r2}",
    sep="\n",
)

Model Comparison:
Linear Regression R-squared: 0.027807336966215956
Decision Tree Regression R-squared: 0.6365067737816661


In [23]:
# Side-by-side mean RMSE over the 10 cross-validation folds, one line per model
print(
    "Model Comparison (10-Fold Cross-Validation):",
    f"Linear Regression (10-Fold Cross-Validation): {linear_reg_rmse_scores.mean()}",
    f"Decision Tree Regression (10-Fold Cross-Validation): {tree_reg_rmse_scores.mean()}",
    sep="\n",
)

Model Comparison (10-Fold Cross-Validation):
Linear Regression (10-Fold Cross-Validation): 1.3954870961601642
Decision Tree Regression (10-Fold Cross-Validation): 0.8701274919374402


The results show that the Decision Tree Regression model outperforms Linear Regression in terms of R-squared and RMSE scores. 
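The Decision Tree Regression model has a higher R-squared value (0.6365) compared to Linear Regression (0.0278), indicating that it explains more variance in the target variable (Wages). Note that `Wages` is a categorical column naming the type of measure (for example "Total employees" or "Median weekly wage rate") that was label-encoded before modelling, so these scores describe how well the numeric category codes are predicted, not actual wage amounts.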
Additionally, the Decision Tree Regression model has a lower average RMSE (0.8701) compared to Linear Regression (1.3955) during 10-fold cross-validation, indicating better predictive accuracy.

The Decision Tree Regression model is likely performing better because it can capture non-linear relationships between predictors and the target variable, whereas Linear Regression assumes a linear relationship.

However, Decision Tree models can also be prone to overfitting, so it's essential to consider the trade-offs between model complexity and performance.

In [32]:
# Three hand-written records to score with the trained models.
# Each tuple is one record; its values follow the order of `record_columns`,
# which lists the predictor columns in the order used for training.
record_columns = [
    'YEAR', 'Geography', 'Type of work', 'Education level',
    'Age group', 'Both Sexes', 'Male', 'Female',
]
record_rows = [
    (1997, 'Canada', 'Both full- and part-time', 'Total, all education levels', '25 years and over', 2500, 1300, 1200),
    (2000, 'British Columbia', 'Both full- and part-time', 'No PSE  (0,1,2,3,4)', '15-24 years', 1800, 900, 900),
    (2019, 'Canada', 'Both full- and part-time', 'No PSE  (0,1,2,3,4)', '25-54 years', 3200, 1600, 1600),
]
new_records = pd.DataFrame(record_rows, columns=record_columns)

In [33]:
# Convert the text columns of new_records to integer codes with the encoders
# fitted on the training data, so each string maps to the same code as before.
# A text column with no stored encoder is left unchanged.
for col in new_records.select_dtypes(include=['object']).columns:
    if col not in encoders:
        continue
    fitted_encoder = encoders[col]
    new_records[col] = fitted_encoder.transform(new_records[col])

In [34]:
# Standardize the new records with the already-fitted scaler.
# transform() (not fit_transform) is used so the new records are scaled with
# the statistics learned earlier rather than with their own.
new_records_scaled = scaler.transform(new_records)

In [38]:
# Make predictions for the scaled new records using the Linear Regression model.
# The outputs are on the scale of the label-encoded `Wages` target.
linear_reg_predictions = linear_reg_model.predict(new_records_scaled)

In [39]:
# Print the Linear Regression predictions, one value per new record
print("Linear Regression Predictions:", linear_reg_predictions)

Linear Regression Predictions: [3.44658169 3.04010716 3.8829657 ]


In [40]:
# Make predictions for the scaled new records using the Decision Tree Regression model.
# The outputs are on the scale of the label-encoded `Wages` target.
tree_reg_predictions = tree_reg_model.predict(new_records_scaled)

In [41]:
# Print the Decision Tree Regression predictions, one value per new record
print("Decision Tree Regression Predictions:", tree_reg_predictions)

Decision Tree Regression Predictions: [4. 4. 4.]


# Prediction Comparison
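The Decision Tree Regression model predicts exactly 4.0 for all three new records, i.e. the same label-encoded `Wages` category for each of them. These values are not rounded: a regression tree returns the mean target of the training samples in the leaf a record falls into, and here that mean is exactly 4.
The predictions from the Linear Regression model are more varied (about 3.45, 3.04 and 3.88), with a different value for each new record. Because they are continuous, they fall between the integer category codes and do not correspond to any single `Wages` category.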