In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [2]:
# Load the dataset
df = pd.read_csv('incomvseducation.csv')

In [3]:
print(df.columns)

Index(['_id', 'YEAR', 'Geography', 'Type of work', 'Wages', 'Education level',
       'Age group', 'Both Sexes', 'Male', 'Female'],
      dtype='object')


In [4]:
print(df.shape)

(425040, 10)


In [5]:
# Explore the data
print(df.head())

   _id  YEAR Geography              Type of work            Wages  \
0    1  1997    Canada  Both full- and part-time  Total employees   
1    2  1997    Canada  Both full- and part-time  Total employees   
2    3  1997    Canada  Both full- and part-time  Total employees   
3    4  1997    Canada  Both full- and part-time  Total employees   
4    5  1997    Canada  Both full- and part-time  Total employees   

               Education level           Age group  Both Sexes    Male  Female  
0  Total, all education levels  15 years and over      11364.5  5954.5  5410.0  
1  Total, all education levels         15-24 years      1877.8   983.1   894.7  
2  Total, all education levels         20-34 years      4274.9  2244.3  2030.6  
3  Total, all education levels   25 years and over      9486.7  4971.4  4515.3  
4  Total, all education levels         25-34 years      3047.9  1602.2  1445.7  


In [6]:
# Explore the data
print(df.tail())

           _id  YEAR         Geography   Type of work  \
425035  425036  2019  British Columbia     Part-time    
425036  425037  2019  British Columbia     Part-time    
425037  425038  2019  British Columbia     Part-time    
425038  425039  2019  British Columbia     Part-time    
425039  425040  2019  British Columbia     Part-time    

                          Wages      Education level          Age group  \
425035  Median weekly wage rate  No PSE  (0,1,2,3,4)  25 years and over   
425036  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-34 years   
425037  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-54 years   
425038  Median weekly wage rate  No PSE  (0,1,2,3,4)        25-64 years   
425039  Median weekly wage rate  No PSE  (0,1,2,3,4)  55 years and over   

        Both Sexes   Male  Female  
425035      331.25  320.0   336.0  
425036      308.00  300.0   316.8  
425037      346.25  320.0   360.0  
425038      346.20  336.0   348.3  
425039      300.00  324.0 

In [7]:
print(df.describe())

                 _id           YEAR     Both Sexes           Male  \
count  425040.000000  425040.000000  425040.000000  425040.000000   
mean   212520.500000    2008.000000     281.968049     271.540792   
std    122698.623546       6.633257     480.788912     415.794503   
min         1.000000    1997.000000       0.000000       0.000000   
25%    106260.750000    2002.000000      14.960000      11.780000   
50%    212520.500000    2008.000000      28.850000      24.425000   
75%    318780.250000    2014.000000     482.790000     488.922500   
max    425040.000000    2019.000000   16153.000000    8166.900000   

              Female  
count  425040.000000  
mean      224.783368  
std       335.897431  
min         0.000000  
25%        12.200000  
50%        23.630000  
75%       391.785000  
max      7986.100000  


In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425040 entries, 0 to 425039
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   _id              425040 non-null  int64  
 1   YEAR             425040 non-null  int64  
 2   Geography        425040 non-null  object 
 3   Type of work     425040 non-null  object 
 4   Wages            425040 non-null  object 
 5   Education level  425040 non-null  object 
 6   Age group        425040 non-null  object 
 7   Both Sexes       425040 non-null  float64
 8   Male             425040 non-null  float64
 9   Female           425040 non-null  float64
dtypes: float64(3), int64(2), object(5)
memory usage: 32.4+ MB
None


In [9]:
# Count duplicate rows in the DataFrame
duplicate_count = df.duplicated().sum()

# Print the count of duplicate rows
print("Duplicate Rows:", duplicate_count)

# Drop duplicates
df.drop_duplicates(inplace=True)

Duplicate Rows: 0


In [10]:
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
df.dropna(inplace=True)


Missing Values:
_id                0
YEAR               0
Geography          0
Type of work       0
Wages              0
Education level    0
Age group          0
Both Sexes         0
Male               0
Female             0
dtype: int64


In [11]:
# Save the cleaned dataset to a new CSV file
df.to_csv('Cleaned_incomvseducation.csv', index=False)

In [12]:
# Summary report
print("\nSummary Report:")
print("Original dataset size:", len(pd.read_csv('incomvseducation.csv')))
print("Cleaned dataset size:", len(df))


Summary Report:
Original dataset size: 425040
Cleaned dataset size: 425040


In [13]:
# Encode categorical variables
encoders = {}
for col in df.select_dtypes(include=['object']).columns:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

In [14]:
# Create predictors and target variables
X = df[['YEAR', 'Geography', 'Type of work', 'Education level', 'Age group', 'Both Sexes', 'Male', 'Female']]
y = df['Wages']

In [15]:
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train Logistic Regression model

In [26]:
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)
log_reg_y_pred = log_reg_model.predict(X_test)
log_reg_accuracy = accuracy_score(y_test, log_reg_y_pred)
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")

Logistic Regression Accuracy: 0.5005411255411255


In [None]:
# K-Fold Cross-Validation for Logistic Regression
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_scores = cross_val_score(log_reg_model, X_train, y_train, cv=10, scoring='accuracy')
log_reg_mean_accuracy = log_reg_scores.mean()
print("Logistic Regression Cross-Validation Mean Accuracy:", log_reg_mean_accuracy)

# Train K-Nearest Neighbors (KNN) model

In [27]:
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print(f"K-Nearest Neighbors (KNN) Accuracy: {knn_accuracy}")

K-Nearest Neighbors (KNN) Accuracy: 0.3147233201581028


In [None]:
# K-Fold Cross-Validation for K-Nearest Neighbors (KNN)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_scores = cross_val_score(knn_model, X_train, y_train, cv=10, scoring='accuracy')
knn_mean_accuracy = knn_scores.mean()
print("K-Nearest Neighbors (KNN) Cross-Validation Mean Accuracy:", knn_mean_accuracy)

# Model Comparison

In [28]:
print("Model Comparison for Accuracy")
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")
print(f"K-Nearest Neighbors (KNN) Accuracy: {knn_accuracy}")

Model Comparison for Accuracy
Logistic Regression Accuracy: 0.5005411255411255
K-Nearest Neighbors (KNN) Accuracy: 0.3147233201581028


From these results, it's evident that the Logistic Regression model outperforms the K-Nearest Neighbors (KNN) model in terms of accuracy for this particular classification task. 
The Logistic Regression model achieves an accuracy of around 50.05%, while the KNN model achieves an accuracy of approximately 31.47%.

In [None]:
print("Model Comparison (10-Fold Cross-Validation):")
print(f"Logistic Regression Cross-Validation Mean Accuracy {linear_reg_rmse_scores.mean()}")
print(f"K-Nearest Neighbors (KNN) Cross-Validation Mean Accuracy: {tree_reg_rmse_scores.mean()}")

In [29]:
# Create a new set of records
new_records = pd.DataFrame({
    'YEAR': [1997, 2000, 2019],
    'Geography': ['Canada', 'British Columbia', 'Canada'],
    'Type of work': ['Both full- and part-time', 'Both full- and part-time', 'Both full- and part-time'],
    'Education level': ['Total, all education levels', 'No PSE  (0,1,2,3,4)', 'No PSE  (0,1,2,3,4)'],
    'Age group': ['25 years and over', '15-24 years', '25-54 years'],
    'Both Sexes': [2500, 1800, 3200],
    'Male': [1300, 900, 1600],
    'Female': [1200, 900, 1600]
})

In [30]:
# Encode categorical variables in new records using the same label encoder as for training data
for col in new_records.select_dtypes(include=['object']).columns:
    if col in encoders:
        new_records[col] = encoders[col].transform(new_records[col])

In [31]:
# Standardize numerical features in new records
new_records_scaled = scaler.transform(new_records)

In [32]:
# Make predictions using the Logistics Regression model
logistics_reg_predictions = log_reg_model.predict(new_records_scaled)

In [33]:
# Print the predictions
print("Logistics Regression Predictions:", logistics_reg_predictions)

Logistics Regression Predictions: [4 4 4]


In [34]:
# Make predictions using the K-Nearest Neighbors model
knn_predictions = knn_model.predict(new_records_scaled)

In [35]:
print("K-Nearest Neighbors model Predictions:", knn_predictions)

K-Nearest Neighbors model Predictions: [4 4 4]


Both Logistic Regression and KNN models predicted the same label for all three new records, indicating consistency in their predictions.