In [1]:
# Import necessary libraries
import pandas as pd

# Load the dataset
ds_jobs = pd.read_csv("customer_train.csv")

# View the dataset
ds_jobs.head()

  from pandas.core import (


Unnamed: 0,student_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,job_change
0,8949,city_103,0.92,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevant experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevant experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevant experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [2]:
# Create a copy of ds_jobs for transforming
ds_jobs_transformed = ds_jobs.copy()

In [3]:
#Handling categories with 2 factors

##Identifying columns with just 2 factors
print(ds_jobs_transformed['job_change'].unique())
print(ds_jobs_transformed['relevant_experience'].unique())

##Converting relevant_experience into a boolean column
ds_jobs_transformed['relevant_experience'] = ds_jobs_transformed['relevant_experience'].apply(lambda x: x=='Has relevant experience')
## Converting job_change into a boolean column
ds_jobs_transformed['job_change'] = ds_jobs_transformed['job_change'].apply(lambda x: x==1)


[1. 0.]
['Has relevant experience' 'No relevant experience']


In [4]:
#Converting intgers to 32bit integers

ds_jobs_transformed[['student_id', 'training_hours']] = ds_jobs_transformed[['student_id', 'training_hours']].astype('int32')

#Converting floats to 16bit floats

ds_jobs_transformed[['city_development_index']] = ds_jobs_transformed[['city_development_index']].astype('float16')

# Converting Nominal categorical variables as "category" type

ds_jobs_transformed[['city', 'gender', 'major_discipline', 'company_type']] = ds_jobs_transformed[['city', 'gender', 'major_discipline', 'company_type']].astype('category')


In [5]:
#Creating ordered categories
ordered_cats = {
    'enrolled_university': ['no_enrollment', 'Part time course', 'Full time course'],
    'education_level': ['Primary School', 'High School', 'Graduate', 'Masters', 'Phd'],
    'experience': ['<1'] + list(map(str, range(1, 21))) + ['>20'],
    'company_size': ['<10', '10-49', '50-99', '100-499', '500-999', '1000-4999', '5000-9999', '10000+'],
    'last_new_job': ['never', '1', '2', '3', '4', '>4']
}

for col in ds_jobs_transformed:
    if col in ordered_cats.keys():
        category = pd.CategoricalDtype(ordered_cats[col], ordered=True)
        ds_jobs_transformed[col] = ds_jobs_transformed[col].astype(category)


In [6]:
##Comparing efficiencies between the two datasets

memory_usage_prev = ds_jobs.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to megabytes
memory_usage_new = ds_jobs_transformed.memory_usage(deep=True).sum() / (1024 * 1024)  # Convert bytes to megabytes

print(f"Memory usage of previous DF: {memory_usage_prev} MB")
print(f"Memory usage of new DF: {memory_usage_new} MB")

memory_reduction_percent = ((memory_usage_prev - memory_usage_new) / memory_usage_prev) * 100
print(f"Memory reduction: {memory_reduction_percent:.2f}%")

Memory usage of previous DF: 11.817654609680176 MB
Memory usage of new DF: 0.4005136489868164 MB
Memory reduction: 96.61%


In [7]:
## Processing DS JOBS TRANSFORMED FOR TRAINING
#Functions to streamline Preprocessing

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import time


def process_ordinal(df):
    le = LabelEncoder()

    for col in df.columns:
        if col in ordered_cats:
            df[col] = le.fit_transform(df[col])

def process_nominal(df):
    columns_to_encode = ['gender', 'major_discipline', 'company_type']
    for col in columns_to_encode:
        dummy = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df, dummy], axis=1).drop(col, axis=1)
    return df

def prep_for_training(df):
    result = df.copy(deep=True)

    result.dropna(inplace=True)

    process_ordinal(result)
    result = process_nominal(result)

    X = result.drop(['job_change', 'student_id', 'city'], axis=1)
    y = result['job_change']

    return X, y

def get_train_time(X1,y1,X2,y2, models):
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=42)
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

    start_time = time.time()
    for model_name, model in models.items():
        model.fit(X1_train, y1_train)
    end_time = time.time()
    training_time_old = end_time - start_time
    
    
    start_time = time.time()
    for model_name, model in models.items():
        model.fit(X2_train, y2_train)
    end_time = time.time()
    training_time_efficient = end_time - start_time

    print(f"Training time for df1: {training_time_old} seconds")
    print(f"Training time for df2: {training_time_efficient} seconds")

models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Gradient Boost': GradientBoostingClassifier()
}


start_time = time.time()
X_efficient, y_efficient = prep_for_training(ds_jobs_transformed)
end_time = time.time()
diff_efficient = end_time - start_time


start_time = time.time()
X_old, y_old = prep_for_training(ds_jobs)
end_time= time.time()

diff_old = end_time - start_time

print("Time efficient: " + str(diff_efficient))
print("Time old: " + str(diff_old))

pct_diff = (diff_old - diff_efficient)/(diff_old) * 100

print("Improvement: " + str(pct_diff)+" %")



Time efficient: 0.007634162902832031
Time old: 0.011611223220825195
Improvement: 34.25186341142892 %
