## Function to clean the Domain (major) column

In [1]:
import pandas as pd
import re

def clean_domain_column(column_data):
    """Normalise free-text degree / major names into canonical labels.

    Each value is lower-cased and stripped, then compared against an ordered
    list of regular-expression rules; the label of the first rule found in
    the text is returned. Values matching no rule are returned in title
    case. Missing values and placeholders such as "na" become ``None``.

    Parameters
    ----------
    column_data : pandas.Series
        Raw answers. Non-string cells are converted with ``str()``.

    Returns
    -------
    pandas.Series
        Cleaned labels, same index as ``column_data``.
    """
    missing_markers = {"", "na", "n/a", "nan"}

    # Ordered rules: the first match wins, so specific patterns must come
    # before generic ones (a dict would hide that ordering requirement).
    rules = [
        # Named engineering branches go first, otherwise the generic
        # "engineering" fallback below would swallow them.
        (r'mechanical engineering|mechanical|be mechanical', 'Mechanical Engineering'),
        (r'civil engineering|civil', 'Civil Engineering'),
        # BTech must precede the BSc rule because both mention "computer science".
        # NOTE(review): the bare "engineering" fallback is kept for
        # compatibility; it files every other engineering branch under
        # 'BTech CS' - confirm that this is intended.
        (r'\bb\.?\s?tech.*(?:computer science|cse)|engineering', 'BTech CS'),
        (r'computer science|bsc cs|bsc comp sci|bsc computer science|bsc in computer science', 'BSc CS'),
        # \b keeps "it" from matching inside words such as "literature".
        (r'information technology|\bit\b', 'IT'),
        # Leading \b keeps "b com" from matching inside "web communication".
        (r'commerce|\bb\.?\s?com', 'Commerce'),
        (r'business administration|bba', 'BBA'),
        (r'mba', 'MBA'),
        (r'law', 'Law'),
        (r'biology|bsc life science|life sciences', 'Biology'),
        (r'psychology', 'Psychology'),
        (r'english literature|english', 'English Literature'),
        (r'history', 'History'),
        (r'mass media|journalism|communication', 'Mass Communication & Journalism'),
        (r'physics', 'Physics'),
        (r'chemistry', 'Chemistry'),
        (r'marketing', 'Marketing'),
        (r'finance', 'Finance'),
        (r'sociology', 'Sociology'),
        (r'nursing', 'Nursing'),
    ]
    compiled_rules = [(re.compile(pattern), label) for pattern, label in rules]

    def clean_major(major):
        if pd.isna(major):
            return None

        major = str(major).lower().strip()
        if major in missing_markers:
            return None  # placeholder answers carry no information

        for pattern, label in compiled_rules:
            if pattern.search(major):
                return label

        return major.title()  # unknown majors: keep, capitalising each word

    return column_data.apply(clean_major)


## Function to clean Score column

In [2]:
import pandas as pd
import re

def format_to_cgpa(column_data):
    """Convert free-text scores to a 0-10 CGPA scale and impute gaps.

    For every cell the first number found in the text is taken. Numbers
    above 10 are treated as percentages and divided by 10. Anything that
    still falls outside 0-10, or contains no number, is treated as missing.
    Missing entries are finally replaced by the column mean (rounded to two
    decimals); if no valid score exists at all the column is returned as
    all-NaN.

    Parameters
    ----------
    column_data : pandas.Series
        Raw score answers (strings or numbers).

    Returns
    -------
    pandas.Series
        Float scores in the range 0-10, same index as ``column_data``.
    """
    missing = float('nan')

    def convert(value):
        if pd.isna(value):
            return missing

        text = str(value).lower().strip()

        # str.replace() works on literal substrings, so a regex-looking
        # argument such as '[, %]' never matches anything. Use re.sub to
        # drop percent signs and digit-grouping commas ("1,200" -> "1200")
        # so the number is not truncated at the comma.
        text = re.sub(r'(?<=\d),(?=\d)|%', '', text)

        found = re.search(r'\d+(?:\.\d+)?', text)
        if not found:
            return missing

        # Normalise to CGPA: values above 10 are assumed to be percentages.
        score = round(float(found.group(0)), 2)
        if score > 10:
            score = round(score / 10, 2)

        return score if 0 <= score <= 10 else missing

    cleaned_data = column_data.apply(convert).astype(float)

    # Mean value for missing data; nothing to impute when every score is missing.
    mean = cleaned_data.mean()
    if pd.isna(mean):
        return cleaned_data

    return cleaned_data.fillna(round(mean, 2))


## Function to clean Preferred Jobs

In [3]:


def clean_job_titles(column_data):
    """Map free-text preferred job titles onto a fixed set of categories.

    Each value is lower-cased and stripped, then compared against an ordered
    list of regular-expression rules; the label of the first rule found in
    the text is returned. Titles matching no rule are returned in title
    case. Missing or non-committal answers ("none", "not decided", "-", ...)
    become ``None``.

    Parameters
    ----------
    column_data : pandas.Series
        Raw job-title answers. Non-string cells are converted with ``str()``.

    Returns
    -------
    pandas.Series
        Cleaned categories, same index as ``column_data``.
    """
    unclear_answers = {'', 'none', 'no', 'not decided', '-', '—', '.', 'nan'}

    # Ordered rules: first match wins, so specific titles must precede the
    # generic "developer" / "engineer" catch-alls.
    rules = [
        (r'\bml engineer|data scientist', 'Data Scientist'),
        # These two must come before the generic "developer" rule, which
        # would otherwise make them unreachable.
        (r'full[\s-]?stack developer', 'Full Stack Developer'),
        (r'web developer', 'Web Developer'),
        (r'software developer|software engineer|developer', 'Software Developer'),
        (r'data analyst|data analysis', 'Data Analyst'),
        (r'mechanical (?:design )?engineer', 'Mechanical Engineer'),
        (r'cyber security consultant|information security', 'Cyber Security Engineer'),
        # \b keeps "hr" from matching inside words such as "anthropologist".
        (r'\bhr\b|human resource', 'HR'),
        (r'marketing|marketing executive|digital marketer', 'Marketing'),
        (r'accountant|finance|financial analyst', 'Finance & Accounting'),
        (r'teacher|professor|educator', 'Education'),
        (r'psychologist|counsellor', 'Psychology'),
        (r'nursing|nurse', 'Healthcare'),
        # "civil engineer" also covers "civil engineering".
        (r'civil engineer', 'Civil Engineer'),
        (r'legal officer|advocate|law', 'Legal'),
        (r'business analyst', 'Business Analyst'),
        (r'manager|management|operations executive', 'Management'),
        (r'artist|designer|vfx', 'Creative & Design'),
        (r'engineer', 'Engineer'),
    ]
    compiled_rules = [(re.compile(pattern), label) for pattern, label in rules]

    def clean_job(title):
        if pd.isna(title):
            return None

        title = str(title).lower().strip()
        if title in unclear_answers:
            return None  # Replace missing or unclear values

        for pattern, label in compiled_rules:
            if pattern.search(title):
                return label

        return title.title()

    return column_data.apply(clean_job)


## clean expected salary

In [4]:
import re
import numpy as np
import pandas as pd

# One amount: a number optionally followed by a unit (crore / lakh / k).
# The trailing \b stops "k" or "l" from being read out of ordinary words
# such as "know" or "later".
_SALARY_AMOUNT = r'(\d+(?:\.\d+)?)\s*(?:(cr(?:ores?|s)?|l(?:akhs?|acs?|pa)?|k)\b)?'
_SALARY_AMOUNT_RE = re.compile(_SALARY_AMOUNT)
_SALARY_RANGE_RE = re.compile(_SALARY_AMOUNT + r'\s*(?:-|–|—|to)\s*' + _SALARY_AMOUNT)

_SALARY_UNIT_FACTOR = {'k': 1000.0, 'l': 100000.0, 'cr': 10000000.0}


def _salary_unit(raw_unit):
    """Collapse a matched unit spelling to 'cr', 'l', 'k' or None."""
    if raw_unit is None:
        return None
    if raw_unit.startswith('cr'):
        return 'cr'
    if raw_unit.startswith('l'):
        return 'l'
    return 'k'


def _to_monthly(number, unit):
    """Scale a number by its unit; lakh/crore figures are divided by 12."""
    value = float(number)
    if unit is None:
        return value
    value *= _SALARY_UNIT_FACTOR[unit]
    # Lakh and crore figures are treated as annual and converted to monthly.
    return value / 12 if unit in ('l', 'cr') else value


def _salary_range(text):
    """Return (low, low_unit, high, high_unit) for the first range, or None."""
    found = _SALARY_RANGE_RE.search(text)
    if not found:
        return None
    low, low_unit, high, high_unit = found.groups()
    low_unit, high_unit = _salary_unit(low_unit), _salary_unit(high_unit)
    # "35-40k" means 35k-40k: a unit-less lower bound inherits the upper
    # bound's unit, but only when it is not already the larger number
    # (so "15000 - 20k" stays 15000 .. 20000).
    if low_unit is None and high_unit is not None and float(low) <= float(high):
        low_unit = high_unit
    return low, low_unit, high, high_unit


def convert_to_numeric(salary):
    """Parse a free-text expected-salary answer into a single number.

    Rules:
    * ``k`` multiplies by 1,000; lakh (``l``, ``lakh``, ``lac``, ``lpa``)
      by 100,000; crore (``cr``, ``crore``) by 10,000,000.
    * Lakh and crore amounts are divided by 12 (converted to monthly);
      ``k`` and plain numbers are returned as given.
    * Ranges ("15k to 20k", "35-40k", "6-12 lpa") yield the midpoint.
    * When several amounts appear, crore amounts take precedence, then
      lakh amounts, then the first amount in the text.

    Parameters
    ----------
    salary : str or scalar
        Raw answer. ``None``/NaN and text without digits give NaN.

    Returns
    -------
    float
        The parsed amount, or ``numpy.nan`` when nothing can be parsed.
    """
    if not isinstance(salary, str):
        if pd.isna(salary):
            return np.nan
        salary = str(salary)

    text = salary.lower().replace(",", "").strip()

    amounts = [
        (found.group(1), _salary_unit(found.group(2)))
        for found in _SALARY_AMOUNT_RE.finditer(text)
    ]
    span = _salary_range(text)

    def midpoint():
        low, low_unit, high, high_unit = span
        return (_to_monthly(low, low_unit) + _to_monthly(high, high_unit)) / 2

    # Crore first, then lakh: an explicit annual figure outranks any other
    # number in the answer.
    for annual_unit in ('cr', 'l'):
        if span and annual_unit in (span[1], span[3]):
            return midpoint()
        for number, unit in amounts:
            if unit == annual_unit:
                return _to_monthly(number, unit)

    if span:
        return midpoint()
    if amounts:
        return _to_monthly(*amounts[0])

    return np.nan  # Return NaN for invalid values

def clean_salary_column(df, column_name):
    """Parse one salary column of ``df`` into numbers.

    Every cell is cast to ``str`` and passed through ``convert_to_numeric``.
    The parsed values are written back into ``df[column_name]`` (the frame
    passed in is modified) and the updated column is also returned.
    """
    as_text = df[column_name].astype(str)
    df[column_name] = as_text.apply(convert_to_numeric)
    return df[column_name]

### Chi-square Test

In [5]:
import pandas as pd
import scipy.stats as stats
from itertools import combinations

# Function to compute Chi-Square test and return correlated variable pairs
def chi_square_high_correlation(df, categorical_cols, threshold=0.5, top_n=6, get_high_corr=True):
    """Run a chi-square test of independence on every pair of columns.

    A contingency table is built for each unordered pair drawn from
    ``categorical_cols`` and tested with ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns.
    categorical_cols : iterable of str
        Column names to pair up.
    threshold : float
        p-value cut-off used for filtering.
    top_n : int
        Maximum number of pairs to return.
    get_high_corr : bool
        If True keep pairs with ``p < threshold``; otherwise keep pairs
        with ``p >= threshold``.

    Returns
    -------
    list of (str, str, float)
        ``(column, other_column, p_value)`` tuples sorted by ascending
        p-value, truncated to ``top_n`` entries.
    """
    scored_pairs = []
    for first, second in combinations(categorical_cols, 2):
        observed = pd.crosstab(df[first], df[second])
        _chi2, p_value, _dof, _expected = stats.chi2_contingency(observed)
        scored_pairs.append((first, second, p_value))

    # Keep either the pairs below the cut-off or the pairs at/above it.
    def is_selected(p_value):
        if get_high_corr:
            return p_value < threshold
        return p_value >= threshold

    selected = sorted(
        (entry for entry in scored_pairs if is_selected(entry[2])),
        key=lambda entry: entry[2],
    )

    # At most top_n pairs, or all of them when fewer are available.
    limit = min(top_n, len(selected))
    return selected[:limit]


### Encode Categorical Variables

In [6]:

def EncodeCategoricalFeatures(df):
    """Return a copy of ``df`` with the survey's categorical answers encoded as integers.

    Each listed column is translated through its own label -> code table.
    String labels are matched ignoring case and surrounding whitespace, so
    "Yes", "yes" and " YES " all receive the same code. Values that are not
    in a column's table (including NaN) are left unchanged. The input frame
    is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in the code tables below.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    """
    codebook = {
        # age distribution encoding
        'Age': {'Under 18': 0, '18-22': 1, '23-27': 2, '28-32': 3, 'Above 32': 4},
        # NOTE(review): code 2 is skipped here (0, 1, 3); values are kept
        # as-is for compatibility - confirm whether a category is missing.
        'Gender': {'Male': 0, 'Female': 1, 'Bisexual': 3},
        'Status': {'Currently studying': 0, 'Working professional': 1, 'Unemployed': 2},
        'Income(₹)': {
            'Less than ₹2,50,000': 0,
            '₹2,50,000–₹5,00,000': 1,
            '₹5,00,000–₹10,00,000': 2,
            '₹10,00,000–₹20,00,000': 3,
            'Above ₹20,00,000': 4,
        },
        'Qualification': {
            'High school': 0, 'Diploma': 1, "Bachelor's degree": 2,
            "Master's degree": 3, 'Ph. D.': 4,
        },
        'Internship': {'yes': 1, 'No': 0},
        'Job-confidence': {
            'Very confident': 4, 'Confident': 3, 'Neutral': 2,
            'Not confident': 1, 'Very unconfident': 0,
        },
        'Market-awareness': {'No': 0, 'Somewhat': 1, 'Yes': 2},
        'Automation-impact': {'Positively': 3, 'Not sure': 2, 'No impact': 1, 'Negatively': 0},
        'Work-mode': {'Remote': 0, 'Hybrid': 1, 'On-site': 2},
        'Job-search-time-weekly': {
            'Less than 1 hour': 0, '1–5 hours': 1, '5–10 hours': 2, 'More than 10 hours': 3,
        },
        'Motivation': {
            'Work-life balance': 0, 'Job security': 1, 'Growth potential': 2,
            'High salary': 3, 'Learning opportunities': 4,
        },
        'Switch-domain': {'No': 0, 'Maybe': 1, 'Yes': 2},
        'Relocate': {'No': 0, 'Yes, Within my country': 1, 'Yes, anywhere': 2},
        # Already numeric (1-5); shifted to start at 0.
        'Curriculum-use': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
        'Job-security': {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
        'Skill-set-scale': {'No': 0, 'Maybe': 1, 'Yes': 2},
        'Updated-LinkedIn': {'No': 0, 'yes': 1},
        'Certification': {'No': 0, 'Yes': 1},
        'Job-platforms-freq': {'Never': 0, 'Rarely': 1, 'Monthly': 2, 'Weekly': 3, 'Daily': 4},
    }

    def normalise(value):
        # Case/whitespace-insensitive key for strings; other values as-is.
        return value.strip().casefold() if isinstance(value, str) else value

    df_copy = df.copy()
    for column, codes in codebook.items():
        lookup = {normalise(label): code for label, code in codes.items()}
        # Unknown values fall through unchanged rather than becoming NaN.
        df_copy[column] = df_copy[column].map(
            lambda value, lookup=lookup: lookup.get(normalise(value), value)
        )

    return df_copy

In [7]:
from sklearn.preprocessing import OrdinalEncoder

def encode_with_ordinal_encoder(df, ordinal_cols):
    """Integer-encode ``ordinal_cols`` and tidy the 'Location' column.

    Works on a copy of ``df``. The listed columns are fitted and transformed
    with a default ``OrdinalEncoder`` and cast to ``int``. Independently of
    ``ordinal_cols``, the 'Location' column has commas and full stops
    removed and is lower-cased and stripped, so ``df`` must contain a
    string 'Location' column.

    NOTE(review): the codes come from the encoder's default category
    ordering, which may not match the intended ranking of the answers -
    confirm.
    """
    result = df.copy()

    fitted_codes = OrdinalEncoder().fit_transform(result[ordinal_cols])
    result[ordinal_cols] = fitted_codes

    # OrdinalEncoder returns floats; cast the encoded columns to int.
    result[ordinal_cols] = result[ordinal_cols].astype(int)

    # Clean Location: drop commas/full stops, then lower-case and trim.
    without_punctuation = result['Location'].str.replace('[,.]', '', regex=True)
    result['Location'] = without_punctuation.str.lower().str.strip()

    return result


In [8]:
# Cleaning function
def clean_transactions(transactions):
    """Normalise the item labels inside a list of transactions.

    Every item is stripped and lower-cased. If the whole item matches one of
    the synonym patterns below it is replaced by the canonical label. Empty
    strings and non-string items (``None``, NaN, ...) are dropped.

    Parameters
    ----------
    transactions : iterable of iterables
        One inner iterable of item labels per transaction.

    Returns
    -------
    list of list of str
        Cleaned transactions, in the same order as the input. A transaction
        whose items are all dropped is kept as an empty list.
    """
    import re  # local import: this cell has no import of its own

    # Patterns must match the entire (stripped, lower-cased) item.
    # Trailing-space variants such as "biotechnology " need no rule because
    # items are stripped before matching.
    replacements = {
        r'accounts': 'accounting',
        r'fine artist': 'artist',
        r'govt services': 'government services',
        r'hr|human resource': 'human resources',
        r'it/software': 'it',
        r'legal sector': 'legal',
        r'pharma|pharmaceutical': 'pharmaceuticals',
        r'social work': 'social services',
    }
    compiled = [
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in replacements.items()
    ]

    cleaned = []
    for sublist in transactions:
        cleaned_sublist = []
        for item in sublist:
            if not isinstance(item, str):
                continue  # missing value: nothing to clean
            cleaned_item = item.strip().lower()
            for pattern, replacement in compiled:
                if pattern.fullmatch(cleaned_item):
                    cleaned_item = replacement
                    break
            if cleaned_item:  # Ignore empty strings
                cleaned_sublist.append(cleaned_item)
        cleaned.append(cleaned_sublist)

    return cleaned