In [10]:
import pandas as pd
import json
import re
import numpy as np
# Load Excel and prepare data
employee_df = pd.read_excel("Employeee Details.xlsx")
employee_df["Official Email"].fillna(employee_df["Personal Email"], inplace=True)
employee_df["Official Email"] = employee_df["Official Email"].str.replace(" ", "", regex=False)
employee_df = employee_df.iloc[:-1]

# Clean 'Organization Unit' column safely
def clean_unit(unit):
    if isinstance(unit, str):
        if "department of" in unit.lower():
            unit = re.sub(r"(?i)department\s+of\s+", "", unit)
        elif "of" in unit.lower():
            unit = re.sub(r".*of\s+", "", unit)
    return unit.strip() if isinstance(unit, str) else unit

employee_df["Organization Unit"] = employee_df["Organization Unit"].apply(clean_unit)

# Drop unnecessary columns
employee_df = employee_df.drop(columns=['Personal Email', 'Nature of Employment'], errors='ignore')

# Email cleaning
def clean_email(email):
    if not isinstance(email, str):
        return "ERROR: Not a string"
    email = email.strip().lower()
    return email if "@" in email else "ERROR: Invalid email"

employee_df['Official Email'] = employee_df['Official Email'].apply(clean_email)

# Load configuration
with open('/Users/rajesmanna/Documents/Phd/backend/conf.json', 'r') as file:
    data = json.load(file)

departments = data.get("college", {}).get("departments", {})
units = data.get("college", {}).get("administrative_units", {})
departments = {**departments, **units}
designation = data.get("designation", {})

# Map 'Post' to designation keys
def map_designation(post):
    for key, value in designation.items():
        if str(post).strip().lower() == str(value).strip().lower():
            return key
    return "OTHERS"

employee_df['Post'] = employee_df['Post'].apply(map_designation)

# Map 'Organization Unit' to department keys
def map_department(unit):
    if pd.isna(unit):
        return "ERROR: Missing unit"
    for key, value in departments.items():
        if str(unit).strip().lower() == str(value).strip().lower():
            return key
    return f"ERROR: {unit} is not a valid department"

employee_df['Organization Unit'] = employee_df['Organization Unit'].apply(map_department)

# Clean 'Mobile Number' to ensure 10 digits
def clean_mobile(mobile):
    digits = re.sub(r'\D', '', str(mobile))
    return digits if len(digits) == 10 else ""

employee_df['Mobile Number'] = employee_df['Mobile Number'].apply(clean_mobile)

# Format DOB to YYYY-MM-DD
def format_dob(dob):
    try:
        return pd.to_datetime(dob).strftime("%Y-%m-%d")
    except:
        return ""

employee_df['Date of Birth'] = employee_df['Date of Birth'].apply(format_dob)

# Clean up name spacing
def clean_name(name):
    if not isinstance(name, str):
        return name
    return re.sub(' +', ' ', name).strip()

employee_df['Name'] = employee_df['Name'].apply(clean_name)
employee_df['Type of Employee'] = np.where(employee_df['Type of Employee'] == 'Teaching', 'T', 'NT')
employee_df['Type of Employee'] = np.where(employee_df['Type of Employee'] == 'Teaching', 'T', 'NT')


# Save to new Excel file
employee_df.to_excel("employee.xlsx", index=False)
# employee_df.head()

#statistical analysis
# print("Official Email:")
# print(f"  Total entries: {employee_df['Official Email'].shape[0]}")
# print(f"  Unique: {employee_df['Official Email'].nunique()}")
# print(f"  Duplicates: {employee_df.duplicated('Official Email').sum()}")

# duplicate_emails = employee_df[employee_df.duplicated('Official Email', keep=False)]
# print("  Duplicate Email Values:")
# for index, email in duplicate_emails[['Official Email']].iterrows():
#     print(f"  {index} - {email['Official Email']}")

# print("\nName:")
# print(f"  Total entries: {employee_df['Name'].shape[0]}")
# print(f"  Unique: {employee_df['Name'].nunique()}")
# print(f"  Duplicates: {employee_df.duplicated('Name').sum()}")

# duplicate_names = employee_df[employee_df.duplicated('Name', keep=False)]
# print("  Duplicate Name Values:")
# for index, name in duplicate_names[['Name']].iterrows():
#     print(f"  {index} - {name['Name']}")








FileNotFoundError: [Errno 2] No such file or directory: 'Employeee Details.xlsx'

# New Employee Data

In [13]:
# --- Initialization Cell ---

import pandas as pd
import numpy as np
import logging

# Setup logging
log_file = "employee.log"
logging.basicConfig(filename=log_file, level=logging.INFO, filemode='w', force=True)

# Load Excel
df = pd.read_excel("/Users/rajesmanna/Documents/Phd/backend/srcipts/employee details ss.xlsx")

df["Official Email"].fillna(df["Personal Email"], inplace=True)
df["Official Email"] = df["Official Email"].str.replace(" ", "", regex=False)
df = df.iloc[:-1]
# Drop unused columns if they exist

# --- Clean column names ---
# Remove leading/trailing/double spaces
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)
df.drop(columns=['Personal Email'], inplace=True, errors='ignore')

df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Official Email"].fillna(df["Personal Email"], inplace=True)


Unnamed: 0,Salutation,Name,Gender,Date of Birth,Mobile Number,Type of Employee,Nature of Employment,Organization Unit,Post,Date of Joining,Official Email
0,Mr.,Muzaffar,Male,1965-05-01,9541047030,Non-Teaching,Permanent,Department of Information Technology,Sr. Technician,1994-04-01,muzaffar123@gmail.com
1,Mr.,Mohammad Ashraf,Male,1972-05-05,7006984839,Non-Teaching,Permanent,Department of Information Technology,Sr. Technician,1991-01-05,ashraf1232@gmail.com
2,Mr.,Manzoor,Male,1968-04-09,9797721521,Non-Teaching,Permanent,Department of Information Technology,Sr. Technician,1991-01-04,manzoor.nit3@gmail.com
3,Ms.,Insha,Female,1993-05-25,7006752609,Non-Teaching,Permanent,ERP,Technical Assistant,2022-12-17,cseinsha@gmail.com
4,Mr.,Kalim Dar,Male,1988-05-16,9906481650,Non-Teaching,Permanent,Department of Information Technology,Technical Assistant,2021-12-13,kaleem@nitsri.ac.in
...,...,...,...,...,...,...,...,...,...,...,...
467,Ms.,,Female,,9103862679,Non-Teaching,Temporary,Examination,Senior Office Attendant,,mirsaimahassan99@gmail.com
468,Dr.,,Female,,3622315735,Teaching,Temporary,Department of Computer Science and Engineering,Assistant professor,,ifrahraof21@gmail.com
469,Dr.,,Female,,7889473021,Teaching,Temporary,Department of Computer Science and Engineering,Assistant professor,,mehkhan27@gmail.com
470,Dr.,,Female,,9541110299,Teaching,Temporary,Department of Computer Science and Engineering,Assistant professor,,sadiahussain.hussain@gmail.com


In [15]:
# --- Logging Cell ---

log = []

# --- Null Entries ---
log.append("\n--- Null Entries (Column-wise) ---")
null_counts = df.isnull().sum()
for col, count in null_counts.items():
    log.append(f"{col}: {count}")

# --- Duplicate Entries ---
log.append("\n--- Duplicate Entries (Column-wise) ---")
for col in df.columns:
    dup_series = df[col][df[col].duplicated(keep=False)]
    if not dup_series.empty:
        log.append(f"\nColumn: {col} - {dup_series.duplicated().sum()} duplicates")
        log.append(f"Duplicate values:\n{dup_series.dropna().unique().tolist()}")

# --- Unique Entries ---
log.append("\n--- Unique Entries (Column-wise) ---")
for col in df.columns:
    uniques = df[col].dropna().unique()
    log.append(f"\nColumn: {col} - {len(uniques)} unique values")
    log.append(f"Values: {uniques[:10]}{' ...' if len(uniques) > 10 else ''}")  # Limit to first 10

# Save logs to file
for entry in log:
    logging.info(entry)

print("Analysis complete. Results saved in 'employee.log'.")


Analysis complete. Results saved in 'employee.log'.


In [14]:

# Normalize whitespace in columns
df['Name'] = df['Name'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
df['Organization Unit'] = df['Organization Unit'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
df['Post'] = df['Post'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()

# Department and administrative units mapping
unit_mapping = {
    # Departments
    'Department of Information Technology': "Information Technology",
    'Department of Computer Science and Engineering': "Computer Science and Engineering",
    'Department  of Chemical Engineering': "Chemical Engineering",
    'Department of Materials and Metallurgical Engineering': "Materials and Metallurgical Engineering",
    'Department of Electrical Engineering': "Electrical Engineering",
    'Department of Electronics and Communication Engineering': "Electronics and Communication Engineering",
    'Department of  Mechanical Engineering': "Mechanical Engineering",
    'Department of Mathematics': "Mathematics",
    'Department of Civil Engineering': "Civil Engineering",
    'Department of Chemistry': "Chemistry",
    'Department of Physics': "Physics",
    'Department of Humanities Social Sciences and Management': "Humanities Social Sciences and Management",

    # Administrative units
    'ERP': "ERP",
    'Planning and Development': "Planning and Development",
    'Personnel Department': "Personnel Department",
    'Administration': "Administration",
    'Accounts': "Accounts",
    'Central Workshop': "Central Workshop",
    'Computer Service Centre': "Computer Service Centre",
    'Central Research Facility Centre': "Central Research Facility Centre",
    'Academic Affairs': "Academic Affairs",
    'Medical Unit': "Medical Unit",
    'Registrar Office': "Registrar Office",
    'Water Resources Centre': "Water Resources Centre",
    'Mess office': "Mess office",
    'Library': "Library",
    'legal cell': "Legal Cell",
    'Examination': "Examination",
    'Electrical Maintenance Division': "Electrical Maintenance Division",
    'Hostel Office': "Hostel Office",
    'Sports and Student activities': "Sports and Student Activities",
    'Central Purchase Unit': "Central Purchase Unit",
    'Security': "Security",
    'University Campus': "University Campus"
}

# Apply unit mapping
df['Organization Unit'] = df['Organization Unit'].replace(unit_mapping)

# Designation mapping
post_mapping = {
    'Sr. Technician': "Sr. Technician",
    'Technical Assistant': "Technical Assistant",
    'Senior Supdtt': "Others",
    'Senior Technician': "Technician",
    'Tech. Asstt. SG I': "Technical Assistant",
    'Technician -SG-I': "Technician",
    'Technician': "Technician",
    'Senior Office Attendant': "Others",
    'Assistant Registrar': "Assistant Registrar",
    'Office Attendant SGII': "Others",
    'Deputy Registrar': "Deputy Registrar",
    'Stenographer': "Stenographer",
    'SG-II': "Others",
    'Senior Technical Assistant': "Senior Technical Assistant",
    'Superintendent': "Superintendent",
    'Asstt. SG-I': "Assistant Grade I",
    'Orderly': "Others",
    'Senior Assistant': "Senior Assistant",
    'Technical Officer': "Technical Officer",
    'Senior Attendant': "Others",
    'Junior Assistant': "Junior Assistant",
    'Tech . Asstt.-SG II': "Technical Assistant",
    'Sport and Student Activities Asst. SG I': "Others",
    'Office Attendant': "Others",
    'Senior Technical Assistant SGII': "Senior Technical Assistant",
    'Sr Asstt SG I': "Senior Assistant",
    'Junior Engineer': "Junior Engineer",
    'Assistant SG II': "Assistant Grade I",
    'Attendant SG-I': "Others",
    'Works Asstt. SG II (TL)': "Others",
    'Assistant Professor': "Assistant Professor",
    'Assistant  professor': "Assistant Professor",
    'Professor': "Professor",
    'Associate Professor': "Associate Professor",
    'Registrar': "Others"
}

# Apply post mapping
df['Post'] = df['Post'].replace(post_mapping)

df[['Name', 'Organization Unit', 'Post']].head()


Unnamed: 0,Name,Organization Unit,Post
0,Muzaffar,Information Technology,Sr. Technician
1,Mohammad Ashraf,Information Technology,Sr. Technician
2,Manzoor,Information Technology,Sr. Technician
3,Insha,ERP,Technical Assistant
4,Kalim Dar,Information Technology,Technical Assistant


In [21]:
# Remove entries where 'Name' is null or empty
# Ensure column is string, strip spaces, and drop real NaNs or string "nan"
df['Name'] = df['Name'].astype(str).str.strip()
df = df[~df['Name'].isin(['', 'nan', 'NaN', 'None'])]  # Remove empty or 'nan'-like strings


# Normalize 'Official Email' to lowercase
if 'Official Email' in df.columns:
    df['Official Email'] = df['Official Email'].astype(str).str.replace(r'\s+', '', regex=True)
    df['Official Email'] = df['Official Email'].str.lower()

df.to_excel("employee.xlsx", index=False)
df

Unnamed: 0,Salutation,Name,Gender,Date of Birth,Mobile Number,Type of Employee,Nature of Employment,Organization Unit,Post,Date of Joining,Official Email
0,Mr.,Muzaffar,Male,1965-05-01,9541047030,Non-Teaching,Permanent,Information Technology,Sr. Technician,1994-04-01,muzaffar123@gmail.com
1,Mr.,Mohammad Ashraf,Male,1972-05-05,7006984839,Non-Teaching,Permanent,Information Technology,Sr. Technician,1991-01-05,ashraf1232@gmail.com
2,Mr.,Manzoor,Male,1968-04-09,9797721521,Non-Teaching,Permanent,Information Technology,Sr. Technician,1991-01-04,manzoor.nit3@gmail.com
3,Ms.,Insha,Female,1993-05-25,7006752609,Non-Teaching,Permanent,ERP,Technical Assistant,2022-12-17,cseinsha@gmail.com
4,Mr.,Kalim Dar,Male,1988-05-16,9906481650,Non-Teaching,Permanent,Information Technology,Technical Assistant,2021-12-13,kaleem@nitsri.ac.in
...,...,...,...,...,...,...,...,...,...,...,...
428,Dr.,वसुंधरा सिंह,Female,1993-04-09,9451230799,Teaching,Temporary,Materials and Metallurgical Engineering,Assistant Professor,2024-03-21,bravevasu25@gmail.com
429,Dr.,Mehak,Female,1993-01-02,9797722278,Teaching,Temporary,Materials and Metallurgical Engineering,Assistant Professor,2024-04-01,khanmehak371@gmail.com
430,Dr.,Sirisha Nallakukkala,Female,1984-11-20,8125899138,Teaching,Temporary,Department of Chemical Engineering,Assistant Professor,2024-04-01,sirisha.nallakukkala@gmail.com
461,Dr.,Takbeer Salati,Female,1994-08-30,8899200353,Teaching,Temporary,Humanities Social Sciences and Management,Assistant Professor,,btakbeer@gmail.com
