In [None]:
import pandas as pd
import numpy as np

# Employee data analysis: gender counts, average pay and gender pay gaps,
# broken down by department and by location.
data = pd.read_csv('https://raw.githubusercontent.com/arora123/Data/master/emp-data.csv')

# Check for non-numeric entries in Salary before calculations
if not pd.api.types.is_numeric_dtype(data['Salary']):
    print("Warning: Salary column contains non-numeric entries. Removing dollar signs and converting to numeric values.")
    # Raw string: inside a character class `$` is already literal, and the
    # previous '\$' in a normal string was an invalid escape sequence.
    data['Salary'] = data['Salary'].str.replace(r'[$,]', '', regex=True).astype(float)

# Gender count
gender_count = data['Gender'].value_counts()
print(gender_count)

# Department-wise gender count
dept_gender_count = data.groupby(['Department', 'Gender']).size()
print(dept_gender_count)

# Resolve the location column once and reuse it below. 'Loc' is tried first
# because that is the name the location analyses were hard-coded to; the
# other names are fallbacks for differently-labelled files.
loc_column = next(
    (col for col in ('Loc', 'Location', 'City') if col in data.columns),
    None,
)
if loc_column is None:
    raise KeyError(
        "No location column found; expected one of 'Loc', 'Location', 'City'."
    )

# Location-wise gender count
loc_gender_count = data.groupby([loc_column, 'Gender']).size()
print(loc_gender_count)

# Rows with a known salary; used for both average-salary rankings so the
# department and location figures are computed over the same rows.
valid_data = data[data['Salary'].notna()]

# Department with highest average salary
avg_dept_pay = valid_data.groupby('Department')['Salary'].mean()
print(avg_dept_pay.idxmax())

# Location with highest average salary
avg_loc_pay = valid_data.groupby(loc_column)['Salary'].mean()
print(avg_loc_pay.idxmax())

# Rating distribution (percentage)
rating_count = data['Rating'].value_counts(normalize=True) * 100
print(rating_count)

# Department-wise gender pay gap: absolute difference (male mean - female mean)
dept_gender_pay_gap = data.groupby(['Department', 'Gender'])['Salary'].mean().unstack()
gender_pay_gap = dept_gender_pay_gap['Male'] - dept_gender_pay_gap['Female']
print(gender_pay_gap)

# Location-wise gender pay gap: percentage of the male mean salary
loc_gender_pay_gap = data.groupby([loc_column, 'Gender'])['Salary'].mean().unstack()

# Defensive filter: keep only numeric columns before the arithmetic below.
numeric_data = loc_gender_pay_gap.select_dtypes(include=[np.number])

# NOTE: this rebinds `gender_pay_gap`; the department-wise result above is
# only available from its printed output after this point.
gender_pay_gap = (numeric_data['Male'] - numeric_data['Female']) / numeric_data['Male'] * 100
print(gender_pay_gap)


Gender
Male      501
Female    471
Name: count, dtype: int64
Department                Gender
Accounting                Female    30
                          Male      40
Business Development      Female    42
                          Male      38
Engineering               Female    38
                          Male      38
Human Resources           Female    44
                          Male      39
Legal                     Female    36
                          Male      51
Marketing                 Female    33
                          Male      35
Product Management        Female    42
                          Male      49
Research and Development  Female    39
                          Male      34
Sales                     Female    38
                          Male      41
Services                  Female    42
                          Male      38
Support                   Female    37
                          Male      43
Training                  Female    38
         