## 1. Import necessary packages
##### pip install pandas matplotlib seaborn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Import and Read the dataset 

In [None]:
# Location of the employee CSV (served from GitHub).
dataset = 'https://github.com/sivasathish889/employee_data/raw/main/employee_data.csv'

# Read the CSV into a DataFrame; pandas fetches the URL directly.
df = pd.read_csv(filepath_or_buffer=dataset)

# (rows, columns) of the loaded table -- displayed as the cell output.
df.shape


## 3. Get the Information using info() 

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()

## 4. Check the missing values

In [None]:
# Count the null entries in every column and report them.
missing_counts = df.isna().sum()
print("Missing values:\n", missing_counts)


## 5. Check the duplicate values

In [None]:
# Flag rows that repeat an earlier row, then count the flags
# (the count is displayed as the cell output).
duplicate_mask = df.duplicated()
duplicate_mask.sum()


## 6. Group by department and describe the YearsAtCompany column

In [None]:
# Descriptive statistics of YearsAtCompany, one row per department.
years_by_department = df.groupby('Department')['YearsAtCompany']
ss = years_by_department.describe()
ss

## 7. Average performance by department

In [None]:

# Mean PerformanceScore per department, highest-scoring department first.
dept_perf = df.groupby('Department')['PerformanceScore'].mean().sort_values(ascending=False)
print("Average performance by department:\n", dept_perf)

# NOTE: filtering with df['Department'] == 'PerformanceScore' compares the
# department values against a *column name* and therefore selects no rows.
# To inspect a single department, filter on an actual department value, e.g.
#   df[df['Department'] == dept_perf.index[0]]   # top-scoring department


## 8. Plot performance by department

In [None]:

# Horizontal bar chart of the per-department mean performance score.
plt.figure(figsize=(10,6))
ax = sns.barplot(
    x=dept_perf.values,
    y=dept_perf.index,
    palette='viridis',
    hue=dept_perf.index,
)
# Label the chart through the Axes that seaborn drew on.
ax.set_title('Average Performance by Department')
ax.set_xlabel('Performance Score')
ax.set_ylabel('Department')
plt.show()

## 9. Productivity vs Experience


In [None]:
# Scatter of tenure against completed projects, coloured by performance score.
plt.figure(figsize=(10,6))
ax = sns.scatterplot(
    data=df,
    x='YearsAtCompany',
    y='ProjectsCompleted',
    hue='PerformanceScore',
    palette='coolwarm',
)
ax.set_title('Experience vs Projects Completed')
ax.set_xlabel('Years at Company')
ax.set_ylabel('Projects Completed')
plt.show()

## 10. Monthly hours vs Performance


In [None]:
# Distribution of monthly hours worked for each performance score.
plt.figure(figsize=(10,6))
ax = sns.boxplot(
    data=df,
    x='PerformanceScore',
    y='MonthlyHoursWorked',
    hue='PerformanceScore',
    palette='Set2',
)
ax.set_title('Monthly Hours Worked by Performance Score')
ax.set_xlabel('Performance Score')
ax.set_ylabel('Monthly Hours Worked')
plt.show()


## 11. Correlation matrix


In [None]:
# Pairwise correlations between the selected columns, drawn as an
# annotated heatmap.
corr_columns = [
    'Age',
    'YearsAtCompany',
    'PerformanceScore',
    'MonthlyHoursWorked',
    'ProjectsCompleted',
]
corr_matrix = df[corr_columns].corr()

plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
plt.title('Correlation Matrix')
plt.show()

## 12. High performers analysis and Save the summary

In [None]:

# Employees whose PerformanceScore is 4 or above, counted per department.
high_perf = df.loc[df['PerformanceScore'] >= 4]
high_perf_by_dept = high_perf['Department'].value_counts()
print("High performers breakdown by department:\n", high_perf_by_dept)

# Per-department summary: mean score, total projects, mean monthly hours.
summary = (
    df.groupby('Department')
    .agg(
        PerformanceScore=('PerformanceScore', 'mean'),
        ProjectsCompleted=('ProjectsCompleted', 'sum'),
        MonthlyHoursWorked=('MonthlyHoursWorked', 'mean'),
    )
    .reset_index()
)

# Write the summary to disk without the positional index column.
summary.to_csv('department_summary.csv', index=False)
