In [68]:
import pandas as pd
import numpy as np


## IMPORTING THE DATASET


In [69]:
task = pd.read_csv('task.csv')   
task

Unnamed: 0,user_id,age,salary,department,join_date,city,performance_score,experience_years,is_active
0,user_0,-5,₹50000,HR,2020/12/01,DELHI,3,five,True
1,U1001,30,45000,Finance,15/02/2021,Delhi,3,1,yes
2,U1002,30,25000,HR,,Delhi,excellent,10,False
3,,unknown,,HR,invalid,Hyderabad,excellent,5,yes
4,,,120000.75,IT,,DELHI,,-1,False
...,...,...,...,...,...,...,...,...,...
995,,60,₹50000,HR,15/02/2021,DELHI,2,five,
996,U1996,,,IT,15/02/2021,Mumbai,,5,no
997,user_997,unknown,25000,HR,invalid,Mumbai,3,5,True
998,U1998,-5,25000,,15/02/2021,Bangalore,1,0,True


## TASK-1

In [70]:
task.shape

(1000, 9)

In [71]:
task.columns

Index(['user_id', 'age', 'salary', 'department', 'join_date', 'city',
       'performance_score', 'experience_years', 'is_active'],
      dtype='object')

In [72]:
task.dtypes

user_id              object
age                  object
salary               object
department           object
join_date            object
city                 object
performance_score    object
experience_years     object
is_active            object
dtype: object

## TASK-2

In [73]:
missing_percentage = task.isnull().mean() * 100
missing_percentage

user_id              51.0
age                  11.4
salary               31.1
department           14.1
join_date            21.9
city                 14.9
performance_score    10.1
experience_years     12.7
is_active            20.0
dtype: float64

## TASK-3

In [74]:
task.apply(lambda x: x.map(type).nunique())

user_id              2
age                  2
salary               2
department           2
join_date            2
city                 2
performance_score    2
experience_years     2
is_active            2
dtype: int64

## TASK-4

In [75]:
task['user_id'] = task['user_id'].replace('', np.nan).str.upper()
task['user_id']

0        USER_0
1         U1001
2         U1002
3           NaN
4           NaN
         ...   
995         NaN
996       U1996
997    USER_997
998       U1998
999         NaN
Name: user_id, Length: 1000, dtype: object

## TASK-5

In [76]:
task['age'] = pd.to_numeric(task['age'], errors="coerce")
task.loc[(task['age'] < 0) | (task['age'] > 100), 'age'] = np.nan
task['age']

0       NaN
1      30.0
2      30.0
3       NaN
4       NaN
       ... 
995    60.0
996     NaN
997     NaN
998     NaN
999    30.0
Name: age, Length: 1000, dtype: float64

## TASK-6

In [77]:
task['salary'] = pd.to_numeric(
    task['salary'].astype(str).str.replace('₹', '', regex=False),
    errors='coerce'
)
task['salary']

0       50000.00
1       45000.00
2       25000.00
3            NaN
4      120000.75
         ...    
995     50000.00
996          NaN
997     25000.00
998     25000.00
999     45000.00
Name: salary, Length: 1000, dtype: float64

## TASK-7

In [78]:
task['join_date'] = pd.to_datetime(
    task['join_date'],
    errors='coerce',
    dayfirst=True
)
task['join_date']

0     2020-01-12
1            NaT
2            NaT
3            NaT
4            NaT
         ...    
995          NaT
996          NaT
997          NaT
998          NaT
999          NaT
Name: join_date, Length: 1000, dtype: datetime64[ns]

## TASK-8

In [79]:
task['joining_year'] = task['join_date'].dt.year
task['joining_year']

0      2020.0
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
995       NaN
996       NaN
997       NaN
998       NaN
999       NaN
Name: joining_year, Length: 1000, dtype: float64

## TASK-9

In [80]:
task['department'].str.strip().str.upper()

0           HR
1      FINANCE
2           HR
3           HR
4           IT
        ...   
995         HR
996         IT
997         HR
998        NaN
999         HR
Name: department, Length: 1000, dtype: object

## TASK-10

In [81]:
task['city']=task['city'].str.strip().str.title()
task['city']

0          Delhi
1          Delhi
2          Delhi
3      Hyderabad
4          Delhi
         ...    
995        Delhi
996       Mumbai
997       Mumbai
998    Bangalore
999    Hyderabad
Name: city, Length: 1000, dtype: object

## TASK-11

In [87]:
performance_map= {'poor' : 0,'excellent' : 10}
task['performance_score']=task["performance_score"].map(performance_map)
task['performance_score']=pd.to_numeric(task["performance_score"],errors="coerce")
task['performance_score']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
995   NaN
996   NaN
997   NaN
998   NaN
999   NaN
Name: performance_score, Length: 1000, dtype: float64

## TASK-12

In [83]:
task['experience_years'] = pd.to_numeric(task['experience_years'], errors='coerce')
task.loc[task['experience_years'] < 0, 'experience_years'] = np.nan
task['experience_years']

0       NaN
1       1.0
2      10.0
3       5.0
4       NaN
       ... 
995     NaN
996     5.0
997     5.0
998     0.0
999     NaN
Name: experience_years, Length: 1000, dtype: float64

## TASK-13

In [67]:
task['is_active'] = task['is_active'].map({'yes': True, 'no': False, True: True, False: False})
task['is_active']

0        NaN
1       True
2        NaN
3       True
5      False
       ...  
990      NaN
991     True
996    False
997      NaN
998      NaN
Name: is_active, Length: 491, dtype: object

## TASK-14

In [63]:
task = task.drop_duplicates(subset='user_id')
task

Unnamed: 0,user_id,age,salary,department,join_date,city,performance_score,experience_years,is_active,joining_year
0,USER_0,,,HR,2020-01-12,Delhi,,five,,2020.0
1,U1001,30.0,45000.0,Finance,NaT,Delhi,,1,True,
2,U1002,30.0,25000.0,HR,NaT,Delhi,,10,,
3,,,,HR,NaT,Hyderabad,,5,True,
5,USER_5,,25000.0,IT,NaT,,,0,False,
...,...,...,...,...,...,...,...,...,...,...
990,U1990,,,Sales,NaT,Hyderabad,,-1,,
991,U1991,45.0,,HR,NaT,Mumbai,,-1,True,
996,U1996,,,IT,NaT,Mumbai,,5,False,
997,USER_997,,25000.0,HR,NaT,Mumbai,,5,,


## TASK-15

In [66]:
task.info
task.head

<bound method NDFrame.head of       user_id   age   salary department  join_date       city  \
0      USER_0   NaN      NaN         HR 2020-01-12      Delhi   
1       U1001  30.0  45000.0    Finance        NaT      Delhi   
2       U1002  30.0  25000.0         HR        NaT      Delhi   
3         NaN   NaN      NaN         HR        NaT  Hyderabad   
5      USER_5   NaN  25000.0         IT        NaT        NaN   
..        ...   ...      ...        ...        ...        ...   
990     U1990   NaN      NaN      Sales        NaT  Hyderabad   
991     U1991  45.0      NaN         HR        NaT     Mumbai   
996     U1996   NaN      NaN         IT        NaT     Mumbai   
997  USER_997   NaN  25000.0         HR        NaT     Mumbai   
998     U1998   NaN  25000.0        NaN        NaT  Bangalore   

     performance_score experience_years is_active  joining_year  
0                  NaN             five       NaN        2020.0  
1                  NaN                1      True        