In [1]:
import pandas as pd
import numpy as np

In [2]:
employee_data = {
    "Employee_ID": [101, 102, 103, 104, 104, None, 107, 108, 109, 110],
    "Name": ["Alice", "Bob", "Charlie", "David", "David", "Eve", "Frank", "George", "Henry", "Isla"],
    "Age": [25, "thirty", 45, 28, 28, 35, 22, 29, 102, None],  # invalid string, outlier, missing
    "Salary": [50000, 60000, None, 52000, 52000, "55k", 49000, 48000, 47000, 46000],  # string salary, missing
    "Department": ["HR", "hr", "Engineering", "engineering", "ENGINEERING", "Finance", "Finance", "HR", None, "hr"],  # inconsistent case, missing
    "Join_Date": ["2020-01-15", "15-02-2020", "March 5, 2020", "2020/04/10", None, "2020-07-01", "2020.08.01", "2020-09-10", "2020-10-15", "N/A"],  # various formats, missing
    "Gender": ["F", "M", "Male", "female", "F", None, "M", "Other", "MALE", "FEMALE"],  # inconsistent labels
    "Work_Experience": [1, 3, 20, -1, 3, 5, 2, None, 100, 4],  # negative and extreme values
}
df = pd.DataFrame(employee_data)
df

Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101.0,Alice,25,50000,HR,2020-01-15,F,1.0
1,102.0,Bob,thirty,60000,hr,15-02-2020,M,3.0
2,103.0,Charlie,45,,Engineering,"March 5, 2020",Male,20.0
3,104.0,David,28,52000,engineering,2020/04/10,female,-1.0
4,104.0,David,28,52000,ENGINEERING,,F,3.0
5,,Eve,35,55k,Finance,2020-07-01,,5.0
6,107.0,Frank,22,49000,Finance,2020.08.01,M,2.0
7,108.0,George,29,48000,HR,2020-09-10,Other,
8,109.0,Henry,102,47000,,2020-10-15,MALE,100.0
9,110.0,Isla,,46000,hr,,FEMALE,4.0


In [3]:
df.isnull().sum()


Employee_ID        1
Name               0
Age                1
Salary             1
Department         1
Join_Date          1
Gender             1
Work_Experience    1
dtype: int64

In [4]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [5]:
df.isnull().sum()

Employee_ID        0
Name               0
Age                0
Salary             0
Department         0
Join_Date          0
Gender             0
Work_Experience    0
dtype: int64

In [6]:
df

Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101.0,Alice,25,50000,HR,2020-01-15,F,1.0
1,102.0,Bob,thirty,60000,hr,15-02-2020,M,3.0
3,104.0,David,28,52000,engineering,2020/04/10,female,-1.0
6,107.0,Frank,22,49000,Finance,2020.08.01,M,2.0


In [7]:
df['Employee_ID'] = df['Employee_ID'].astype('Int64')
df['Employee_ID'].dtype
df

Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1.0
1,102,Bob,thirty,60000,hr,15-02-2020,M,3.0
3,104,David,28,52000,engineering,2020/04/10,female,-1.0
6,107,Frank,22,49000,Finance,2020.08.01,M,2.0


In [8]:
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = df['Age'].astype('Int64')

df['Age'].fillna(df['Age'].mode()[0],inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mode()[0],inplace=True)


Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1.0
1,102,Bob,22,60000,hr,15-02-2020,M,3.0
3,104,David,28,52000,engineering,2020/04/10,female,-1.0
6,107,Frank,22,49000,Finance,2020.08.01,M,2.0


In [9]:
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
df['Salary'] = df['Salary'].astype('Int64')

df['Salary'].fillna(df['Salary'].mode()[0],inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mode()[0],inplace=True)


Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1.0
1,102,Bob,22,60000,hr,15-02-2020,M,3.0
3,104,David,28,52000,engineering,2020/04/10,female,-1.0
6,107,Frank,22,49000,Finance,2020.08.01,M,2.0


In [10]:
for i in df.index:
    if (df.loc[i,'Department']).lower().startswith('h'):
        df.loc[i,'Department'] = 'HR'
    elif (df.loc[i,'Department']).lower().startswith('e'):
        df.loc[i,'Department'] = 'Engineering'
    elif (df.loc[i,'Department']).lower().startswith('f'):
        df.loc[i,'Department'] = 'Finance'
    else:
        df.loc[i,'Department'] = df['Department'].mode()[0]

In [11]:
df

Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1.0
1,102,Bob,22,60000,HR,15-02-2020,M,3.0
3,104,David,28,52000,Engineering,2020/04/10,female,-1.0
6,107,Frank,22,49000,Finance,2020.08.01,M,2.0


In [12]:
df['Gender']

for i in df.index:
    if (df.loc[i,'Gender']).upper().startswith('F'):
        df.loc[i,'Gender'] = 'F'
    
    elif (df.loc[i,'Gender']).upper().startswith('M'):
        df.loc[i,'Gender'] = 'M'

    else:
        df.loc[i,'Gender'] = 'O'

df

Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1.0
1,102,Bob,22,60000,HR,15-02-2020,M,3.0
3,104,David,28,52000,Engineering,2020/04/10,F,-1.0
6,107,Frank,22,49000,Finance,2020.08.01,M,2.0


In [13]:
df['Work_Experience'] = pd.to_numeric(df['Work_Experience'], errors='coerce')
df['Work_Experience'] = df['Work_Experience'].astype('Int64')

df['Work_Experience'].fillna(df['Work_Experience'].mode()[0],inplace=True)

for i in df.index:
    if df.loc[i,'Work_Experience']<0:
        df.loc[i,'Work_Experience'] = 0
        

df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Work_Experience'].fillna(df['Work_Experience'].mode()[0],inplace=True)


Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1
1,102,Bob,22,60000,HR,15-02-2020,M,3
3,104,David,28,52000,Engineering,2020/04/10,F,0
6,107,Frank,22,49000,Finance,2020.08.01,M,2


In [15]:
df['Join_Date']

0    2020-01-15
1    15-02-2020
3    2020/04/10
6    2020.08.01
Name: Join_Date, dtype: object

In [17]:
df['Join_Date'] = pd.to_datetime(df['Join_Date'], format='mixed', dayfirst=True)
print(df['Join_Date'].dtype)

datetime64[ns]


In [20]:
print(df.dtypes)
df

Employee_ID                 Int64
Name                       object
Age                         Int64
Salary                      Int64
Department                 object
Join_Date          datetime64[ns]
Gender                     object
Work_Experience             Int64
dtype: object


Unnamed: 0,Employee_ID,Name,Age,Salary,Department,Join_Date,Gender,Work_Experience
0,101,Alice,25,50000,HR,2020-01-15,F,1
1,102,Bob,22,60000,HR,2020-02-15,M,3
3,104,David,28,52000,Engineering,2020-04-10,F,0
6,107,Frank,22,49000,Finance,2020-08-01,M,2


In [28]:
A = np.array([[1, 2], [3, 4]])  
B = np.array([[2, 0], [1, 2]])  
C = A * B  
print(C)  

[[2 0]
 [3 8]]
