In [12]:
# Attach Google Drive to this Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import pandas as pd



In [15]:
# Read the sample CSV from the mounted Drive folder into `df`,
# then show its (rows, columns) as the cell output.
DATASET_PATH = '/content/drive/MyDrive/xevensol/mlcode/sample_dataset_1000.csv'
df = pd.read_csv(DATASET_PATH)
df.shape



(1000, 8)

In [16]:
# Preview the first ten rows of the raw frame.
df.iloc[:10]

Unnamed: 0,ID,Name,Age,Gender,Salary,Joining Date,Is Active,Comments
0,1,Paul,25,Female,90041.86,2017-01-31,False,Creative
1,2,Paul,31,Male,90285.02,2019-05-29,False,Needs Improvement
2,3,Rachel,30,Male,84321.65,2022-12-14,False,Hardworking
3,4,Bob,45,Male,68189.99,2023-09-05,True,Fast Learner
4,5,Eve,34,Male,35428.35,2018-05-19,True,Dedicated
5,6,Hannah,23,Other,33952.28,2021-02-02,False,Fast Learner
6,7,Oscar,23,Female,72692.5,2021-04-02,False,Needs Improvement
7,8,Tina,37,Other,76527.18,2022-04-21,True,Consistent
8,9,Steve,22,Male,90689.75,2021-08-26,True,Consistent
9,10,Frank,31,Other,100513.77,2018-01-12,True,Consistent


In [17]:
# Report the dtype pandas inferred for every column, under a heading line.
print("Data types of each column:", df.dtypes, sep="\n")

Data types of each column:
ID                int64
Name             object
Age               int64
Gender           object
Salary          float64
Joining Date     object
Is Active          bool
Comments         object
dtype: object


In [6]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(1000, 8)

In [7]:
# Show the dtype of each column as a Series indexed by column name.
df.dtypes

Unnamed: 0,0
ID,int64
Name,object
Age,int64
Gender,object
Salary,float64
Joining Date,object
Is Active,bool
Comments,object


In [18]:
# Count the missing entries in each column (all zeros means nothing to impute).
df.isna().sum()

Unnamed: 0,0
ID,0
Name,0
Age,0
Gender,0
Salary,0
Joining Date,0
Is Active,0
Comments,0


In [8]:

# Impute missing values column by column, then persist the result.
#
# 'Joining Date' is intentionally left as text here (no datetime conversion);
# the preview shows it in 'YYYY-MM-DD' form, for which the smallest string is
# also the earliest date.

# Numeric columns: fill gaps with the column median.
for numeric_col in ('Age', 'Salary'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Categorical / boolean columns: fill gaps with the most frequent value.
# mode() returns a Series; [0] picks its first entry.
for categorical_col in ('Name', 'Gender', 'Is Active'):
    df[categorical_col] = df[categorical_col].fillna(df[categorical_col].mode()[0])

# Joining Date: fill gaps with the earliest date present in the column.
df['Joining Date'] = df['Joining Date'].fillna(df['Joining Date'].min())

# Comments: there is no sensible statistic for free text, so use a placeholder.
df['Comments'] = df['Comments'].fillna("Unknown")

# Actually write the file that the message below reports; previously the cell
# announced a save that never happened.
df.to_csv("cleaned_dataset.csv", index=False)

print("Missing values have been filled and saved in 'cleaned_dataset.csv'")


Missing values have been filled and saved in 'cleaned_dataset.csv'


In [19]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the categorical columns.
#
# Each column gets its own encoder: a LabelEncoder only remembers the classes
# from its most recent fit, so sharing one instance across columns would
# discard the Gender mapping and make inverse_transform on Gender impossible.
gender_encoder = LabelEncoder()
is_active_encoder = LabelEncoder()

df['Gender'] = gender_encoder.fit_transform(df['Gender'])  # Encode Gender column
df['Is Active'] = is_active_encoder.fit_transform(df['Is Active'])  # Encode Is Active column

# Keep the original name available; as before, it refers to the encoder that
# was fitted last (the one for 'Is Active').
le = is_active_encoder

print(df.head())  # Show first few rows


   ID    Name  Age  Gender    Salary Joining Date  Is Active  \
0   1    Paul   25       0  90041.86   2017-01-31          0   
1   2    Paul   31       1  90285.02   2019-05-29          0   
2   3  Rachel   30       1  84321.65   2022-12-14          0   
3   4     Bob   45       1  68189.99   2023-09-05          1   
4   5     Eve   34       1  35428.35   2018-05-19          1   

            Comments  
0           Creative  
1  Needs Improvement  
2        Hardworking  
3       Fast Learner  
4          Dedicated  


In [20]:
import pandas as pd

# One-hot encode 'Is Active' into 'Is_Active_<value>' indicator columns and
# replace the original column with them.
#
# The transform is guarded so the cell can be re-run safely: after the first
# run 'Is Active' no longer exists in `df`, and reading it again would raise
# a KeyError.
if 'Is Active' in df.columns:
    # Apply pd.get_dummies for one-hot encoding
    encoded_df = pd.get_dummies(df['Is Active'], prefix='Is_Active')

    # Concatenate with original dataset and drop old column
    df = pd.concat([df, encoded_df], axis=1).drop(columns=['Is Active'])

df.to_csv("cleaned_dataset.csv", index=False)  # Save updated dataset

print("Is Active converted to One-Hot Encoding and saved!")

Is Active converted to One-Hot Encoding and saved!


In [11]:
# Preview the first ten rows after encoding.
df.iloc[:10]

Unnamed: 0,ID,Name,Age,Gender,Salary,Joining Date,Comments,Is_Active_0,Is_Active_1
0,1,Paul,25,0,90041.86,2017-01-31,Creative,True,False
1,2,Paul,31,1,90285.02,2019-05-29,Needs Improvement,True,False
2,3,Rachel,30,1,84321.65,2022-12-14,Hardworking,True,False
3,4,Bob,45,1,68189.99,2023-09-05,Fast Learner,False,True
4,5,Eve,34,1,35428.35,2018-05-19,Dedicated,False,True
5,6,Hannah,23,2,33952.28,2021-02-02,Fast Learner,True,False
6,7,Oscar,23,0,72692.5,2021-04-02,Needs Improvement,True,False
7,8,Tina,37,2,76527.18,2022-04-21,Consistent,False,True
8,9,Steve,22,1,90689.75,2021-08-26,Consistent,False,True
9,10,Frank,31,2,100513.77,2018-01-12,Consistent,False,True
