Step a: Handling Missing Values

In [27]:
import pandas as pd

# Load the dataset
df = pd.read_csv('/content/train.csv')

# Check for missing values
missing_values = df.isnull().sum()

# Impute missing values
for column in df.columns:
    if df[column].dtype == 'object':
        # Impute categorical columns with mode
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        # Impute numerical columns with mean
        df[column].fillna(df[column].mean(), inplace=True)

# Verify that there are no missing values
print(df.isnull().sum())


Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
New_Price            0
Price                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


Step b: Removing Units from Attributes

In [28]:
# Remove units from specified columns
df['Mileage'] = df['Mileage'].astype(str).str.replace(' kmpl', '').str.replace(' km/kg', '').astype(float)
df['Engine'] = df['Engine'].astype(str).str.replace(' CC', '').astype(float)
df['Power'] = df['Power'].astype(str).str.replace(' bhp', '').astype(float)

# For New_Price, handle both Lakh and Crore
df['New_Price'] = df['New_Price'].astype(str).str.replace(' Cr', '').str.replace(' Lakh', '').astype(float)
# Convert Crore values to Lakh (1 Crore = 100 Lakh)
mask = df['New_Price'].astype(str).str.contains('Cr')
df.loc[mask, 'New_Price'] *= 100

# Verify the changes
print(df[['Mileage', 'Engine', 'Power', 'New_Price']].head())


   Mileage  Engine   Power  New_Price
0    19.67  1582.0  126.20       4.78
1    13.00  1199.0   88.70       8.61
2    20.77  1248.0   88.76       4.78
3    15.20  1968.0  140.80       4.78
4    23.08  1461.0   63.10       4.78


Step c: One-Hot Encoding Categorical Variables

In [29]:
# One-hot encode categorical variables with integer dtype (0/1)
df = pd.get_dummies(df, columns=['Fuel_Type', 'Transmission'], drop_first=True, dtype=int)

# Verify the changes
print(df.head())

   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  New_Price  \
0              41000      First    19.67  1582.0  126.20    5.0       4.78   
1              46000      First    13.00  1199.0   88.70    5.0       8.61   
2              87000      First    20.77  1248.0   88.76    7.0       4.78   
3              40670     Second    15.20  1968.0  140.80    5.0       4.78   
4              86999      First    23.08  1461.0   63.10    5.0       4.78   

   Price  Fuel_Type_Electric  Fuel_Type_Petrol  Transmission_Manual  
0  12.50                   0                 0

Step d: Creating a New Feature

In [30]:
# Create a new feature 'Car_Age'
df['Car_Age'] = 2024 - df['Year']

# Verify the new feature
print(df[['Year', 'Car_Age']].head())


   Year  Car_Age
0  2015        9
1  2011       13
2  2012       12
3  2013       11
4  2013       11


Step e: Performing Data Operations

In [31]:
# Select specific columns
selected_columns = df[['Name', 'Location', 'Year', 'Price']]

# Filter rows where Price is greater than 10 lakhs
filtered_df = df[df['Price'] > 10]

# Rename the 'Price' column to 'Selling_Price'
renamed_df = df.rename(columns={'Price': 'Selling_Price'})

# Calculate Car_Age before creating Price_Per_Year
current_year = 2024  # You can adjust this based on your needs
mutated_df = df.copy()
mutated_df['Car_Age'] = current_year - mutated_df['Year']
mutated_df['Price_Per_Year'] = mutated_df['Price'] / mutated_df['Car_Age']

# Arrange by 'Price' in descending order
arranged_df = df.sort_values(by='Price', ascending=False)

# Summarize by calculating the average price per location
summary_df = df.groupby('Location')['Price'].mean().reset_index()

# Display the results
print("Selected Columns:\n", selected_columns.head())
print("\nFiltered DataFrame:\n", filtered_df.head())
print("\nRenamed DataFrame:\n", renamed_df.head())
print("\nMutated DataFrame:\n", mutated_df.head())
print("\nArranged DataFrame:\n", arranged_df.head())
print("\nSummary DataFrame:\n", summary_df.head())

Selected Columns:
                                Name    Location  Year  Price
0  Hyundai Creta 1.6 CRDi SX Option        Pune  2015  12.50
1                      Honda Jazz V     Chennai  2011   4.50
2                 Maruti Ertiga VDI     Chennai  2012   6.00
3   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013  17.74
4            Nissan Micra Diesel XV      Jaipur  2013   3.50

Filtered DataFrame:
     Unnamed: 0                               Name    Location  Year  \
0            1   Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
3            4    Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
5            7  Toyota Innova Crysta 2.8 GX AT 8S      Mumbai  2016   
11          13   Land Rover Range Rover 2.2L Pure       Delhi  2014   
12          14     Land Rover Freelander 2 TD4 SE        Pune  2012   

    Kilometers_Driven Owner_Type  Mileage  Engine  Power  Seats  New_Price  \
0               41000      First    19.67  1582.0  126.2    5.0       4.78   
3         