In [11]:
# Mount Google Drive inside the Colab runtime so that CSV files stored on
# Drive can be read through the local filesystem.
from google.colab import drive

# Directory under which the Drive contents are exposed.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:

#importing necessary libraries and plugins

import pandas as pd
import datetime
import re
import shutil

# Load the training dataset from the mounted Google Drive folder.
# `path` is the CSV location; `data` is the DataFrame every later step works on.
path = '/content/drive/My Drive/Colab Notebooks/train.csv'
data = pd.read_csv(filepath_or_buffer=path)

def extract_numeric_value(text):
    """Return the first number found in ``text`` as a float.

    The value is converted with ``str()`` and searched for the first run of
    digits, optionally with a decimal part (e.g. ``"19.67 kmpl"`` -> ``19.67``,
    ``"1582 CC"`` -> ``1582.0``).

    Returns ``None`` when ``text`` is missing (``None``/NaN) or contains no
    digits at all (e.g. ``"null bhp"``).
    """
    # Missing values carry no number to extract.
    if not pd.notna(text):
        return None
    # The decimal alternative is listed first so "19.67" is not cut to "19".
    first_number = re.search(r'(\d+\.\d+|\d+)', str(text))
    return float(first_number.group(1)) if first_number else None


# First Task: handle missing values

# Count the missing (null) values in each column of train.csv and show them,
# so the drop/impute decisions below can be checked against the data.
missing_values = data.isnull().sum()
print(missing_values)

# A column with a substantial amount of missing data can either be removed or
# filled in. 'New_Price' is removed here instead of being imputed.
# Reassigning (rather than inplace=True) keeps the step explicit.
data = data.drop(columns=["New_Price"])


# Second Task: remove the units (kmpl from "Mileage", CC from "Engine",
# bhp from "Power") so the columns become numeric.
# This runs before the imputation below because a median can only be computed
# on numeric values. extract_numeric_value accepts both whole and decimal
# numbers; the earlier decimal-only pattern turned whole-number readings such
# as "74 bhp" into NaN, which the median fill then silently overwrote.
# astype(float) turns the helper's None results into NaN.
for unit_column in ["Mileage", "Engine", "Power"]:
    data[unit_column] = data[unit_column].apply(extract_numeric_value).astype(float)


# First Task (continued): impute the remaining missing values with the median.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on data[col] is chained assignment, which pandas deprecates and which has no
# effect on `data` when Copy-on-Write is enabled.
imputed_columns = ["Mileage", "Engine", "Power", "Seats"]
for imputed_column in imputed_columns:
    data[imputed_column] = data[imputed_column].fillna(data[imputed_column].median())

# Sanity check: imputation must leave no gaps in these columns.
assert not data[imputed_columns].isnull().any().any(), "imputation left missing values"


# Third Task: one-hot encode the categorical columns "Fuel_Type" and
# "Transmission". dtype=int keeps the indicators as 0/1; without it newer
# pandas versions produce True/False columns.
data = pd.get_dummies(data, columns=["Fuel_Type", "Transmission"], dtype=int)


# Fourth Task: add a derived feature, the current age of the car, computed as
# the current year minus "Year".
# NOTE: the current year is read from the system clock, so this column depends
# on when the notebook is run.
current_year = datetime.datetime.now().year
data['Current_Age'] = current_year - data['Year']

# Displaying the modified dataset
print(data.head())


# Saving the DataFrame 'data' to a CSV file in Google Drive
data.to_csv('/content/drive/My Drive/Colab Notebooks/train_AfterTaskProcessed.csv', index=False)



   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  Price  \
0              41000      First    19.67  1582.0  126.20    5.0  12.50   
1              46000      First    18.19  1199.0   88.70    5.0   4.50   
2              87000      First    20.77  1248.0   88.76    7.0   6.00   
3              40670     Second    15.20  1968.0  140.80    5.0  17.74   
4              86999      First    23.08  1461.0   63.10    5.0   3.50   

   Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Petrol  \
0                 1                   0                 0   
1                 