# importing libraries

In [1]:
## We first load in packages we will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set_style("whitegrid")
import re

# Loading data

In [2]:
cars = pd.read_csv('../../Data/used_cars.csv')

In [3]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB


In [4]:
df = cars.copy()  #makinga copy of original data set
print('Original Data Shape:',df.shape)


Original Data Shape: (4009, 12)


# Explainatory Data Analysis and data cleaning

## checking for infinite values

In [5]:
def check_infinity_values(data):
    # Select only numeric columns from the dataframe
    numeric_cols = data.select_dtypes(include=[np.number])
    
    # Check for positive and negative infinity values in numeric columns
    infinity_check = numeric_cols.applymap(lambda x: np.isinf(x))
    
    # Sum the infinity values per column
    infinity_count_per_column = infinity_check.sum()

    # Find rows that contain infinity values
    infinity_rows = data[infinity_check.any(axis=1)]
    
    return infinity_count_per_column, infinity_rows


In [6]:
infinity_count_df, infinity_rows_df = check_infinity_values(df)

# Print the number of infinity values in each column
print("Infinity count per column:")
print(infinity_count_df)

# Print the rows containing infinity values
print("\nRows with infinity values:")
print(infinity_rows_df)

Infinity count per column:
model_year    0
dtype: int64

Rows with infinity values:
Empty DataFrame
Columns: [brand, model, model_year, milage, fuel_type, engine, transmission, ext_col, int_col, accident, clean_title, price]
Index: []


  infinity_check = numeric_cols.applymap(lambda x: np.isinf(x))


## Extracting age of the vehicle in years, i.e., years of usage till current year

In [7]:
import datetime   #changing model year to new column with year used by curent year - model year

curr_time = datetime.datetime.now()

def year_used(x):
    return curr_time.year-x

df['years_used']=df['model_year'].apply(year_used)



## Converting price column to int

In [8]:
# Function to remove '$' and ',' and convert the string to float
def price_to_float(x):
    return float(x.replace('$', '').replace(',', ''))

df['price'] = df['price'].apply(price_to_float)


## Filling null values with No in clean_title column 

In [9]:
# df['clean_title'].fillna('No', inplace=True)
# df["clean_title"] = df["clean_title"].replace({"Yes":1,"No":0})
def clean_title_column(data):
    # Fill NaN values with 'No', assign back to the column
    data['clean_title'] = data['clean_title'].fillna('No')
    # Replace "Yes" with 1 and "No" with 0, assign the result back to the column
    data['clean_title'] = data['clean_title'].replace({'Yes': 1, 'No': 0})
    # Explicitly retain downcasting behavior
    data['clean_title'] = data['clean_title'].infer_objects(copy=False)

# Apply the function to df, train, and test datasets
clean_title_column(df)


  data['clean_title'] = data['clean_title'].replace({'Yes': 1, 'No': 0})


## Extracting features from the engine column 
such as engine capacity, horsepower, cylinder count, fuel type, fuel injection and engine type.

In [10]:
def extract_capacity(x):
    # Expression to match the patterns: number followed by "L" or " Liter" 
    pattern = r'(\d+\.?\d*)\s*(L|Liter)'
     # Perform regex search
    match = re.search(pattern, x)
    
    # If a match is found, convert the first group (number) to float
    if match:
        return float(match.group(1))
    else:
        return None  # Return None if no match is found

# Apply the function to the 'engine' column
df['capacity'] = df['engine'].apply(extract_capacity)

# df['capacity'] = df['engine'].str.extract(pattern)[0].astype('float')

In [11]:
def extract_horsepower(x):
    # Expression to match the patterns: number followed by "HP" 
    pattern = r'(\d+(\.\d+)?)HP'
     # Perform regex search
    match = re.search(pattern, x)
    
    # If a match is found, convert the first group (number) to float
    if match:
        return float(match.group(1))
    else:
        return None  # Return None if no match is found

# Apply the function to the 'engine' column
df['horsepower'] = df['engine'].apply(extract_horsepower)



In [12]:
'At least 1 accident or damage reported'
def extract_engine_info(data):
    # Expression to match patterns like 'V6 Cylinder', '6 Cylinder', or 'I4'
    pattern = r'(V\d+|I\d+|Flat \d+|Straight \d+|\d Cylinder)'
    
    # Extract the numeric cylinder count into a new column 'engine_1'
    data['engine_1'] = data['engine'].str.extract(pattern, flags=re.IGNORECASE)
    
    # Remove unwanted words like "Cylinder", "_", and "-"
    data['engine_1'] = data['engine_1'].str.replace(r'Cylinder|_|-', '', regex=True)
    
    # Extract the engine type (V, I, Flat, etc.) and the numeric cylinder count
    data['engine_type'] = data['engine_1'].str.extract(r'([A-Za-z]+)')
    data['cylinder_count'] = data['engine_1'].str.extract(r'(\d+)')
    
    # Convert 'cylinder_count' to numeric (int type)
    data['cylinder_count'] = pd.to_numeric(data['cylinder_count'], errors='coerce')
    
    # Drop the intermediate column 'engine_1'
    data.drop(['engine_1'], axis=1, inplace=True)
    
    return data

# Apply the function to each dataset (df, train, test)
df = extract_engine_info(df)


In [13]:
def extract_fuel_type(fuel_type, engine):
    fuel_types = ['Gasoline', 'Diesel', 'Electric', 'Hybrid', 'Flex Fuel']
    
    # Check if the input is a valid string (i.e., not NaN or None)
    def match_fuel(source):
        if isinstance(source, str):
            # Normalize the fuel type string for case-insensitive matching
            for fuel in fuel_types:
                if fuel.lower() in source.lower():
                    return fuel
        return None
    
    # Check both 'fuel_type' and 'engine' columns
    fuel_match = match_fuel(fuel_type)
    engine_match = match_fuel(engine)
    
    # If either has a match, return the fuel type, otherwise return None
    return fuel_match if fuel_match else engine_match

# Apply the function across the two columns
df['fuel_type'] = df.apply(lambda row: extract_fuel_type(row['fuel_type'], row['engine']), axis=1)

df['fuel_type'] = np.where(df['brand'] == 'Tesla' , 'Electric', df['fuel_type'])


In [14]:
def extract_fi(x):
    #expression to match the fuel injection types
    value = re.search(r'(?i)(PDI|GDI|MPFI|TFSI|DDI|SIDI|GTDI|TSI|VDI)', x)
    return value.group(0) if value else None

# Apply the function to the 'engine' column and create a new column 'fuel_injection'
df['fuel_injection'] = df['engine'].apply(extract_fi)


## Imputing missing columns of accident with 'none reported'

In [15]:
def clean_accident_column(data):
    # Fill NaN values with 'None reported', assign back to the column
    data['accident'] = data['accident'].fillna('None reported')
    # Replace "At least 1 accident or damage reported" with 1 and "None reported" with 0, assign the result back to the column
    data['accident'] = data['accident'].replace({'At least 1 accident or damage reported':1,"None reported":0})
    # Explicitly retain downcasting behavior
    data['accident'] = data['accident'].infer_objects(copy=False)
    return data

# Apply the function to df, train, and test datasets
df = clean_accident_column(df)


  data['accident'] = data['accident'].replace({'At least 1 accident or damage reported':1,"None reported":0})


## converting mileage to int and Extracting information from mileage
such as mileage square to get rid of skewness in histogram. Grouping the mileage with brand, age of vehicle and mileage per year to get intresting features.


In [16]:
# Function to remove ',' and convert the string to int
def milage_to_int(x):
    # Remove commas and extract numeric digits
    # First, ensure that x is a string before applying replacements
    if isinstance(x, str):
        x = x.replace(',', '')
    
    # Extract numeric part and convert to integer
    return int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else None

df['milage']=df['milage'].apply(milage_to_int)


#df[['milage']] = df['milage'].str.replace(',','').str.extract(r'(\d+)').astype('float')

In [17]:
def extract_milage_features(data):
    # Conditional calculation to handle zero years of usage
    data['mileage_per_year'] = np.where(
        data['years_used'] == 0,
        data['milage'],  # Use total mileage if Vehicle_Age is zero
        data['milage'] / data['years_used']  # Regular calculation otherwise
    )
    data['mileage_sqrt'] = np.sqrt(data['milage'])
    data['mileage_with_age'] =  data.groupby('years_used')['milage'].transform('mean')
    data['mileage_per_year_with_age'] =  data.groupby('years_used')['mileage_per_year'].transform('mean')
    data['mileage_brand'] =  data.groupby('brand')['milage'].transform('mean')
    data['mileage']= data['milage']
    return data

df = extract_milage_features(df)
df = df.drop(['milage'], axis=1)


## Extracting speed information and type of transmission from the transmission column

In [18]:
def extract_speed(data):
    # Expression to match speed values in the 'transmission' column
    pattern = r'(?i)(\d+)-?\s?(?:speed|spd)?'
    
    # Extracting speed values using str.extract
    data['speed'] = data['transmission'].str.extract(pattern)
    
    # Convert extracted speeds to numeric, handling NaNs
    data['speed'] = pd.to_numeric(data['speed'], errors='coerce')

# Apply the function to df, train, and test datasets
extract_speed(df)


In [19]:
df['transmission'] = df['transmission'].str.strip().str.lower()


transmission_mapping = {

    '6-speed a/t': 'Automatic',
    '8-speed automatic': 'Automatic',
    'automatic': 'Automatic',
    '7-speed a/t': 'Automatic',
    'a/t': 'Automatic',
    '8-speed a/t': 'Automatic',
    'transmission w/dual shift mode': 'Automatic',
    '9-speed automatic': 'Automatic',
    '10-speed automatic': 'Automatic',
    '1-speed automatic': 'Automatic',
    '1-speed a/t': 'Automatic',
    '2-speed a/t': 'Automatic',
    '2-speed automatic': 'Automatic',
    '4-speed a/t': 'Automatic',
    '5-speed automatic': 'Automatic',
    '4-speed automatic': 'Automatic',
    '6-speed automatic': 'Automatic', 
    '9-speed a/t': 'Automatic',        
    '10-speed a/t': 'Automatic',      
    '7-speed automatic': 'Automatic',  
    '6-speed electronically controlled automatic with o': 'Automatic',
    'single-speed fixed gear': 'Automatic',
    '7-speed dct automatic': 'Automatic',
    '10-speed automatic with overdrive': 'Automatic',
    'automatic, 9-spd 9g-tronic': 'Automatic',
    'automatic, 8-spd': 'Automatic',
    'automatic, 8-spd sport w/sport & manual modes': 'Automatic',
    'automatic, 8-spd pdk dual-clutch': 'Automatic',
    'automatic, 8-spd m steptronic w/drivelogic, sport & manual modes': 'Automatic',
    'automatic, 8-spd dual-clutch': 'Automatic',
    'transmission overdrive switch': 'Automatic',
    'auto, 6-spd w/cmdshft': 'Automatic',
    'automatic, 10-spd': 'Automatic',
    'automatic, 7-spd s tronic dual-clutch': 'Automatic',
    
 
    '7-speed automatic with auto-shift': 'Tiptronic',
    '6-speed automatic with auto-shift': 'Tiptronic',
    '9-speed automatic with auto-shift': 'Tiptronic',
    '8-speed automatic with auto-shift': 'Tiptronic',
    '5-speed a/t': 'Tiptronic',
    '7-speed a/t tiptronic': 'Tiptronic',  
    '8-speed at': 'Tiptronic',
    '8-speed a/t': 'Tiptronic',
    
 
    '6-speed m/t': 'Manual',
    '7-speed m/t': 'Manual',
    '6-speed manual': 'Manual',
    '5-speed m/t': 'Manual',
    'manual': 'Manual',
    '7-speed manual': 'Manual',
    '8-speed manual': 'Manual',
    'm/t': 'Manual',
    '6 speed at/mt': 'Manual',
    '6 speed mt': 'Manual',
    'manual, 6-spd': 'Manual',

 
    'automatic cvt': 'Variator',
    'cvt transmission': 'Variator',
    'cvt-f': 'Variator',
    
  
    'variable': 'Variator',
    'f': 'Other',                        
    '7-speed': 'Other',                 
    '6-speed': 'Other',                  
    '2': 'Other',                       
    '–': 'Other',
    'scheduled for or in production': 'Other'
}


df['transmission'] = df['transmission'].replace(transmission_mapping)


## simplifying the external colour and internal colour column of data by extracting a particular set of colours and leaving rest as 'others'.

In [20]:
color=df['ext_col'].value_counts(normalize=True) * 100
color.head(20)

ext_col
Black                                 22.574208
White                                 20.354203
Gray                                  12.372163
Silver                                 9.329010
Blue                                   8.705413
Red                                    6.510352
Green                                  1.845847
Brown                                  1.072587
Gold                                   1.047643
Beige                                  0.947867
Orange                                 0.897980
Yellow                                 0.723372
–                                      0.374158
Summit White                           0.349214
Bright White Clearcoat                 0.349214
Diamond Black                          0.349214
Purple                                 0.299327
Alpine White                           0.199551
Granite Crystal Clearcoat Metallic     0.174607
Silver Ice Metallic                    0.174607
Name: proportion, dtype: float64

In [21]:
def extract_color(spec):
   #expression to match color names
   #value = re.search(r'(?i)(Black|White|Gray|Silver|Blue|Red|Green|Gold|Brown|Orange|Beige|Yellow|Ebony)', spec)
    value = re.search(r'(?i)(Black|White|Gray|Silver|Blue|Red|Green)', spec)
    return value.group(0).lower() if value else 'other'

df['ext_col'] = df['ext_col'].apply(extract_color)
df['int_col'] = df['int_col'].apply(extract_color)


## Extracting features from brand column by categorizing it as luxury or economic brands. 

In [22]:
def extract_brand_features(data):
    # Define luxury and economy brands in lower case for case-insensitive matching
    luxury_brands = [brand.lower() for brand in [
    'Bugatti', 'Lamborghini', 'Rolls-Royce', 'Bentley', 'McLaren', 'Ferrari', 'Aston',
    'Rivian', 'Porsche', 'Maybach', 'Maserati', 'Lucid', 'Tesla', 'Mercedes-Benz', 'Audi', 
    'BMW', 'Acura', 'Lexus', 'Jaguar', 'Land', 'Volvo', 'Ferrari', 'Lincoln', 'Infiniti', 
    'Genesis', 'Cadillac' ,  'Lotus' , 'Karma' , 'Polestar', 'Alfa Romeo', 'Alfa'
    ]]

    economy_brands = [brand.lower() for brand in [
    'smart', 'Saturn', 'MINI', 'Hummer', 'Scion', 'Toyota', 'Honda', 'Hyundai', 'Ford', 'Chevrolet',
    'Kia', 'Nissan', 'Subaru', 'Volkswagen', 'Mazda', 'Dodge', 'Fiat', 'Mitsubishi', 'Suzuki', 'Renault', 
    'Jeep','GMC','RAM','Buick','Chrysler','Pontiac','Saturn','Mercury','Saab', 'Plymouth'
    ]]

    # Function to categorize brands
    def categorize_brand(brand):
        brand_lower = brand.lower()
        if brand_lower in luxury_brands:
            return 'luxury'
        elif brand_lower in economy_brands:
            return 'economy'
        else:
            return 'other'

    # Apply the function to the 'brand' column
    data['brand_category'] = data['brand'].apply(categorize_brand)

    return data

df = extract_brand_features(df)


# statistical information of data

In [23]:
print('Statistical Information of Original Set:\n')
df.describe(include='all').T

Statistical Information of Original Set:



Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
brand,4009.0,57.0,Ford,386.0,,,,,,,
model,4009.0,1898.0,M3 Base,30.0,,,,,,,
model_year,4009.0,,,,2015.51559,6.104816,1974.0,2012.0,2017.0,2020.0,2024.0
fuel_type,3962.0,5.0,Gasoline,3309.0,,,,,,,
engine,4009.0,1146.0,2.0L I4 16V GDI DOHC Turbo,52.0,,,,,,,
transmission,4009.0,5.0,Automatic,2986.0,,,,,,,
ext_col,4009.0,8.0,black,1015.0,,,,,,,
int_col,4009.0,8.0,black,2147.0,,,,,,,
accident,4009.0,,,,0.245947,0.430701,0.0,0.0,0.0,0.0,1.0
clean_title,4009.0,,,,0.851334,0.355803,0.0,1.0,1.0,1.0,1.0


## removing duplicates

In [24]:
# Remove duplicate rows based on all columns
df.drop_duplicates(keep='first', inplace=True)

# Analysing null values

In [25]:
# Let see NUll values
def missing(data):
    missing = data.isnull().sum().reset_index()
    missing.columns = ['Features','Missing_Count']
    missing['Missing%'] = missing['Missing_Count']/data.shape[0]*100
    
    return missing[missing['Missing_Count']>0]
print('original dataset')
print(missing(df))

original dataset
          Features  Missing_Count   Missing%
3        fuel_type             47   1.172362
12        capacity            217   5.412821
13      horsepower            808  20.154652
14     engine_type           2279  56.847094
15  cylinder_count            440  10.975306
16  fuel_injection           3471  86.580195
23           speed           1833  45.722125


In [26]:
correlation_matrix = df[['horsepower', 'cylinder_count', 'capacity']].corr()
correlation_matrix

Unnamed: 0,horsepower,cylinder_count,capacity
horsepower,1.0,0.50295,0.638429
cylinder_count,0.50295,1.0,0.72794
capacity,0.638429,0.72794,1.0


In [27]:
# Extract continuous features: numeric columns with more than 35 unique values
continuous_feat_df = [feat for feat in df.select_dtypes(exclude='O') if df[feat].nunique() > 35]

# Extract discrete features: numeric columns with 35 or fewer unique values
discrete_feat_df = [feat for feat in df.select_dtypes(exclude='O') if df[feat].nunique() <= 35]

# Extract categorical features: non-numeric columns (object/string types)
categorical_feat_df = [feat for feat in df.select_dtypes(include='O')]

# Print the lists of features
print('Continuous Features:', continuous_feat_df)
print('Discrete Features:', discrete_feat_df)
print('Categorical Features:', categorical_feat_df)

Continuous Features: ['price', 'capacity', 'horsepower', 'mileage_per_year', 'mileage_sqrt', 'mileage_brand', 'mileage']
Discrete Features: ['model_year', 'accident', 'clean_title', 'years_used', 'cylinder_count', 'mileage_with_age', 'mileage_per_year_with_age', 'speed']
Categorical Features: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'engine_type', 'fuel_injection', 'brand_category']


In [28]:
df.columns

Index(['brand', 'model', 'model_year', 'fuel_type', 'engine', 'transmission',
       'ext_col', 'int_col', 'accident', 'clean_title', 'price', 'years_used',
       'capacity', 'horsepower', 'engine_type', 'cylinder_count',
       'fuel_injection', 'mileage_per_year', 'mileage_sqrt',
       'mileage_with_age', 'mileage_per_year_with_age', 'mileage_brand',
       'mileage', 'speed', 'brand_category'],
      dtype='object')

In [29]:
# researched most frequent data points from web, this can be excluded if you want

df.loc[df['fuel_type'] == 'Electric', ['speed', 'fuel_injection', 'engine_type', 'cylinder_count']] = [1.0,0,0,0]
df.loc[(df['fuel_type'] == 'Electric') & (df['brand'] == 'Porsche'), ['speed']] = 2.0
df.loc[df['model'] == '911 Carrera Cabriolet', ['fuel_type']] = 'Gasoline'
df.loc[df['model'] == 'Civic EX', ['fuel_type']] = 'Gasoline'
df.loc[(df['brand'] == 'Chrysler') & (df['model'] == 'Pacifica Touring') & (df['model_year'] == 2017), 
       ['fuel_type', 'capacity', 'horsepower', 'engine_type', 'cylinder_count', 'speed','fuel_injection']] = ['Gasoline', 3.6, 287.0, 'V', 6.0, 9.0,'MPFI']
df.loc[(df['brand'] == 'Cadillac') & (df['model'] == 'DeVille Base') & (df['model_year'] == 1994),	 
       ['fuel_type', 'capacity', 'horsepower', 'engine_type', 'cylinder_count', 'speed','fuel_injection']] = ['Gasoline', 4.9, 200.0, 'V', 8.0, 4.0,'SFI']
df.loc[(df['brand'] == 'Dodge') & (df['model'] == 'Challenger R/T') & (df['model_year'] == 2010) & (df['transmission']== 'Automatic'),	 
       ['fuel_type', 'capacity', 'horsepower', 'engine_type', 'cylinder_count', 'speed','fuel_injection']] = ['Gasoline', 5.7, 372.0, 'V', 8.0, 5.0,'MPFI']
df.loc[(df['brand'] == 'Dodge') & (df['model'] == 'Challenger R/T') & (df['transmission']== 'Tiptronic'),	 
       ['engine_type','fuel_injection']] = [ 'V','MPFI']
df.loc[(df['brand'] == 'Toyota') & (df['model'] == 'Land Cruiser Base'),['fuel_injection','engine_type']] = ['EFI','I']
df.loc[(df['brand'] == 'Toyota') & (df['model'] == 'Land Cruiser Base') & (df['model_year'] < 2000) ,['horsepower','fuel_type','cylinder_count']] = [212.0,'Gasoline',6.0]
df.loc[(df['brand'] == 'Toyota') & (df['model'] == 'Land Cruiser Base') & (df['model_year'] <= 2000) ,['speed']] = 4.0
df.loc[(df['brand'] == 'Toyota') & (df['model'] == 'Land Cruiser Base') & (df['model_year'] > 2000) ,['speed']] = 6.0
df.loc[(df['brand'] == 'Toyota') & (df['model'] == 'Land Cruiser Base') & (df['model_year'] > 2010 ) ,['speed']] = 8.0
df.loc[(df['brand'] == 'RAM') & (df['model'] == '1500 Laramie') ,['horsepower']] = 395.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2020) & (df['capacity'] == 3.5) ,['horsepower']] = 375.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2023) & (df['capacity'] == 3.5) ,['horsepower']] = 400.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2022) & (df['capacity'] == 5.0) ,['horsepower']] = 400.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2021) & (df['capacity'] == 2.7) ,['horsepower']] = 325.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2021) & (df['capacity'] == 3.5) ,['horsepower']] = 400.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2021) & (df['capacity'] == 5.0) ,['horsepower']] = 400.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'F-150 XLT') & (df['model_year'] == 2019) & (df['capacity'] == 3.5) ,['horsepower']] = 375.0
df.loc[(df['brand'] == 'Audi') & (df['model'] == 'Q5 S line Premium Plus') ,['horsepower']] = 261.0
df.loc[(df['brand'] == 'Ford') & (df['model'] == 'Explorer XLT') ,['horsepower']] = 300.0
df.loc[(df['brand'] == 'Kia') & (df['model'] == 'Telluride SX') ,['horsepower']] = 291.0
df.loc[(df['brand'] == 'RAM') & (df['model'] == '1500 Big Horn')  & (df['capacity'] == 5.7) ,['horsepower']] = 395.0
df.loc[(df['brand'] == 'RAM') & (df['model'] == '1500 Big Horn') & (df['capacity'] == 3.6) ,['horsepower']] = 305.0
df.loc[(df['brand'] == 'Jeep') & (df['model'] == 'Wrangler Sport')  & (df['capacity'] == 2.0) ,['horsepower']] = 270.0
df.loc[(df['brand'] == 'Jeep') & (df['model'] == 'Wrangler Sport') & (df['capacity'] == 3.8) ,['horsepower']] = 202.0
df.loc[(df['brand'] == 'Jeep') & (df['model'] == 'Grand Cherokee L Limited'),['horsepower']] = 290.0
df.loc[(df['brand'] == 'Mercedes-Benz') & (df['model'] == 'GLC 300 GLC 300'),['horsepower']] = 255.0
df.loc[(df['brand'] == 'Honda') & (df['model'] == 'Civic Sport'),['horsepower']] = 158.0
df.loc[(df['brand'] == 'Buick') & (df['model'] == 'Envision Essence'),['horsepower']] = 228.0
df.loc[(df['brand'] == 'BMW') & (df['model'] == 'X3 xDrive30i'),['horsepower']] = 248.0

In [30]:
df['log_price']= np.log(df['price']+1)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   brand                      4009 non-null   object 
 1   model                      4009 non-null   object 
 2   model_year                 4009 non-null   int64  
 3   fuel_type                  3974 non-null   object 
 4   engine                     4009 non-null   object 
 5   transmission               4009 non-null   object 
 6   ext_col                    4009 non-null   object 
 7   int_col                    4009 non-null   object 
 8   accident                   4009 non-null   int64  
 9   clean_title                4009 non-null   int64  
 10  price                      4009 non-null   float64
 11  years_used                 4009 non-null   int64  
 12  capacity                   3799 non-null   float64
 13  horsepower                 3290 non-null   float

In [32]:
df['capacity'].fillna(value= df['capacity'].mean(),inplace=True)
df['horsepower'].fillna(value= df['horsepower'].mean(),inplace=True)
df['cylinder_count'].fillna(value= df['cylinder_count'].mean(),inplace=True)
df['fuel_type'].fillna(value= 'Gasoline',inplace=True)

df['power_to_weight_ratio'] = df['horsepower'] / df['capacity']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['capacity'].fillna(value= df['capacity'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(value= df['horsepower'].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermedia

In [33]:
df['brand_frequency'] = df['brand'].map(df['brand'].value_counts())
df['model_frequency'] = df['model'].map(df['model'].value_counts())

In [34]:

# Example DataFrame

data = pd.DataFrame(df)

# Export the DataFrame to a CSV file
data.to_csv('output.csv', index=False)