In [2]:
import pandas as pd

# Read the dataset from "co2.csv" (path is relative to the working directory)
# into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="co2.csv")

In [3]:
# Quick inspection of the freshly loaded data.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare: wrapping it in print() would add a stray "None" line.
print(df.head())  # First few rows
df.info()  # Data types and non-null counts
print(df.describe())  # Summary statistics
print(df.isnull().sum())  # Count of missing values per column

    Make       Model Vehicle Class  Engine Size(L)  Cylinders Transmission  \
0  ACURA         ILX       COMPACT             2.0          4          AS5   
1  ACURA         ILX       COMPACT             2.4          4           M6   
2  ACURA  ILX HYBRID       COMPACT             1.5          4          AV7   
3  ACURA     MDX 4WD   SUV - SMALL             3.5          6          AS6   
4  ACURA     RDX AWD   SUV - SMALL             3.5          6          AS6   

  Fuel Type  Fuel Consumption City (L/100 km)  \
0         Z                               9.9   
1         Z                              11.2   
2         Z                               6.0   
3         Z                              12.7   
4         Z                              12.1   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
0                              6.7                               8.5   
1                              7.7                               9.6   
2                   

In [5]:
# Convert object columns to categorical
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns] = df[object_columns].astype('category')

# Confirm the new dtypes. info() prints its own report and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   Make                              7385 non-null   category
 1   Model                             7385 non-null   category
 2   Vehicle Class                     7385 non-null   category
 3   Engine Size(L)                    7385 non-null   float64 
 4   Cylinders                         7385 non-null   int64   
 5   Transmission                      7385 non-null   category
 6   Fuel Type                         7385 non-null   category
 7   Fuel Consumption City (L/100 km)  7385 non-null   float64 
 8   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64 
 9   Fuel Consumption Comb (L/100 km)  7385 non-null   float64 
 10  Fuel Consumption Comb (mpg)       7385 non-null   int64   
 11  CO2 Emissions(g/km)               7385 non-null   int64 

In [9]:
from sklearn.preprocessing import MinMaxScaler
# Initialize Min-Max Scaler
scaler = MinMaxScaler()

# Numeric columns to normalize
columns_to_normalize = [
    "Engine Size(L)", "Cylinders", "Fuel Consumption City (L/100 km)",
    "Fuel Consumption Hwy (L/100 km)", "Fuel Consumption Comb (L/100 km)",
    "Fuel Consumption Comb (mpg)", "CO2 Emissions(g/km)"
]

# Fit the scaler once on all listed columns. MinMaxScaler scales every feature
# independently, so the resulting values match a column-by-column fit, but the
# fitted `scaler` now retains the min/max of *every* column. Refitting the same
# scaler inside a per-column loop would keep only the last column's parameters,
# leaving the other columns impossible to transform or invert later.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed in a later cell, so test rows influence the scaling ranges.
# Consider fitting on the training split only -- confirm intended behaviour.
# Wrapping the result in a DataFrame assigns column by column, so the integer
# columns are replaced by float columns rather than written into in place.
df[columns_to_normalize] = pd.DataFrame(
    scaler.fit_transform(df[columns_to_normalize]),
    columns=columns_to_normalize,
    index=df.index,
)

In [10]:
# Show the first five rows to check the result of the normalization step.
print(df.head(n=5))

    Make       Model Vehicle Class  Engine Size(L)  Cylinders Transmission  \
0  ACURA         ILX       COMPACT        0.146667   0.076923          AS5   
1  ACURA         ILX       COMPACT        0.200000   0.076923           M6   
2  ACURA  ILX HYBRID       COMPACT        0.080000   0.076923          AV7   
3  ACURA     MDX 4WD   SUV - SMALL        0.346667   0.230769          AS6   
4  ACURA     RDX AWD   SUV - SMALL        0.346667   0.230769          AS6   

  Fuel Type  Fuel Consumption City (L/100 km)  \
0         Z                          0.215909   
1         Z                          0.265152   
2         Z                          0.068182   
3         Z                          0.321970   
4         Z                          0.299242   

   Fuel Consumption Hwy (L/100 km)  Fuel Consumption Comb (L/100 km)  \
0                         0.162651                          0.200000   
1                         0.222892                          0.250000   
2                   

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing and keep the remaining 80% for training;
# the fixed random_state makes the shuffle, and therefore the split, repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Write both partitions to CSV, omitting the row index from the files.
train_data.to_csv(path_or_buf="train_data.csv", index=False)
test_data.to_csv(path_or_buf="test_data.csv", index=False)