In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [None]:
# Load the fuel-consumption dataset that the cells below read through `df`.
df = pd.read_csv("datasets/fuel_consumption.csv")

# Report the size, then preview the first rows with the notebook's rich table
# display rather than printing the frame as plain text.
print("Rows, columns:", df.shape)
df.head()

In [None]:
# Regression task: predict CO2 emissions from engine size, cylinder count and
# combined fuel consumption.
x = df[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]]
y = df["CO2EMISSIONS"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the linear model on the training rows only.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict emissions for the held-out rows.
y_pred = model.predict(x_test)

# Held-out metrics. The superscript two is written as the escape "\u00b2" so
# the label prints as "R²" and cannot be corrupted by a file re-encoding.
print("Model Performance Metrics:")
print("R\u00b2 Score:", r2_score(y_test, y_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))

# Scatter the held-out predictions against the true values. A point lying on
# the dashed diagonal is a prediction that equals its actual value.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color="blue", label="Predicted vs Actual")

# The reference diagonal spans the min..max of the target `y`.
target_range = [y.min(), y.max()]
ax.plot(target_range, target_range, color="red", linestyle="--", label="Perfect Prediction")

ax.set_title("Actual vs Predicted CO2 Emissions")
ax.set_xlabel("Actual CO2 Emissions")
ax.set_ylabel("Predicted CO2 Emissions")
ax.legend()
ax.grid(linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Classification task: label a vehicle High (1) when its CO2 emissions exceed
# the dataset mean, otherwise Low (0). The label is stored as a new column.
df["HighEmission"] = (df["CO2EMISSIONS"] > df["CO2EMISSIONS"].mean()).astype(int)

# Same three predictors as the regression; the target is now the binary label.
x = df[["ENGINESIZE", "CYLINDERS", "FUELCONSUMPTION_COMB"]]
y = df["HighEmission"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# max_iter=2000 raises the solver's iteration limit above scikit-learn's
# default -- presumably so it converges on these unscaled features.
model = LogisticRegression(max_iter=2000)
model.fit(x_train, y_train)

# Predicted class (0 or 1) for each held-out row.
y_pred = model.predict(x_test)

# Headline metrics; precision, recall and F1 are computed for the positive
# class (High = 1), which is the scikit-learn default for binary targets.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Per-class breakdown. The report is a pre-formatted multi-line table, so the
# heading must end with a newline; otherwise the table's header row is pushed
# to the right and no longer lines up with its columns.
print("Classification Report:\n" + classification_report(y_test, y_pred))


# Confusion matrix of the classifier on the held-out rows.
# Both the labels and the predictions are 0/1, so a scatter of one against the
# other collapses onto at most four points; the count in each
# (actual, predicted) cell is what actually shows how the classifier behaves.
class_ids = [0, 1]
class_names = ["Low (0)", "High (1)"]

# Rows = actual class, columns = predicted class. reindex guarantees a full
# 2x2 grid even if one class never occurs in the test labels or predictions.
confusion = pd.crosstab(
    y_test.to_numpy(),
    y_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
).reindex(index=class_ids, columns=class_ids, fill_value=0)

fig, ax = plt.subplots(figsize=(6, 5))
image = ax.imshow(confusion.to_numpy(), cmap="Blues")
fig.colorbar(image, ax=ax)

# Write each count inside its cell; switch to white text on dark cells so the
# number stays legible.
half_max = confusion.to_numpy().max() / 2
for row in range(len(class_ids)):
    for col in range(len(class_ids)):
        count = confusion.iat[row, col]
        ax.text(
            col,
            row,
            str(count),
            ha="center",
            va="center",
            color="white" if count > half_max else "black",
        )

ax.set_xticks(range(len(class_ids)))
ax.set_xticklabels(class_names)
ax.set_yticks(range(len(class_ids)))
ax.set_yticklabels(class_names)
ax.set_title("Confusion Matrix: High vs Low CO2 Emission Class")
ax.set_xlabel("Predicted class")
ax.set_ylabel("Actual class")
plt.show()

In [None]:
# Preview the top of the dataset: its first ten rows.
PREVIEW_ROWS = 10
df.head(PREVIEW_ROWS)

In [None]:
# Distinct values found in the VEHICLECLASS column.
vehicle_classes = df["VEHICLECLASS"]
vehicle_classes.unique()

In [None]:
# Mean CO2 emissions for each fuel type.
co2_by_fuel_type = df.groupby("FUELTYPE")["CO2EMISSIONS"]
co2_by_fuel_type.mean()

In [None]:
# Keep only the rows whose model year is 2018.
is_model_year_2018 = df["MODELYEAR"] == 2018
df[is_model_year_2018]

In [None]:
# The five makes with the largest mean engine size, largest first.
mean_engine_size_by_make = df.groupby("MAKE")["ENGINESIZE"].mean()
mean_engine_size_by_make.nlargest(5)

In [None]:
# Number of cars for each cylinder count.
cylinders = df["CYLINDERS"]
cylinders.value_counts()

In [None]:
# Keep only the rows whose CO2EMISSIONS value is strictly above 300.
emits_over_300 = df["CO2EMISSIONS"] > 300
df[emits_over_300]

In [None]:
# The single row with the highest highway fuel consumption, reduced to the
# columns that identify the vehicle plus the consumption figure itself.
report_columns = ["MAKE", "MODEL", "VEHICLECLASS", "FUELCONSUMPTION_HWY"]
highest_highway_row = df.nlargest(1, "FUELCONSUMPTION_HWY")
highest_highway_row[report_columns]

In [None]:
# Mean city fuel consumption for every (make, model) pair.
city_consumption_by_model = df.groupby(["MAKE", "MODEL"])["FUELCONSUMPTION_CITY"]
city_consumption_by_model.mean()

In [None]:
# Correlation between engine size and CO2 emissions
# (Series.corr uses the Pearson coefficient by default).
engine_size = df["ENGINESIZE"]
co2_emissions = df["CO2EMISSIONS"]
engine_size.corr(co2_emissions)