In [3]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this may differ from the kernel's own interpreter — confirm if it matters.
!python -V

Python 3.13.5


In [67]:
import numpy as np
import pandas as pd
from numpy.linalg import inv

## Question 1. Pandas Version

In [5]:
# Display the version string of the imported pandas package
pd.__version__

'2.2.3'

In [9]:
# Fetching the data
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-09-29 13:28:58--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.1’


2025-09-29 13:28:58 (170 MB/s) - ‘car_fuel_efficiency.csv.1’ saved [874188/874188]



In [8]:
# Read the downloaded CSV file into a pandas DataFrame
df = pd.read_csv("car_fuel_efficiency.csv")

## Question 2. Records Count

In [13]:
# Each row of the DataFrame is one record, so the row count is the answer
records_count = len(df)

# Report the result
print(f"The number of records in the DataFrame is: {records_count}")

The number of records in the DataFrame is: 9704


## Question 3. Fuel Types

In [20]:
# Number of distinct fuel types (nunique() ignores missing values)
fuel_type_counts = df['fuel_type'].nunique()

# Names of the distinct fuel types; NaN is dropped so that the names agree
# with the count computed above
fuel_types = df['fuel_type'].dropna().unique()

# Build a readable list for any number of types ("A", "A and B", "A, B and C")
# instead of indexing exactly two entries, which fails with fewer than two
# types and silently omits names when there are more
fuel_type_names = [str(name) for name in fuel_types]
if len(fuel_type_names) > 1:
    fuel_types_text = ", ".join(fuel_type_names[:-1]) + " and " + fuel_type_names[-1]
else:
    fuel_types_text = "".join(fuel_type_names)

# Printing the count and the names
print(f"There are {fuel_type_counts} types of fuel and they are {fuel_types_text}")

There are 2 types of fuel and they are Gasoline and Diesel


## Question 4. Missing Values

In [52]:
# Per-column count of missing entries
missing_values = df.isna().sum()

# Keep only the columns in which at least one value is missing
columns_with_missing_data = missing_values[missing_values.gt(0)]
column_names = list(columns_with_missing_data.index)

# Report how many columns are affected, followed by a numbered list of them
print(f"Number of columns that have missing values are {columns_with_missing_data.size} and the columns are as follows:\n")

for position, name in enumerate(column_names, start=1):
    print(f"{position}.{name}")

Number of columns that have missing values are 4 and the columns are as follows:

1.num_cylinders
2.horsepower
3.acceleration
4.num_doors


## Question 5. Max Fuel Efficiency

In [66]:
# Highest fuel_efficiency_mpg among the rows whose origin is 'Asia'
is_asian = df['origin'] == 'Asia'
max_asia_fuel_efficiency = df.loc[is_asian, 'fuel_efficiency_mpg'].max()

# Report the value rounded to two decimals
print(f"Maximum fuel efficiency of asian cars is {max_asia_fuel_efficiency:.2f}")

Maximum fuel efficiency of asian cars is 23.76


## Question 6. Median Value of Horsepower

In [65]:
# 1. Median of the horsepower column before any imputation
initial_median_hp = df['horsepower'].median()

# 2. Most frequent horsepower value
# mode() may return several values on a tie; the first one is taken
mode_hp = df['horsepower'].mode().iloc[0]

# 3. Replace missing horsepower values with the mode
# fillna() returns a new Series, so the main DataFrame stays untouched for Q7
hp_imputed = df['horsepower'].fillna(mode_hp)

# 4. Median after imputation
new_median_hp = hp_imputed.median()

# 5. Describe how the median moved, if at all
if new_median_hp > initial_median_hp:
    changed = "Yes, it increased"
elif new_median_hp < initial_median_hp:
    changed = "Yes, it decreased"
else:
    changed = "No"

print(
    f"Initial Median Horsepower: {initial_median_hp}",
    f"Mode Horsepower used for imputation: {mode_hp}",
    f"New Median Horsepower after imputation: {new_median_hp}",
    f"Has the median changed? {changed}\n",
    sep="\n",
)

Initial Median Horsepower: 149.0
Mode Horsepower used for imputation: 152.0
New Median Horsepower after imputation: 152.0
Has the median changed? Yes, it increased



## Question 7. Sum of Weights

In [71]:
# Q7: compute w = (X^T X)^-1 X^T y on a small slice of the Asian cars and
# report the sum of its elements. `inv` comes from the notebook's import cell.

# 1. Select all the cars from Asia
df_asia = df[df['origin'] == 'Asia']

# 2-3. Keep only vehicle_weight and model_year, then the first 7 rows
df_subset = df_asia[['vehicle_weight', 'model_year']].head(7)

# 4. Get the underlying NumPy array, X
X = df_subset.to_numpy()
# y below has 7 entries, so fail early with a clear message if fewer than
# 7 Asian cars were found
assert X.shape == (7, 2), f"expected X to have shape (7, 2), got {X.shape}"

# 5. Compute matrix-matrix multiplication between the transpose of X and X (XTX)
XT = X.T      # shape (2, 7)
XTX = XT @ X  # shape (2, 2)

# 7. Create array y
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])  # shape (7,)

# 6, 8, 9. Invert XTX, multiply by XT and then by y, and sum the result.
# On failure w and sum_w are set to None (not a message string) so later
# code can test for them without mixing types.
try:
    XTX_inv = inv(XTX)
except np.linalg.LinAlgError:
    print("Error: Matrix XTX is singular and cannot be inverted.")
    XTX_inv = None
    w = None
    sum_w = None
    print("Not calculated due to singular matrix error")
else:
    # Evaluated left to right: (XTX_inv @ XT) @ y
    w = XTX_inv @ XT @ y
    sum_w = w.sum()

    print(f"The vector w (results of the matrix operation) is: {w}")
    print(f"Sum of all elements of w: {sum_w:.2f}")

The vector w (results of the matrix operation) is: [0.01386421 0.5049067 ]
Sum of all elements of w: 0.52
