In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [3]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


### Q1. Pandas version

In [4]:
pd.__version__

'2.3.1'

### Q2. Records count

In [5]:
df.count()

engine_displacement    9704
num_cylinders          9222
horsepower             8996
vehicle_weight         9704
acceleration           8774
model_year             9704
origin                 9704
fuel_type              9704
drivetrain             9704
num_doors              9202
fuel_efficiency_mpg    9704
dtype: int64

## Q3. Fuel types

In [6]:
df['fuel_type'].nunique()

2

### Q4. Missing values

In [7]:
# Number of columns with at least one missing value
missing_cols = df.isnull().any().sum()

print(missing_cols)


4


### Q5. Max fuel efficiency

In [8]:
# Filter only cars from Asia and get the maximum mpg
max_efficiency = df.loc[df["origin"] == "Asia", "fuel_efficiency_mpg"].max()

print(max_efficiency)


23.759122836520497


In [9]:
# Alternate Way
# Max fuel efficiency per origin
df.groupby("origin")["fuel_efficiency_mpg"].max()


origin
Asia      23.759123
Europe    25.967222
USA       24.971452
Name: fuel_efficiency_mpg, dtype: float64

### Q6. Median value of horsepower

In [10]:
#Step 1: Median value of horsepower
median_hp_before = df["horsepower"].median()
print("Median before filling:", median_hp_before)


Median before filling: 149.0


In [11]:
#Step 2: Most frequent value (mode) of horsepower
most_freq_hp = df["horsepower"].mode()[0]
print("Most frequent value (mode):", most_freq_hp)


Most frequent value (mode): 152.0


In [12]:
#Step 3: Fill missing values with mode
df["horsepower"].fillna(most_freq_hp, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["horsepower"].fillna(most_freq_hp, inplace=True)


In [13]:
#Step 4: Median again after filling
median_hp_after = df["horsepower"].median()
print("Median after filling:", median_hp_after)


Median after filling: 152.0



### Q7. Sum of weights

In [18]:
import numpy as np

# 1) Filter Asia, 2) select columns, 3) first 7 rows, 4) get NumPy array X
X = df.loc[df["origin"] == "Asia", ["vehicle_weight", "model_year"]].head(7).to_numpy()

# 5) XTX = X.T @ X
XTX = X.T @ X

# 6) Invert XTX (fall back to pseudo-inverse if needed)
try:
    XTX_inv = np.linalg.inv(XTX)
except np.linalg.LinAlgError:
    XTX_inv = np.linalg.pinv(XTX)

# 7) y as given
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

# 8) w = (XTX)^(-1) @ X.T @ y
w = XTX_inv @ X.T @ y

# Sum of all elements in w
print("Sum of w:", float(w.sum()))


Sum of w: 0.5187709081074016
