In [4]:
import pandas as pd
import numpy as np

In [5]:
# Show the installed pandas version; the outputs recorded in this notebook were produced with it.
pd.__version__

'2.3.3'

### Getting the data

In [11]:
# Load the car fuel-efficiency CSV directly from its raw GitHub URL into `df`.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)

In [12]:
# Preview the first five rows to check the columns and spot missing values.
df.head(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [13]:
# Preview the last five rows as well.
df.tail(n=5)

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.52739,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551
9703,270,3.0,140.0,2908.043477,14.7,2005,Asia,Diesel,All-wheel drive,-1.0,14.884467


### Q2. Records count

In [14]:
# Each row is one record, so the first entry of the frame's shape is the record count.
total_records = df.shape[0]
print(total_records)

9704


### Q3. Fuel types

In [15]:
# Count the distinct non-null entries of the 'fuel_type' column.
num_fuel_types = len(df['fuel_type'].dropna().unique())
print(num_fuel_types)

2


### Q4. Missing values

In [16]:
# Number of missing cells in each column ...
missing_values_per_column = df.isna().sum()
# ... and how many columns have at least one missing cell.
columns_with_missing_values = missing_values_per_column.gt(0).sum()
print(columns_with_missing_values)

4


### Q5. Max fuel efficiency of cars from Asia

In [17]:
# Restrict to rows whose origin is 'Asia' and take the largest fuel_efficiency_mpg among them.
asian_cars_df = df.loc[df['origin'].eq('Asia')]
max_efficiency_asia = asian_cars_df.loc[:, 'fuel_efficiency_mpg'].max()
# int() drops everything past the second decimal, so the value is truncated, not rounded.
print(int(max_efficiency_asia * 100) / 100.0)

23.75


### Q6. Median value of horsepower

In [18]:
# Step 1: Median of the 'horsepower' column before any imputation.
initial_median_hp = df.loc[:, 'horsepower'].median()
print(initial_median_hp)

149.0


In [19]:
# Step 2: Most frequent 'horsepower' value. mode() returns a Series, so take its first entry.
mode_hp = df['horsepower'].mode().iloc[0]
print(mode_hp)

152.0


In [20]:
# Step 3: Keep the observed 'horsepower' values and substitute the mode wherever one is missing.
# The result goes into a new 'horsepower_imputed' column; 'horsepower' itself is left as is.
df['horsepower_imputed'] = df['horsepower'].where(df['horsepower'].notna(), mode_hp)

In [21]:
# Step 4: Median of the imputed column, for comparison with the initial median.
final_median_hp = df.loc[:, 'horsepower_imputed'].median()
print(final_median_hp)

152.0


### Has it changed?
##### Yes: after filling the missing values with the mode (152.0), the median horsepower increased from 149.0 to 152.0

### Q7. Sum of weights

In [22]:
# 1. Keep only the rows whose origin is 'Asia' and report how many there are
df_asia = df.query("origin == 'Asia'")
print(df_asia.shape[0])

3247


In [23]:
# 2. Narrow the Asian cars down to the vehicle_weight and model_year columns
df_select = df_asia.loc[:, ['vehicle_weight', 'model_year']]
print(df_select)

      vehicle_weight  model_year
8        2714.219310        2016
12       2783.868974        2010
14       3582.687368        2007
20       2231.808142        2011
21       2659.431451        2016
...              ...         ...
9688     3948.404625        2018
9692     3680.341381        2016
9693     2545.070139        2012
9698     3107.427820        2005
9703     2908.043477        2005

[3247 rows x 2 columns]


In [24]:
# 3. Take the first 7 rows by position
df_first_7 = df_select.iloc[:7]
print(df_first_7)

    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019


In [25]:
# 4. Convert the 7-row frame to a NumPy array named X and check its shape.
X = np.asarray(df_first_7)
print(X.shape)

(7, 2)


In [26]:
# 5. XTX is the matrix product of the transpose of X (X.T) with X.
# np.matmul is the function form of Python's @ matrix-multiplication operator.
XTX = np.matmul(X.T, X)
print(XTX)

[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]


In [27]:
# 6. Invert XTX.
# np.linalg.inv() computes the matrix inverse and raises LinAlgError if XTX is singular.
XTX_inv = np.linalg.inv(XTX)
print(XTX_inv)

[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]


In [28]:
# 7. Build the array y from the seven given values
#    [1100, 1300, 800, 900, 1000, 1100, 1200].
y = np.asarray([1100, 1300, 800, 900, 1000, 1100, 1200])
print(y)

[1100 1300  800  900 1000 1100 1200]


In [29]:
# 8. w = (XTX_inv @ X.T) @ y, evaluated in two steps: first the inverse of XTX
#    times the transpose of X, then that intermediate matrix times y.
intermediate_result = np.matmul(XTX_inv, X.T)
w = np.matmul(intermediate_result, y)
print(w)

[0.01386421 0.5049067 ]


In [30]:
# 9. Add up all the elements of w.
sum_w = np.sum(w)
# int() drops everything past the second decimal, so the value is truncated, not rounded.
print(int(sum_w * 100) / 100.0)

0.51
