In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'1.4.2'

In [3]:
data = {
    'Date': pd.date_range(start='2023-08-01', periods=10, freq='D'),
    'Flavor': ['Vanilla', 'Chocolate', 'Strawberry', 'Mint', 'Caramel', 'Cookies n Cream', 'Rocky Road', 'Mango', 'Pistachio', 'Blueberry'],
    'Units_Sold': [100, 120, 90, 105, 110, 85, 95, 115, 125, 105],
    'Profit': [200, 240, 180, 210, 220, 170, 190, 230, 250, 210]
}
bruams_df = pd.DataFrame(data)
bruams_df

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200
1,2023-08-02,Chocolate,120,240
2,2023-08-03,Strawberry,90,180
3,2023-08-04,Mint,105,210
4,2023-08-05,Caramel,110,220
5,2023-08-06,Cookies n Cream,85,170
6,2023-08-07,Rocky Road,95,190
7,2023-08-08,Mango,115,230
8,2023-08-09,Pistachio,125,250
9,2023-08-10,Blueberry,105,210


### Basic Interpolation:
#### You have some missing values in the Units_Sold column of the bruams_df DataFrame. How would you use linear interpolation to fill those gaps?

In [4]:
df_copy = bruams_df.copy()

In [5]:
df_copy.loc[1:3, ['Units_Sold']] = np.nan

In [6]:
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100.0,200
1,2023-08-02,Chocolate,,240
2,2023-08-03,Strawberry,,180
3,2023-08-04,Mint,,210
4,2023-08-05,Caramel,110.0,220
5,2023-08-06,Cookies n Cream,85.0,170
6,2023-08-07,Rocky Road,95.0,190
7,2023-08-08,Mango,115.0,230
8,2023-08-09,Pistachio,125.0,250
9,2023-08-10,Blueberry,105.0,210


In [7]:
bruams_df.loc[:, ['Units_Sold']].interpolate(method='linear')

Unnamed: 0,Units_Sold
0,100
1,120
2,90
3,105
4,110
5,85
6,95
7,115
8,125
9,105


In [8]:
bruams_df.loc[:, ['Units_Sold']].interpolate()

Unnamed: 0,Units_Sold
0,100
1,120
2,90
3,105
4,110
5,85
6,95
7,115
8,125
9,105


### Time-based Interpolation:
#### Assuming the Date column is set as the DataFrame index and you have missing values in the Profit column, how would you use time-based interpolation to estimate the missing profit values?

In [9]:
df_copy.loc[1:3, ['Profit']] = np.nan

In [10]:
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100.0,200.0
1,2023-08-02,Chocolate,,
2,2023-08-03,Strawberry,,
3,2023-08-04,Mint,,
4,2023-08-05,Caramel,110.0,220.0
5,2023-08-06,Cookies n Cream,85.0,170.0
6,2023-08-07,Rocky Road,95.0,190.0
7,2023-08-08,Mango,115.0,230.0
8,2023-08-09,Pistachio,125.0,250.0
9,2023-08-10,Blueberry,105.0,210.0


In [11]:
df_copy.set_index('Date', inplace=True)

In [12]:
df_copy.loc[:, ['Profit']].interpolate(method='time')

Unnamed: 0_level_0,Profit
Date,Unnamed: 1_level_1
2023-08-01,200.0
2023-08-02,205.0
2023-08-03,210.0
2023-08-04,215.0
2023-08-05,220.0
2023-08-06,170.0
2023-08-07,190.0
2023-08-08,230.0
2023-08-09,250.0
2023-08-10,210.0


### Limit Interpolation:
#### If you want to interpolate missing values in the Units_Sold column but only for a maximum of 2 consecutive missing entries, how would you achieve this?

In [13]:
df_copy

Unnamed: 0_level_0,Flavor,Units_Sold,Profit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-08-01,Vanilla,100.0,200.0
2023-08-02,Chocolate,,
2023-08-03,Strawberry,,
2023-08-04,Mint,,
2023-08-05,Caramel,110.0,220.0
2023-08-06,Cookies n Cream,85.0,170.0
2023-08-07,Rocky Road,95.0,190.0
2023-08-08,Mango,115.0,230.0
2023-08-09,Pistachio,125.0,250.0
2023-08-10,Blueberry,105.0,210.0


In [14]:
df_copy['Profit'].interpolate(limit=2)

Date
2023-08-01    200.0
2023-08-02    205.0
2023-08-03    210.0
2023-08-04      NaN
2023-08-05    220.0
2023-08-06    170.0
2023-08-07    190.0
2023-08-08    230.0
2023-08-09    250.0
2023-08-10    210.0
Name: Profit, dtype: float64

### Direction of Interpolation:
#### How can you perform interpolation in the Profit column but only in a backward direction?

In [15]:
df_copy.iloc[-1, 2] = np.nan # assign last column value to nan

In [16]:
df_copy

Unnamed: 0_level_0,Flavor,Units_Sold,Profit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-08-01,Vanilla,100.0,200.0
2023-08-02,Chocolate,,
2023-08-03,Strawberry,,
2023-08-04,Mint,,
2023-08-05,Caramel,110.0,220.0
2023-08-06,Cookies n Cream,85.0,170.0
2023-08-07,Rocky Road,95.0,190.0
2023-08-08,Mango,115.0,230.0
2023-08-09,Pistachio,125.0,250.0
2023-08-10,Blueberry,105.0,


In [17]:
df_copy['Profit'].interpolate(limit_direction='backward')

Date
2023-08-01    200.0
2023-08-02    205.0
2023-08-03    210.0
2023-08-04    215.0
2023-08-05    220.0
2023-08-06    170.0
2023-08-07    190.0
2023-08-08    230.0
2023-08-09    250.0
2023-08-10      NaN
Name: Profit, dtype: float64

### Polynomial Interpolation:
#### You decide that a polynomial interpolation of order 2 might fit the Units_Sold data better. How would you apply such an interpolation method to the column?

In [18]:
df_copy = bruams_df.copy()

In [19]:
df_copy.loc[1:3, 'Units_Sold'] = np.nan

In [20]:
df_copy.loc[:, 'Units_Sold']

0    100.0
1      NaN
2      NaN
3      NaN
4    110.0
5     85.0
6     95.0
7    115.0
8    125.0
9    105.0
Name: Units_Sold, dtype: float64

In [21]:
df_copy['Units_Sold'].interpolate(method='polynomial', order=2)

0    100.000000
1    123.648010
2    133.197346
3    128.648010
4    110.000000
5     85.000000
6     95.000000
7    115.000000
8    125.000000
9    105.000000
Name: Units_Sold, dtype: float64

### Handling Outliers with Interpolation:
#### Before interpolating the Units_Sold column, you want to replace any values above 150 as NaNs (considering them as outliers). How would you replace these outliers and then interpolate the resulting missing values?

In [22]:
df_copy.loc[:, 'Units_Sold'].interpolate(method='pad')

0    100.0
1    100.0
2    100.0
3    100.0
4    110.0
5     85.0
6     95.0
7    115.0
8    125.0
9    105.0
Name: Units_Sold, dtype: float64

### Different Methods:
#### The Profit column has missing values, and you're not sure which interpolation method to use. How would you interpolate using the 'nearest' method?

In [24]:
### first adding some missing values to test the "nearest" method

In [23]:
df_copy = bruams_df.copy()

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200
1,2023-08-02,Chocolate,120,240
2,2023-08-03,Strawberry,90,180
3,2023-08-04,Mint,105,210
4,2023-08-05,Caramel,110,220
5,2023-08-06,Cookies n Cream,85,170
6,2023-08-07,Rocky Road,95,190
7,2023-08-08,Mango,115,230
8,2023-08-09,Pistachio,125,250
9,2023-08-10,Blueberry,105,210


In [26]:
df_copy.loc[1:3, ['Profit']] = np.nan

In [27]:
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100.0,200.0
1,2023-08-02,Chocolate,,
2,2023-08-03,Strawberry,,
3,2023-08-04,Mint,,
4,2023-08-05,Caramel,110.0,220.0
5,2023-08-06,Cookies n Cream,85.0,170.0
6,2023-08-07,Rocky Road,95.0,190.0
7,2023-08-08,Mango,115.0,230.0
8,2023-08-09,Pistachio,125.0,250.0
9,2023-08-10,Blueberry,105.0,210.0


In [29]:
df_copy['Profit'] = df_copy['Profit'].interpolate(method='nearest')

In [30]:
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100.0,200.0
1,2023-08-02,Chocolate,,200.0
2,2023-08-03,Strawberry,,200.0
3,2023-08-04,Mint,,220.0
4,2023-08-05,Caramel,110.0,220.0
5,2023-08-06,Cookies n Cream,85.0,170.0
6,2023-08-07,Rocky Road,95.0,190.0
7,2023-08-08,Mango,115.0,230.0
8,2023-08-09,Pistachio,125.0,250.0
9,2023-08-10,Blueberry,105.0,210.0


### Interpolation with Axis:
#### If your DataFrame has rows as dates and columns representing different products, and you want to interpolate vertically (along columns), how would you specify the axis for interpolation?

In [None]:
# TBD

### Interpolate at Specific Points:
#### If you have a list of specific positions where you want to interpolate in the Units_Sold column, how would you achieve interpolation only at those points?

In [38]:
df_copy = bruams_df.copy()
df_copy.loc[1:3, ['Profit']] = np.nan
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200.0
1,2023-08-02,Chocolate,120,
2,2023-08-03,Strawberry,90,
3,2023-08-04,Mint,105,
4,2023-08-05,Caramel,110,220.0
5,2023-08-06,Cookies n Cream,85,170.0
6,2023-08-07,Rocky Road,95,190.0
7,2023-08-08,Mango,115,230.0
8,2023-08-09,Pistachio,125,250.0
9,2023-08-10,Blueberry,105,210.0


In [39]:
df_copy['Profit'] = df_copy['Profit'].interpolate(method='values')
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200.0
1,2023-08-02,Chocolate,120,205.0
2,2023-08-03,Strawberry,90,210.0
3,2023-08-04,Mint,105,215.0
4,2023-08-05,Caramel,110,220.0
5,2023-08-06,Cookies n Cream,85,170.0
6,2023-08-07,Rocky Road,95,190.0
7,2023-08-08,Mango,115,230.0
8,2023-08-09,Pistachio,125,250.0
9,2023-08-10,Blueberry,105,210.0


### Check Post Interpolation:
#### After interpolating the Profit column, how would you verify if there are still any missing values in that column?

In [45]:
df_copy = bruams_df.copy()
df_copy.loc[1:3, ['Profit']] = np.nan
df_copy.loc[9, 'Profit'] = np.nan
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200.0
1,2023-08-02,Chocolate,120,
2,2023-08-03,Strawberry,90,
3,2023-08-04,Mint,105,
4,2023-08-05,Caramel,110,220.0
5,2023-08-06,Cookies n Cream,85,170.0
6,2023-08-07,Rocky Road,95,190.0
7,2023-08-08,Mango,115,230.0
8,2023-08-09,Pistachio,125,250.0
9,2023-08-10,Blueberry,105,


In [46]:
df_copy['Profit'] = df_copy['Profit'].interpolate(method='nearest', limit_direction='backward')
df_copy

Unnamed: 0,Date,Flavor,Units_Sold,Profit
0,2023-08-01,Vanilla,100,200.0
1,2023-08-02,Chocolate,120,200.0
2,2023-08-03,Strawberry,90,200.0
3,2023-08-04,Mint,105,220.0
4,2023-08-05,Caramel,110,220.0
5,2023-08-06,Cookies n Cream,85,170.0
6,2023-08-07,Rocky Road,95,190.0
7,2023-08-08,Mango,115,230.0
8,2023-08-09,Pistachio,125,250.0
9,2023-08-10,Blueberry,105,


In [51]:
df_copy['Profit'].isna().any()

True