### Handling Missing Values

In [2]:
import pandas as pd
## Build the raw price series as a date-indexed DataFrame.
## Note the calendar gaps: 01-03, 01-05, 01-06 and 01-11 are absent.
data = (
    pd.DataFrame(
        {
            'Date': [
                '2024-01-01', '2024-01-02', '2024-01-04', '2024-01-07',
                '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-12',
            ],
            'Price': [100, 101, 103, 106, 107, 108, 109, 111],
        }
    )
    # parse the date strings so the index below is a DatetimeIndex
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)
data

Unnamed: 0_level_0,Price
Date,Unnamed: 1_level_1
2024-01-01,100
2024-01-02,101
2024-01-04,103
2024-01-07,106
2024-01-08,107
2024-01-09,108
2024-01-10,109
2024-01-12,111


In [3]:
## Build the full daily calendar spanning the observed period
comple_index = pd.date_range('2024-01-01', '2024-01-12', freq='D')

# Align the frame to that calendar: days that were absent from the
# original index show up as rows with NaN in Price.
data = data.reindex(index=comple_index)

print(
    "Reindexed DataFrame with Missing  Dates",
    data,
)

Reindexed DataFrame with Missing  Dates             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03    NaN
2024-01-04  103.0
2024-01-05    NaN
2024-01-06    NaN
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11    NaN
2024-01-12  111.0


### Imputation

In [5]:
## Each strategy below builds a NEW frame from `data`; `data` itself is
## left untouched so the strategies can be compared side by side.

## mean imputation: every NaN becomes the column mean
mean_value = data.mean()
data_mean_imputed = data.fillna(mean_value)


## median imputation: every NaN becomes the column median
median_value = data.median()
data_median_imputed = data.fillna(median_value)


## mode imputation: every NaN becomes the column's first mode
# mode() returns one row per tied mode; row 0 carries the first mode of
# each column, so filling with that row is per-column, like mean/median
# above (a bare scalar would be written into every column).
mode_value = data.mode()
data_mode_imputed = data.fillna(mode_value.iloc[0])


## forward fill: carry the last observed value forward
# DataFrame.ffill()/bfill() replace fillna(method=...), which is deprecated.
data_ffill = data.ffill()


## backward fill: pull the next observed value backward
data_bfill = data.bfill()

print("Mean Imputed DataFrame", data_mean_imputed)
print("Median Imputed DataFrame", data_median_imputed)
print("Mode Imputed DataFrame", data_mode_imputed)
print("Forward Fill DataFrame", data_ffill)
print("Backward Fill DataFrame", data_bfill)

Mean Imputed DataFrame               Price
2024-01-01  100.000
2024-01-02  101.000
2024-01-03  105.625
2024-01-04  103.000
2024-01-05  105.625
2024-01-06  105.625
2024-01-07  106.000
2024-01-08  107.000
2024-01-09  108.000
2024-01-10  109.000
2024-01-11  105.625
2024-01-12  111.000
Median Imputed DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  106.5
2024-01-04  103.0
2024-01-05  106.5
2024-01-06  106.5
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  106.5
2024-01-12  111.0
Mode Imputed DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  100.0
2024-01-04  103.0
2024-01-05  100.0
2024-01-06  100.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  100.0
2024-01-12  111.0
Forward Fill DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  101.0
2024-01-04  103.0
2024-01-05  103.0
2024-01-06  103.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.

  data_ffill = data.fillna(method='ffill')
  data_bfill = data.fillna(method='bfill')


### Interpolation

In [6]:
## Fill the gaps in `data` by interpolation, three ways.
## Each call returns a new frame; `data` is not modified.

# linear interpolation
data_linear_interp = data.interpolate(method='linear')

# polynomial interpolation (degree 2)
data_poly_interp = data.interpolate(method='polynomial', order=2)

# spline interpolation (order 2)
data_spline_interp = data.interpolate(method='spline', order=2)


# Report the three results in the same order they were computed
for heading, frame in (
    ("Linear Interpolation DataFrame", data_linear_interp),
    ("Polynomial Interpolation DataFrame", data_poly_interp),
    ("Spline Interpolation DataFrame", data_spline_interp),
):
    print(heading, frame)

Linear Interpolation DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  105.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  110.0
2024-01-12  111.0
Polynomial Interpolation DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  105.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  110.0
2024-01-12  111.0
Spline Interpolation DataFrame             Price
2024-01-01  100.0
2024-01-02  101.0
2024-01-03  102.0
2024-01-04  103.0
2024-01-05  104.0
2024-01-06  105.0
2024-01-07  106.0
2024-01-08  107.0
2024-01-09  108.0
2024-01-10  109.0
2024-01-11  110.0
2024-01-12  111.0


### Predictive Modelling

In [7]:
## Add a fully observed Sales column (one value per calendar day) to use
## as the predictor for the missing Price values.
# NOTE(review): 65000 on 2024-01-11 is roughly 10x its neighbours and
# pushes that day's predicted Price to ~205 -- confirm whether 6500 was meant.
data['Sales'] = [
    700, 900, 1000, 2000,
    1500, 3000, 2300, 4000,
    5900, 6000, 65000, 6000,
]
data

Unnamed: 0,Price,Sales
2024-01-01,100.0,700
2024-01-02,101.0,900
2024-01-03,,1000
2024-01-04,103.0,2000
2024-01-05,,1500
2024-01-06,,3000
2024-01-07,106.0,2300
2024-01-08,107.0,4000
2024-01-09,108.0,5900
2024-01-10,109.0,6000


In [8]:
from sklearn.linear_model import LinearRegression


## Regression imputation: learn Price as a linear function of Sales on the
## rows where Price is observed, then predict Price for the rows where it
## is missing. `data` is updated in place.

# Compute the missing-value mask once and reuse it for both the split
# and the write-back, so the two always refer to the same rows.
missing_mask = data['Price'].isna()
known_data = data.dropna(subset=['Price'])
missing_data = data[missing_mask]


## train a model on known data
model = LinearRegression()
model.fit(known_data[['Sales']], known_data['Price'])


## predict the missing values
# Guard against an empty `missing_data`: after this cell has run once no
# NaN is left in Price, and predict() rejects a 0-row input, so without
# the guard the cell would fail when re-run.
if missing_mask.any():
    predicted_values = model.predict(missing_data[['Sales']])
    data.loc[missing_mask, 'Price'] = predicted_values
else:
    # nothing to impute; keep the name defined as an empty array
    predicted_values = missing_data['Price'].to_numpy()

print("Data with Predicted Values", data)


Data with Predicted Values                  Price  Sales
2024-01-01  100.000000    700
2024-01-02  101.000000    900
2024-01-03  101.621076   1000
2024-01-04  103.000000   2000
2024-01-05  102.429950   1500
2024-01-06  104.856570   3000
2024-01-07  106.000000   2300
2024-01-08  107.000000   4000
2024-01-09  108.000000   5900
2024-01-10  109.000000   6000
2024-01-11  205.156883  65000
2024-01-12  111.000000   6000


### `predicted_values` contains 4 values, one for each date whose `Price` was NaN

### end_here!