In [35]:
# import pandas as pd
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt
import os


In [85]:
d = pd.read_csv(r'C:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Bangalore_1990_2022_BangaloreCity.csv')

In [86]:
sum(d['tmax'].isna())

629

In [87]:
d['time'] = pd.to_datetime(d['time'],errors='raise')

  d['time'] = pd.to_datetime(d['time'],errors='raise')


In [88]:
d

Unnamed: 0,time,tavg,tmin,tmax,prcp
0,1990-01-01,22.9,19.1,28.4,
1,1990-02-01,21.7,,26.5,0.0
2,1990-03-01,21.0,16.4,26.5,0.0
3,1990-04-01,20.8,,27.4,0.0
4,1990-05-01,20.4,14.2,26.1,0.0
...,...,...,...,...,...
11889,2022-07-21,23.7,20.5,30.8,82.5
11890,2022-07-22,23.2,21.1,27.9,0.0
11891,2022-07-23,23.1,20.9,26.7,0.0
11892,2022-07-24,22.8,20.0,26.7,0.3


In [89]:
d.set_index('time', inplace=True)


In [79]:
def fill_missing_with_decomposition(df, col):
    # Interpolate missing values
    df_interpolated = df[col].interpolate(method='linear')
    
    # Apply seasonal decomposition
    decomposition = seasonal_decompose(df_interpolated, model='additive', period=365)
    seasonal = decomposition.seasonal
    trend = decomposition.trend
    residual = decomposition.resid

    # Some values in trend and residual can still be NaN, especially at the boundaries
    trend.interpolate(inplace=True)  # Interpolating the trend to handle NaNs at the boundaries
    residual.fillna(residual.mean(), inplace=True)  # Filling residuals' NaNs with their mean

    # Reconstruct the series with imputed values
    df[f'{col}_imputed'] = trend + seasonal + residual


# Assuming continuous blocks of missing data might be long and SARIMA is needed
def sarima_forecast_imputation(df,col):
    # Fill missing values with NaN placeholder for SARIMA to recognize as missing
    df[col].fillna(np.nan, inplace=True)
    
    # Fit SARIMA model
    # Note: The order and seasonal_order parameters might need to be adjusted based on your data
    model = SARIMAX(df[col], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12), 
                    enforce_stationarity=False, enforce_invertibility=False)
    results = model.fit()
    
    # Forecast the missing values
    forecast = results.predict(start=df.index[0], end=df.index[-1], dynamic=True)
    
    # Impute missing values with forecasted values
    df[f'{col}_imputed_sarima'] = df[col].fillna(forecast)

def interpolate_and_sarima(df, col_name):
    # Ensure the column is float for np.nan compatibility
    df[col_name] = df[col_name].astype(float)

    # Interpolate to fill small gaps directly
    df_interpolated = df.copy()
    df_interpolated[col_name] = df[col_name].interpolate(method='time')

    # Placeholder for imputed series, starting with interpolated values
    df_imputed = df_interpolated.copy()

    # Identify continuous NaN blocks
    missing = df[col_name].isna()
    blocks = missing.ne(missing.shift()).cumsum()[missing]

    for block in blocks.unique():
        start, end = blocks.index[blocks == block][[0, -1]]
        gap_size = (end - start).days

        # Decide to use SARIMA based on the gap size (e.g., use SARIMA for gaps larger than a threshold)
        if gap_size > 30:  # Example threshold: 30 days
            # Define the SARIMA modeling window
            window_start = df.index.get_loc(start) - 15  # 15 days before the gap
            window_end = df.index.get_loc(end) + 15  # 15 days after the gap

            # Ensure window limits are within bounds
            window_start = max(0, window_start)
            window_end = min(len(df) - 1, window_end)

            # Select the series around the gap for modeling
            train_series = df_imputed[col_name].iloc[window_start:window_end]

            # Fit SARIMA
            model = SARIMAX(train_series, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
                            enforce_stationarity=False, enforce_invertibility=False)
            model_fit = model.fit(disp=False)
            
            # Forecast the gap
            forecast_start = df.index.get_loc(start)
            forecast_end = df.index.get_loc(end)
            forecast = model_fit.predict(start=forecast_start, end=forecast_end)
            
            # Impute the forecasted values
            df_imputed[col_name].iloc[forecast_start:forecast_end+1] = forecast

    return df_imputed



In [90]:
def valid_sarima(train_series, order, seasonal_order):
    try:
        model = SARIMAX(train_series, order=order, seasonal_order=seasonal_order,
                        enforce_stationarity=False, enforce_invertibility=False)
        model_fit = model.fit(disp=False)
        # Check if the model has converged
        if not model_fit.mle_retvals['converged']:
            raise ValueError("SARIMA model failed to converge")
        # Check for extreme values in the fitted parameters
        if np.any(np.abs(model_fit.params) > 1e5):
            raise ValueError("SARIMA model has extreme parameter values")
        return model_fit
    except Exception as e:
        print(f"SARIMA model validation error: {e}")
        return None

# Function to robustly impute the data
def robust_interpolate_and_sarima(df, col_name):
    # Define bounds for temperature data
    lower_bound, upper_bound = -100, 60
    
    # Start with interpolation for small gaps
    df_interpolated = df.copy()
    df_interpolated[col_name] = df[col_name].interpolate(method='linear')
    
    # Identify remaining missing values
    missing = df_interpolated[col_name].isnull()
    missing_blocks = missing.ne(missing.shift()).cumsum()[missing]

    # Impute large gaps with SARIMA
    for block in missing_blocks.unique():
        start, end = missing_blocks.index[missing_blocks == block][[0, -1]]
        gap_size = (end - start).days
        
        if gap_size > 30:  # Use SARIMA for large gaps
            window_start = start - pd.Timedelta(days=30)
            window_end = end + pd.Timedelta(days=30)
            window_start, window_end = max(df.index[0], window_start), min(df.index[-1], window_end)
            train_series = df_interpolated[col_name][window_start:window_end]

            # Validate SARIMA model before applying
            model_fit = valid_sarima(train_series, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
            if model_fit:
                forecast = model_fit.predict(start=start, end=end)
                df_interpolated[col_name][start:end] = forecast
            else:
                print(f"Skipping SARIMA imputation for block starting at {start} due to validation failure.")
    
    # Post-imputation sanity check to ensure values are within plausible bounds
    df_interpolated[col_name] = np.clip(df_interpolated[col_name], lower_bound, upper_bound)
    
    return df_interpolated



In [91]:

# Assume 'df' is your DataFrame with a datetime index and 'temperature' is the column with missing values
df_imputed = robust_interpolate_and_sarima(d, 'tmax')


In [94]:
df_imputed['tmax'].isna().sum()

0

In [96]:
df_imputed[df_imputed['tmax'] <= 0].shape

(0, 4)

In [80]:

col = 'tmax'
# First, attempt to fill missing values using decomposition
# fill_missing_with_decomposition(d,col)

# Check if there are still missing values, and use SARIMA if necessary
# if d[f'{col}_imputed'].isnull().sum() > 0:
d = interpolate_and_sarima(d,col)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

In [82]:
sum(d['tmax'].isna())

0

In [83]:
d.to_csv('chk.csv')

In [34]:
print(d.shape,d.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11894 entries, 0 to 11893
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   time                 11894 non-null  datetime64[ns]
 1   tavg                 11824 non-null  float64       
 2   tmin                 10505 non-null  float64       
 3   tmax                 11265 non-null  float64       
 4   prcp                 7274 non-null   float64       
 5   tmax_imputed         11712 non-null  float64       
 6   tmax_imputed_sarima  11894 non-null  float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 650.6 KB
(11894, 7) None


In [98]:
for i in os.walk('./Temperature_And_Precipitation_Cities_IN'):
    print(i)

('./Temperature_And_Precipitation_Cities_IN', [], ['Bangalore_1990_2022_BangaloreCity.csv', 'Chennai_1990_2022_Madras.csv', 'Delhi_NCR_1990_2022_Safdarjung.csv', 'Lucknow_1990_2022.csv', 'Mumbai_1990_2022_Santacruz.csv', 'Rajasthan_1990_2022_Jodhpur.csv', 'Station_GeoLocation_Longitute_Latitude_Elevation_EPSG_4326.csv', 'weather_Bhubhneshwar_1990_2022.csv', 'weather_Rourkela_2021_2022.csv'])


In [99]:
path = list(os.walk('./Temperature_And_Precipitation_Cities_IN'))

In [100]:
cities = path[0][2]

In [102]:
for i in cities:
    path = os.path.join(os.getcwd(),'Temperature_And_Precipitation_Cities_IN',i)
    print(path)
    d = pd.read_csv(path)
    try:
        print(d.shape,'\n')
        print(d['tmax'].isna().sum(),'\n')
        # d['time'] = pd.to_datetime(d['time'],errors='raise')
        # d.set_index('date', inplace=True)

        col = 'tmax'
        # First, attempt to fill missing values using decomposition
        # df_imputed = robust_interpolate_and_sarima(d, 'tmax')

        # # Check if there are still missing values, and use SARIMA if necessary
        # if d[f'{col}_imputed'].isnull().sum() > 0:
        #     sarima_forecast_imputation(d,col)
        df_imputed = robust_interpolate_and_sarima(d, 'tmax')

        print(df_imputed['tmax'].isna().sum(),'\n')
        print(df_imputed[df_imputed['tmax'] <= 0].shape)
        
        # d['tmax'] = d['tmax_imputed_sarima']
        
        df_imputed.to_csv(f'imputed_{i}')
        print(f'DONE{i}')
        
    except Exception as e: 
        print(f'ERROR in {i}')
        print(e)
        
print('all done')


c:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Bangalore_1990_2022_BangaloreCity.csv
(11894, 5) 

629 

0 

(0, 5)
DONEBangalore_1990_2022_BangaloreCity.csv
c:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Chennai_1990_2022_Madras.csv
(11894, 5) 

1019 

0 

(0, 5)
DONEChennai_1990_2022_Madras.csv
c:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Delhi_NCR_1990_2022_Safdarjung.csv
(11894, 5) 

533 

0 

(0, 5)
DONEDelhi_NCR_1990_2022_Safdarjung.csv
c:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Lucknow_1990_2022.csv
(11894, 5) 

1553 

0 

(0, 5)
DONELucknow_1990_2022.csv
c:\Users\PETE\Desktop\ADMIN\CAPSTONE\MODEL\archive\Temperature_And_Precipitation_Cities_IN\Mumbai_1990_2022_Santacruz.csv
(11894, 5) 

1907 

ERROR in Mumbai_1990_2022_Santacruz.csv
'int' object has no attribute 'days'
c:\Users\PETE\Desktop\ADMIN\CAP