In [None]:
import os
import pandas as pd

# Step 1: Read data from ASCII files
folder_path = "/Users/soumilhooda/Desktop/WD/Data-WD/Weather/ALL-VALID"
data = {}

# Only rainfall and max/min temperature grids are loaded; every other file is ignored.
for file_name in os.listdir(folder_path):
    if not file_name.startswith(("rainnn", "maxtmp", "mintmp")):
        continue

    # NOTE(review): the year is taken from a fixed slice, which assumes the name ends
    # with a four-digit year followed by exactly six characters - confirm against the files.
    year = file_name[-10:-6]
    data_type = file_name[:6]

    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True.
    df = pd.read_csv(os.path.join(folder_path, file_name), sep=r"\s+", header=None)

    # The first two columns are Latitude and Longitude; each remaining column is one day.
    num_days = df.shape[1] - 2
    date_range = pd.date_range(start=f'{year}-01-01', periods=num_days)
    date_columns = date_range.strftime('%Y-%m-%d').tolist()
    df.columns = ['Latitude', 'Longitude'] + date_columns

    # Frames are keyed by (file prefix, year string), e.g. ('rainnn', '1951').
    data[(data_type, year)] = df
        
# Step 2: Calculate average temperature
# The daily mean is the midpoint of the max and min grids; the two coordinate
# columns are copied from the max-temperature frame.
for year in range(1951, 2024):
    year_key = str(year)
    maxtmp = data[('maxtmp', year_key)]
    mintmp = data[('mintmp', year_key)]
    avg_tmp = maxtmp.iloc[:, 2:].add(mintmp.iloc[:, 2:]).div(2)
    coordinates = maxtmp.iloc[:, :2]
    data[('avgtmp', year_key)] = pd.concat([coordinates, avg_tmp], axis=1)


# Step 3: Create dictionary with (latitude, longitude) pairs and rainfall/temperature data
# Yearly pieces are gathered per grid point and concatenated once at the end. This
# avoids re-scanning both grids for every point and avoids growing each frame with a
# concat per year that starts from an empty frame.
yearly_frames = {}

for year in range(1951, 2024):
    rain_year = data[('rainnn', str(year))]
    temp_year = data[('avgtmp', str(year))]
    date_index = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31')

    # One pass over the temperature grid gives a coordinate -> daily values lookup.
    # NOTE(review): assumes each coordinate pair appears at most once per file.
    temp_by_point = dict(zip(
        zip(temp_year['Latitude'], temp_year['Longitude']),
        temp_year.iloc[:, 2:].to_numpy(),
    ))

    rain_points = zip(rain_year['Latitude'], rain_year['Longitude'])
    for (lat, lon), rain in zip(rain_points, rain_year.iloc[:, 2:].to_numpy()):
        temp = temp_by_point.get((lat, lon))
        if temp is None:
            # No temperature row for this rainfall point in this year: the point ends
            # up with fewer rows than the full period.
            continue
        df = pd.DataFrame({'Rainfall': rain, 'Temperature': temp}, index=date_index)
        yearly_frames.setdefault((lat, lon), []).append(df)

# Maps (lat, lon) -> one frame of daily Rainfall/Temperature indexed by date.
result = {point: pd.concat(frames) for point, frames in yearly_frames.items()}

# Filter the result dictionary to keep only key-value pairs with 26663 rows
EXPECTED_ROWS = 26663  # number of days from 1951-01-01 through 2023-12-31 inclusive
filtered_result = {}
for key, value in result.items():
    if len(value) == EXPECTED_ROWS:
        filtered_result[key] = value

# Load GeoLocations.csv
geo_df = pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/GeoLocations/GeoLocations.csv")

# Resolve every coordinate pair to the first state listed for it, once, so each grid
# point below costs a dictionary lookup instead of a scan of the whole geo table.
state_by_point = {}
for geo_lat, geo_lon, geo_state in zip(geo_df['Latitude'], geo_df['Longitude'], geo_df['State Name']):
    state_by_point.setdefault((geo_lat, geo_lon), geo_state)

# From here on `data` maps (lat, lon, state name) -> daily frame; it no longer holds
# the raw yearly grids.
data = {}

for key, df in filtered_result.items():
    lat, lon = key
    if key in state_by_point:
        state_name = state_by_point[key]
        # Check if the state name is not 'MAHARASHTRA'
        if state_name != 'MAHARASHTRA':
            new_key = (lat, lon, state_name)
            data[new_key] = df
        else:
            print(f"Latitude {lat} and longitude {lon} corresponds to the state of Maharashtra, skipping...")
    else:
        print(f"No state found for latitude {lat} and longitude {lon}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Daily calendar for 2018-2023; one column per state is added to this frame later.
temperature = pd.DataFrame(
    {'Date': pd.date_range(start='2018-01-01', end='2023-12-31')}
)


def mean_and_variance(filtered_result, start_date=None, end_date=None):
    """Write each state's mean daily temperature into the global ``temperature`` frame.

    Despite the name, only the mean is computed; no variance is produced.

    Args:
        filtered_result: dict keyed by ``(lat, lon, state_name)`` whose values are
            date-indexed frames with a ``'Temperature'`` column.
        start_date: optional lower bound (inclusive) for the date slice.
        end_date: optional upper bound (inclusive) for the date slice.

    Returns:
        None. One column per state is added to the module-level ``temperature`` frame.
        Values are assigned by position, so the sliced frames must have as many rows
        as ``temperature``.
    """
    # Empty bounds are treated as "not given". A single bound is honoured on its own;
    # previously the filter was skipped unless both bounds were supplied.
    start = start_date or None
    end = end_date or None

    # Collect the temperature series of every grid point, grouped by state.
    state_series = {}
    for key, df in filtered_result.items():
        lat, lon, state_name = key
        if start is not None or end is not None:
            df = df.loc[start:end]
        state_series.setdefault(state_name, []).append(df['Temperature'])

    # Average across the grid points of a state, day by day.
    for state, series_list in state_series.items():
        temperature[state] = np.mean(series_list, axis=0)

# Example usage with custom start and end dates
start_date, end_date = '2018-01-01', '2023-12-31'
mean_and_variance(data, start_date, end_date)

# Both frames end up indexed by a datetime "Date" index.
# NOTE(review): the date format of Electricity.csv is not visible here - confirm that
# pd.to_datetime infers it correctly.
electricity = pd.read_csv('/Users/soumilhooda/Desktop/WD/Data-WD/Electricity/Electricity.csv')
electricity = electricity.assign(Date=pd.to_datetime(electricity["Date"])).set_index("Date")
temperature = temperature.assign(Date=pd.to_datetime(temperature["Date"])).set_index("Date")


In [None]:
import pandas as pd
import numpy as np

# Time-based rolling windows need a monotonic index, so both frames are sorted by date.
electricity = electricity.sort_index()
temperature = temperature.sort_index()

# States for which reference temperatures are computed.
states = [
    'PUNJAB', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH',
    'MADHYA PRADESH', 'BIHAR', 'UTTAR PRADESH', 'KARNATAKA',
    'TELANGANA', 'ANDHRA PRADESH', 'RAJASTHAN', 'ORISSA',
]

def find_tref_peak_correlation_HDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Return the temperature on the date where the rolling correlation is most negative.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series, sorted by date.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, and the first
            two-point windows produce correlations of exactly +/-1 that would always
            be picked as the extreme.

    Returns:
        The temperature at the date of the most negative correlation, or NaN when no
        window has enough observations.
    """
    # Rolling correlation between the two series over the requested window
    correlations = electricity_data.rolling(window=window, min_periods=min_periods).corr(temperature_data)

    valid = correlations.dropna()
    if valid.empty:
        return float('nan')

    # Base temperature: the temperature on the date of the strongest negative correlation
    tref_hdd = temperature_data.loc[valid.idxmin()]

    return tref_hdd

def find_tref_peak_correlation_CDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Return the temperature on the date where the rolling correlation is most positive.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series, sorted by date.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, and the first
            two-point windows produce correlations of exactly +/-1 that would always
            be picked as the extreme.

    Returns:
        The temperature at the date of the most positive correlation, or NaN when no
        window has enough observations.
    """
    # Rolling correlation between the two series over the requested window
    correlations = electricity_data.rolling(window=window, min_periods=min_periods).corr(temperature_data)

    valid = correlations.dropna()
    if valid.empty:
        return float('nan')

    # Base temperature: the temperature on the date of the strongest positive correlation
    tref_cdd = temperature_data.loc[valid.idxmax()]

    return tref_cdd

# Calculate Trefs
trefs = {}
for state in states:
    # '-' entries become NaN, the column is cast to float, and gaps are filled by
    # linear interpolation before the column is written back.
    cleaned = electricity[state].replace('-', np.nan).astype(float)
    electricity[state] = cleaned.interpolate(method='linear')

    demand, temps = electricity[state], temperature[state]
    trefs[f"{state}_HDD"] = find_tref_peak_correlation_HDD(state, demand, temps)
    trefs[f"{state}_CDD"] = find_tref_peak_correlation_CDD(state, demand, temps)


In [None]:
import pandas as pd
import numpy as np

# ... (Load your data, ensure indexes are sorted) ...

# States for which reference temperatures are computed.
states = [
    'PUNJAB', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH',
    'MADHYA PRADESH', 'BIHAR', 'UTTAR PRADESH', 'KARNATAKA',
    'TELANGANA', 'ANDHRA PRADESH', 'RAJASTHAN', 'ORISSA',
]

# Calendar months used for the HDD search (October through February).
WINTER_MONTHS = [1, 2, 10, 11, 12]

# Calendar months used for the CDD search (March through September).
MONSOON_MONTHS = [3, 4, 5, 6, 7, 8, 9]

# Half-width, in degrees Celsius, of the band averaged around the selected temperature.
TEMP_RANGE = 2

def find_tref_peak_correlation_HDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Estimate the HDD base temperature from the most negative winter correlation.

    The series are restricted to ``WINTER_MONTHS``, the date of the most negative
    rolling correlation is located, and the result is the mean of all temperatures
    (any month) within ``TEMP_RANGE`` degrees of the temperature on that date.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, so the first days of
            every season (whose look-back falls in the excluded months) would yield
            two-point correlations of exactly +/-1 and always win the search.

    Returns:
        The band-averaged temperature, or NaN when no window has enough observations.
    """
    # Filter data for winter months; sorting keeps the time-based window valid
    winter_data = electricity_data[electricity_data.index.month.isin(WINTER_MONTHS)].sort_index()
    winter_temp = temperature_data[temperature_data.index.month.isin(WINTER_MONTHS)].sort_index()

    correlations = winter_data.rolling(window=window, min_periods=min_periods).corr(winter_temp)
    valid = correlations.dropna()
    if valid.empty:
        return float('nan')
    avg_temp_neg_corr = temperature_data.loc[valid.idxmin()].mean()

    # Find base temp within +- TEMP_RANGE of avg_temp_neg_corr
    temp_lower = avg_temp_neg_corr - TEMP_RANGE
    temp_upper = avg_temp_neg_corr + TEMP_RANGE
    tref_hdd = temperature_data[temperature_data.between(temp_lower, temp_upper)].mean()

    return tref_hdd

def find_tref_peak_correlation_CDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Estimate the CDD base temperature from the most positive monsoon-month correlation.

    The series are restricted to ``MONSOON_MONTHS``, the date of the most positive
    rolling correlation is located, and the result is the mean of all temperatures
    (any month) within ``TEMP_RANGE`` degrees of the temperature on that date.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, so the first days of
            every season (whose look-back falls in the excluded months) would yield
            two-point correlations of exactly +/-1 and always win the search.

    Returns:
        The band-averaged temperature, or NaN when no window has enough observations.
    """
    # Filter data for monsoon months; sorting keeps the time-based window valid
    monsoon_data = electricity_data[electricity_data.index.month.isin(MONSOON_MONTHS)].sort_index()
    monsoon_temp = temperature_data[temperature_data.index.month.isin(MONSOON_MONTHS)].sort_index()

    correlations = monsoon_data.rolling(window=window, min_periods=min_periods).corr(monsoon_temp)
    valid = correlations.dropna()
    if valid.empty:
        return float('nan')
    avg_temp_pos_corr = temperature_data.loc[valid.idxmax()].mean()

    # Find base temp within +- TEMP_RANGE of avg_temp_pos_corr
    temp_lower = avg_temp_pos_corr - TEMP_RANGE
    temp_upper = avg_temp_pos_corr + TEMP_RANGE
    tref_cdd = temperature_data[temperature_data.between(temp_lower, temp_upper)].mean()

    return tref_cdd

# Calculate Trefs
trefs = {}
for state in states:
    # '-' entries become NaN, the column is cast to float, and gaps are filled by
    # linear interpolation before the column is written back.
    numeric_demand = electricity[state].replace('-', np.nan).astype(float)
    electricity[state] = numeric_demand.interpolate(method='linear')

    state_demand = electricity[state]
    state_temps = temperature[state]
    trefs[f"{state}_HDD"] = find_tref_peak_correlation_HDD(state, state_demand, state_temps)
    trefs[f"{state}_CDD"] = find_tref_peak_correlation_CDD(state, state_demand, state_temps)


In [None]:
import pandas as pd
import numpy as np

# States for which reference temperatures are computed.
states = [
    'PUNJAB', 'GUJARAT', 'HARYANA', 'HIMACHAL PRADESH',
    'MADHYA PRADESH', 'BIHAR', 'UTTAR PRADESH', 'KARNATAKA',
    'TELANGANA', 'ANDHRA PRADESH', 'RAJASTHAN', 'ORISSA',
]

# Calendar months used for the HDD search (October through February).
WINTER_MONTHS = [1, 2, 10, 11, 12]

# Calendar months used for the CDD search (March through September).
MONSOON_MONTHS = [3, 4, 5, 6, 7, 8, 9]

def find_tref_peak_correlation_HDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Estimate the HDD base temperature from the most negative winter correlation.

    The series are restricted to ``WINTER_MONTHS`` and the date of the most negative
    rolling correlation is located. The result is the mean of all temperatures (any
    month) between the temperature on that date and that temperature plus one
    standard deviation.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, so the first days of
            every season would yield two-point correlations of exactly +/-1 and
            always win the search.

    Returns:
        The band-averaged temperature, or NaN when no window has enough observations.
    """
    # Filter data for winter months and sort by index for the time-based window
    winter_data = electricity_data[electricity_data.index.month.isin(WINTER_MONTHS)].sort_index()
    winter_temp = temperature_data[temperature_data.index.month.isin(WINTER_MONTHS)].sort_index()

    correlations = winter_data.rolling(window=window, min_periods=min_periods).corr(winter_temp)
    valid = correlations.dropna()
    if valid.empty:
        return float('nan')
    avg_temp_neg_corr = temperature_data.loc[valid.idxmin()].mean()

    # The standard deviation of the single value selected above is always 0, which
    # collapsed the band to one point, so the spread of the winter temperatures is
    # used instead.
    # NOTE(review): confirm that the seasonal spread is the intended band width.
    std_dev = winter_temp.std()

    # Find base temp between the selected temperature and one standard deviation above it
    temp_lower = avg_temp_neg_corr
    temp_upper = avg_temp_neg_corr + std_dev
    tref_hdd = temperature_data[temperature_data.between(temp_lower, temp_upper)].mean()

    return tref_hdd

def find_tref_peak_correlation_CDD(state, electricity_data, temperature_data, window='90D', min_periods=30):
    """Estimate the CDD base temperature from the most positive monsoon-month correlation.

    The series are restricted to ``MONSOON_MONTHS`` and the date of the most positive
    rolling correlation is located. The result is the mean of all temperatures (any
    month) between that date's temperature minus one standard deviation and that
    temperature.

    Args:
        state: state name; not used in the calculation.
        electricity_data: date-indexed Series.
        temperature_data: date-indexed Series.
        window: rolling window passed to ``Series.rolling`` (default 90 days).
        min_periods: minimum observations a window needs before its correlation
            counts. A time-based window otherwise defaults to 1, so the first days of
            every season would yield two-point correlations of exactly +/-1 and
            always win the search.

    Returns:
        The band-averaged temperature, or NaN when no window has enough observations.
    """
    # Filter data for monsoon months and sort by index for the time-based window
    monsoon_data = electricity_data[electricity_data.index.month.isin(MONSOON_MONTHS)].sort_index()
    monsoon_temp = temperature_data[temperature_data.index.month.isin(MONSOON_MONTHS)].sort_index()

    correlations = monsoon_data.rolling(window=window, min_periods=min_periods).corr(monsoon_temp)
    valid = correlations.dropna()
    if valid.empty:
        return float('nan')
    avg_temp_pos_corr = temperature_data.loc[valid.idxmax()].mean()

    # The standard deviation of the single value selected above is always 0, which
    # collapsed the band to one point, so the spread of the monsoon-month temperatures
    # is used instead.
    # NOTE(review): confirm that the seasonal spread is the intended band width.
    std_dev = monsoon_temp.std()

    # Find base temp between one standard deviation below the selected temperature and it
    temp_lower = avg_temp_pos_corr - std_dev
    temp_upper = avg_temp_pos_corr
    tref_cdd = temperature_data[temperature_data.between(temp_lower, temp_upper)].mean()

    return tref_cdd

# Calculate Trefs
trefs = {}
for state in states:
    # '-' entries become NaN, the column is cast to float, and gaps are filled by
    # linear interpolation before the column is written back.
    electricity[state] = (
        electricity[state]
        .replace('-', np.nan)
        .astype(float)
        .interpolate(method='linear')
    )

    load_series = electricity[state]
    temp_series = temperature[state]
    trefs[f"{state}_HDD"] = find_tref_peak_correlation_HDD(state, load_series, temp_series)
    trefs[f"{state}_CDD"] = find_tref_peak_correlation_CDD(state, load_series, temp_series)


In [None]:
# Display the "<STATE>_HDD" / "<STATE>_CDD" reference temperatures from the most
# recently executed Tref cell.
trefs