In [None]:
import os
import pandas as pd

# Step 1: Read the yearly gridded ASCII files.
# Every file whose name starts with rainnn/maxtmp/mintmp is parsed into a frame
# with 'Latitude', 'Longitude' and one column per day, stored under the key
# (data_type, year) with both parts kept as strings.
folder_path = "/Users/soumilhooda/Desktop/WD/Data-WD/Weather/ALL-VALID"
data = {}

for file_name in os.listdir(folder_path):
    if not file_name.startswith(("rainnn", "maxtmp", "mintmp")):
        continue
    # NOTE(review): assumes the four-digit year sits at characters [-10:-6] of
    # the file name -- confirm against the actual naming scheme.
    year = file_name[-10:-6]
    data_type = file_name[:6]
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    df = pd.read_csv(os.path.join(folder_path, file_name), sep=r'\s+', header=None)
    num_days = df.shape[1] - 2  # the first two columns are Latitude and Longitude
    date_columns = pd.date_range(start=f'{year}-01-01', periods=num_days).strftime('%Y-%m-%d').tolist()
    df.columns = ['Latitude', 'Longitude'] + date_columns
    data[(data_type, year)] = df

# Step 2: Average temperature = (daily max + daily min) / 2, stored as 'avgtmp'.
# The addition aligns on row index and column labels; the coordinate columns are
# taken from the maxtmp frame.
for year in range(1951, 2024):
    maxtmp = data[('maxtmp', str(year))]
    mintmp = data[('mintmp', str(year))]
    avg_tmp = (maxtmp.iloc[:, 2:] + mintmp.iloc[:, 2:]) / 2
    data[('avgtmp', str(year))] = pd.concat([maxtmp.iloc[:, :2], avg_tmp], axis=1)


# Step 3: Build one daily (Rainfall, Temperature) frame per (latitude, longitude).
# Yearly pieces are collected in lists and concatenated once per location, which
# avoids both the quadratic cost of growing a frame inside the loop and the
# deprecated concatenation onto an empty frame.
frames_by_location = {}

for year in range(1951, 2024):
    rain_df = data[('rainnn', str(year))]
    temp_df = data[('avgtmp', str(year))]
    date_index = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31')

    # One pass over the temperature grid replaces a full boolean-mask scan per point.
    temp_by_location = {
        (lat, lon): values
        for lat, lon, values in zip(temp_df['Latitude'], temp_df['Longitude'], temp_df.iloc[:, 2:].to_numpy())
    }

    for lat, lon, rain in zip(rain_df['Latitude'], rain_df['Longitude'], rain_df.iloc[:, 2:].to_numpy()):
        temp = temp_by_location.get((lat, lon))
        if temp is None:
            # No temperature row for this rainfall point in this year: the location
            # ends up short of the expected row count and is dropped by the filter below.
            continue
        # Raises ValueError when the number of daily columns differs from the
        # number of calendar days in `year`.
        yearly = pd.DataFrame({'Rainfall': rain, 'Temperature': temp}, index=date_index)
        frames_by_location.setdefault((lat, lon), []).append(yearly)

result = {location: pd.concat(frames) for location, frames in frames_by_location.items()}

# Keep only locations with a complete record: 26663 is the number of days from
# 1951-01-01 through 2023-12-31 (73 years including 18 leap days).
filtered_result = {key: value for key, value in result.items() if len(value) == 26663}

In [None]:
# Tag every retained grid point with its state, using the GeoLocations table.
# The result is re-keyed as (lat, lon, state_name); points in MAHARASHTRA and
# points without a matching row are reported and left out.
geo_df = pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/GeoLocations/GeoLocations.csv")

data = {}

for (lat, lon), df in filtered_result.items():
    # Exact coordinate match; the first matching row decides the state.
    same_point = (geo_df['Latitude'] == lat) & (geo_df['Longitude'] == lon)
    state_names = geo_df.loc[same_point, 'State Name']

    if state_names.empty:
        print(f"No state found for latitude {lat} and longitude {lon}")
        continue

    state_name = state_names.iloc[0]
    if state_name == 'MAHARASHTRA':
        print(f"Latitude {lat} and longitude {lon} corresponds to the state of Maharashtra, skipping...")
        continue

    data[(lat, lon, state_name)] = df

In [None]:
# Alias the {(lat, lon, state_name): DataFrame} mapping under a descriptive name;
# the monthly-rainfall cell below iterates over `weather`.
weather = data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to calculate monthly cumulative rainfall for the last year
def calculate_monthly_cumulative_rainfall(weather_data, year='2023'):
    """Return the total rainfall per calendar month for one year.

    Parameters
    ----------
    weather_data : pandas.DataFrame
        Daily data with a DatetimeIndex and a 'Rainfall' column.
    year : str or int, default '2023'
        Year to summarise. The default reproduces the previously hard-coded year.

    Returns
    -------
    pandas.Series
        Rainfall summed per month, indexed by month number (1-12); only months
        present in the data appear.
    """
    # Rows are selected with .loc: selecting rows by a year string through
    # DataFrame.__getitem__ (weather_data['2023']) is no longer supported by pandas.
    year_data = weather_data.loc[str(year)]
    return year_data['Rainfall'].groupby(year_data.index.month).sum()

# Calculate monthly cumulative rainfall for the first lat, lon pair of each state.
# `weather` is keyed by (lat, lon, state); a state that already has an entry is
# skipped so the FIRST grid point is the one kept (previously every later point
# silently overwrote the earlier one, leaving the last).
monthly_cumulative_rainfall = {}
for (lat, lon, state), state_data in weather.items():
    if state in monthly_cumulative_rainfall:
        continue
    monthly_cumulative_rainfall[state] = calculate_monthly_cumulative_rainfall(state_data)

# One column per state, one row per month number.
monthly_cumulative_rainfall_df = pd.DataFrame(monthly_cumulative_rainfall)


In [None]:
import pandas as pd

# Required rainfall keyed by (state name, month number 1-12).
# The values are hand-entered; make sure the table is filled in completely.
required_rainfall = {
    ('ANDHRA PRADESH', 1): 103.013,
    ('ANDHRA PRADESH', 2): 93.044,
    ('ANDHRA PRADESH', 3): 103.013,
    ('ANDHRA PRADESH', 4): 99.69,
    ('ANDHRA PRADESH', 5): 103.013,
    ('ANDHRA PRADESH', 6): 99.15231917819997,
    ('ANDHRA PRADESH', 7): 102.45739648414,
    ('ANDHRA PRADESH', 8): 102.45739648414,
    ('ANDHRA PRADESH', 9): 99.15231917819997,
    ('ANDHRA PRADESH', 10): 102.45739648414,
    ('ANDHRA PRADESH', 11): 99.69,
    ('ANDHRA PRADESH', 12): 103.013,
    ('BIHAR', 1): 32.5101428815,
    ('BIHAR', 2): 29.364000022,
    ('BIHAR', 3): 32.5101428815,
    ('BIHAR', 4): 31.461428594999997,
    ('BIHAR', 5): 114.42099999999999,
    ('BIHAR', 6): 181.11249757800002,
    ('BIHAR', 7): 187.1495808306,
    ('BIHAR', 8): 187.1495808306,
    ('BIHAR', 9): 181.11249757800002,
    ('BIHAR', 10): 131.6364857097,
    ('BIHAR', 11): 31.461428594999997,
    ('BIHAR', 12): 32.5101428815,
    ('GUJARAT', 1): 28.6039668666,
    ('GUJARAT', 2): 25.8358410408,
    ('GUJARAT', 3): 106.392,
    ('GUJARAT', 4): 99.52799999999999,
    ('GUJARAT', 5): 106.392,
    ('GUJARAT', 6): 108.489086847,
    ('GUJARAT', 7): 112.1053897419,
    ('GUJARAT', 8): 112.1053897419,
    ('GUJARAT', 9): 108.489086847,
    ('GUJARAT', 10): 129.67228061121,
    ('GUJARAT', 11): 27.681258258,
    ('GUJARAT', 12): 28.6039668666,
    ('HARYANA', 1): 45.7540256211,
    ('HARYANA', 2): 16.5221538412,
    ('HARYANA', 3): 8.922854818400001,
    ('HARYANA', 4): 8.635020791999999,
    ('HARYANA', 5): 102.6695199938,
    ('HARYANA', 6): 99.357599994,
    ('HARYANA', 7): 102.6695199938,
    ('HARYANA', 8): 102.6695199938,
    ('HARYANA', 9): 99.357599994,
    ('HARYANA', 10): 32.61265955445,
    ('HARYANA', 11): 14.6891489115,
    ('HARYANA', 12): 18.292384609899997,
    ('KARNATAKA', 1): 104.041095904,
    ('KARNATAKA', 2): 93.972602752,
    ('KARNATAKA', 3): 104.041095904,
    ('KARNATAKA', 4): 100.68493151999999,
    ('KARNATAKA', 5): 104.041095904,
    ('KARNATAKA', 6): 77.873755308,
    ('KARNATAKA', 7): 112.4238729136,
    ('KARNATAKA', 8): 118.8485556331,
    ('KARNATAKA', 9): 120.63777795300001,
    ('KARNATAKA', 10): 102.3738134521,
    ('KARNATAKA', 11): 100.68493151999999,
    ('KARNATAKA', 12): 104.041095904,
    ('MADHYA PRADESH', 1): 14.384553277460002,
    ('MADHYA PRADESH', 2): 12.99249973448,
    ('MADHYA PRADESH', 3): 6.4924987361,
    ('MADHYA PRADESH', 4): 6.2830632930000005,
    ('MADHYA PRADESH', 5): 104.308630151,
    ('MADHYA PRADESH', 6): 70.04310685200001,
    ('MADHYA PRADESH', 7): 115.5899069405,
    ('MADHYA PRADESH', 8): 124.972010297,
    ('MADHYA PRADESH', 9): 129.21888381,
    ('MADHYA PRADESH', 10): 85.96317898036,
    ('MADHYA PRADESH', 11): 13.920535429800001,
    ('MADHYA PRADESH', 12): 14.384553277460002,
    ('MAHARASHTRA', 1): 67.4371330869,
    ('MAHARASHTRA', 2): 60.9109589172,
    ('MAHARASHTRA', 3): 67.4371330869,
    ('MAHARASHTRA', 4): 100.68493151999999,
    ('MAHARASHTRA', 5): 104.041095904,
    ('MAHARASHTRA', 6): 135.843201876,
    ('MAHARASHTRA', 7): 153.9356435014,
    ('MAHARASHTRA', 8): 156.9406641729,
    ('MAHARASHTRA', 9): 154.542971817,
    ('MAHARASHTRA', 10): 131.42229532340002,
    ('MAHARASHTRA', 11): 65.261741697,
    ('MAHARASHTRA', 12): 67.4371330869,
    ('ORISSA', 1): 0.0,
    ('ORISSA', 2): 0.0,
    ('ORISSA', 3): 0.0,
    ('ORISSA', 4): 0.0,
    ('ORISSA', 5): 0.0,
    ('ORISSA', 6): 176.47058823,
    ('ORISSA', 7): 182.352941171,
    ('ORISSA', 8): 182.352941171,
    ('ORISSA', 9): 176.47058823,
    ('ORISSA', 10): 182.352941171,
    ('ORISSA', 11): 0.0,
    ('ORISSA', 12): 0.0,
    ('PUNJAB', 1): 14.600999999999999,
    ('PUNJAB', 2): 13.187999999999999,
    ('PUNJAB', 3): 14.600999999999999,
    ('PUNJAB', 4): 14.129999999999999,
    ('PUNJAB', 5): 0.0,
    ('PUNJAB', 6): 0.0,
    ('PUNJAB', 7): 0.0,
    ('PUNJAB', 8): 0.0,
    ('PUNJAB', 9): 0.0,
    ('PUNJAB', 10): 14.600999999999999,
    ('PUNJAB', 11): 14.129999999999999,
    ('PUNJAB', 12): 14.600999999999999,
    ('RAJASTHAN', 1): 25.224441278960004,
    ('RAJASTHAN', 2): 22.78336631648,
    ('RAJASTHAN', 3): 25.224441278960004,
    ('RAJASTHAN', 4): 4.24528302,
    ('RAJASTHAN', 5): 138.78071896600002,
    ('RAJASTHAN', 6): 86.841824964,
    ('RAJASTHAN', 7): 127.210197901,
    ('RAJASTHAN', 8): 133.743525491,
    ('RAJASTHAN', 9): 134.86484597999998,
    ('RAJASTHAN', 10): 53.72009527089,
    ('RAJASTHAN', 11): 24.4107496248,
    ('RAJASTHAN', 12): 25.224441278960004,
    ('TELANGANA', 1): 0.0,
    ('TELANGANA', 2): 0.0,
    ('TELANGANA', 3): 0.0,
    ('TELANGANA', 4): 0.0,
    ('TELANGANA', 5): 0.0,
    ('TELANGANA', 6): 102.51291855,
    ('TELANGANA', 7): 105.930015835,
    ('TELANGANA', 8): 105.930015835,
    ('TELANGANA', 9): 102.51291855,
    ('TELANGANA', 10): 105.930015835,
    ('TELANGANA', 11): 0.0,
    ('TELANGANA', 12): 0.0,
    ('UTTAR PRADESH', 1): 31.257259924440003,
    ('UTTAR PRADESH', 2): 28.23236380272,
    ('UTTAR PRADESH', 3): 31.257259924440003,
    ('UTTAR PRADESH', 4): 29.612979858,
    ('UTTAR PRADESH', 5): 99.6402,
    ('UTTAR PRADESH', 6): 116.598593328,
    ('UTTAR PRADESH', 7): 120.4852131056,
    ('UTTAR PRADESH', 8): 120.4852131056,
    ('UTTAR PRADESH', 9): 116.598593328,
    ('UTTAR PRADESH', 10): 61.9350817251,
    ('UTTAR PRADESH', 11): 30.2489612172,
    ('UTTAR PRADESH', 12): 31.257259924440003,
}

# States to tabulate, in column order. MAHARASHTRA has entries above but is not
# listed here; HIMACHAL PRADESH is listed but has no entries, so its column is
# filled with 0.0 (NOTE(review): confirm this is intended rather than missing data).
required_rainfall_states = [
    'GUJARAT', 'RAJASTHAN', 'KARNATAKA', 'MADHYA PRADESH', 'PUNJAB', 'HARYANA',
    'HIMACHAL PRADESH', 'ANDHRA PRADESH', 'TELANGANA', 'UTTAR PRADESH', 'ORISSA', 'BIHAR',
]
required_rainfall_months = range(1, 13)

# Create the DataFrame: rows are month numbers, columns are states.
# Each column is built from that state's own twelve values. The earlier version
# produced one ROW per state and then labelled rows as months and columns as
# states, which silently transposed the (square, 12x12) table. The weather
# mapping held in `data` is also no longer overwritten by a scratch list.
required_rainfall_df = pd.DataFrame(
    {
        state: [required_rainfall.get((state, month), 0.0) for month in required_rainfall_months]
        for state in required_rainfall_states
    },
    index=required_rainfall_months,
)


In [None]:
# Display the observed monthly rainfall totals (rows: month number, columns: state).
monthly_cumulative_rainfall_df

In [None]:
# Display the required monthly rainfall table built from `required_rainfall`.
required_rainfall_df

In [None]:
# Observed minus required rainfall, aligned on month rows and state columns:
# negative values are a deficit, positive values an excess. A state present in
# only one of the two tables yields an all-NaN column.
deficit_rainfall = monthly_cumulative_rainfall_df.sub(required_rainfall_df)
deficit_rainfall

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scaling to [-1, 1] is currently switched off, so the raw deficit/excess values
# are plotted under the `normalized_df` name.
# normalized_df = (deficit_rainfall - deficit_rainfall.min()) / (deficit_rainfall.max() - deficit_rainfall.min()) * 2 - 1
normalized_df = deficit_rainfall

# One line per state on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
for state in normalized_df.columns:
    ax.plot(normalized_df.index, normalized_df[state], label=state)

# Titles, labels and legend.
ax.set_title(' Monthly Estimated Rainfall Deficit/Excess')
ax.set_xlabel('Month')
ax.set_ylabel('Rainfall Deficit/Excess (mm)')
ax.legend()
# ax.grid(True)

# Month names on the x-axis; y ticks every 50 units from -200 up to 750.
ax.set_xticks(range(1, 13))
ax.set_xticklabels(['Jan 23', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec 23'])
ax.set_yticks(range(-200, 800, 50))
# ax.set_ylim(-1, 1)  # Enforce y-axis limits
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Min-max scale each state column independently onto [-1, 1]:
# the column minimum maps to -1 and the column maximum to +1.
deficit_rainfall_norm = (
    deficit_rainfall
    .sub(deficit_rainfall.min())
    .div(deficit_rainfall.max() - deficit_rainfall.min())
    .mul(2)
    .sub(1)
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_mean_and_variance(filtered_result, start_date=None, end_date=None):
    """Plot per-state mean and spread of daily temperature and rainfall.

    For every state one figure with two panels is drawn, shown, and saved as
    ``'<state>_1951_mean_and_variance.png'`` in the working directory. Each panel
    shows the mean across the state's grid points and a band of +/- one standard
    deviation (the legend calls the band "Variance").

    Parameters
    ----------
    filtered_result : dict
        Mapping ``(lat, lon, state_name) -> DataFrame`` with 'Temperature' and
        'Rainfall' columns. All frames of one state must have the same length
        after date filtering, otherwise the per-day statistics cannot be computed.
    start_date, end_date : optional
        When BOTH are given, each frame is restricted with
        ``df.loc[start_date:end_date]``; otherwise no filtering is applied.

    Notes
    -----
    NOTE(review): the plot titles hard-code "2023" and the file name hard-codes
    "1951" regardless of the requested date range; both are left unchanged here
    so existing outputs keep their names.
    """
    # Per-state accumulator: the list of per-location series plus the date axis.
    state_data = {}

    for (lat, lon, state_name), df in filtered_result.items():
        if start_date and end_date:
            df = df.loc[start_date:end_date]
        if state_name not in state_data:
            # Remember this state's own date axis instead of relying on whatever
            # frame the loop variable happens to hold after the loop finishes.
            state_data[state_name] = {'Temperature': [], 'Rainfall': [], 'Dates': df.index, 'Count': 0}
        entry = state_data[state_name]
        entry['Temperature'].append(df['Temperature'])
        entry['Rainfall'].append(df['Rainfall'])
        entry['Count'] += 1

    for state, entry in state_data.items():
        # Create a new figure for each state
        plt.figure(figsize=(20, 5))
        dates = entry['Dates']

        # Statistics across locations (axis 0), one value per day.
        mean_temp = np.mean(entry['Temperature'], axis=0)
        mean_rainfall = np.mean(entry['Rainfall'], axis=0)
        std_temp = np.std(entry['Temperature'], axis=0)
        std_rainfall = np.std(entry['Rainfall'], axis=0)

        # Left panel: temperature
        plt.subplot(1, 2, 1)
        plt.plot(dates, mean_temp, label='Mean Temperature', color='blue')
        plt.fill_between(dates, mean_temp - std_temp, mean_temp + std_temp,
                         color='lightblue', label='Variance (Temperature)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Temperature (°C)', fontsize=12, fontweight='bold')
        plt.title(f'2023 Temperature for {state} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Right panel: rainfall
        plt.subplot(1, 2, 2)
        plt.plot(dates, mean_rainfall, label='Mean Rainfall', color='green')
        plt.fill_between(dates, mean_rainfall - std_rainfall, mean_rainfall + std_rainfall,
                         color='lightgreen', label='Variance (Rainfall)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Rainfall (mm)', fontsize=12, fontweight='bold')
        plt.title(f'2023 Rainfall for {state} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Adjust layout and save each state's plot separately
        plt.tight_layout()
        plt.savefig(f'{state}_1951_mean_and_variance.png')
        plt.show()

# Example usage with custom start and end dates.
# The state-keyed frames are passed via `weather`: on a top-to-bottom run the
# name `data` is reassigned to a plain list by the required-rainfall cell, which
# has no .items() and makes the call fail.
start_date = '2023-01-01'
end_date = '2023-12-31'
plot_mean_and_variance(weather, start_date, end_date)


In [None]:
# Raw tables, loaded as-is.
electricity = pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/Electricity/Electricity.csv")
energystocks = pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/Energy Stocks/Overall_EnergyStocks.csv")
powerstocks = pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/Power Stocks/Overall_PowerStocks.csv")

# Renewables: the unnamed first column is renamed to 'Date' and becomes the index.
windenergy = (
    pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/Renewables/Renewables-Prepared/WindEnergy.csv")
    .rename(columns={'Unnamed: 0': 'Date'})
    .set_index('Date')
)
solarenergy = (
    pd.read_csv("/Users/soumilhooda/Desktop/WD/Data-WD/Renewables/Renewables-Prepared/SolarEnergy.csv")
    .rename(columns={'Unnamed: 0': 'Date'})
    .set_index('Date')
)

In [None]:
# Remove leftover 'Unnamed:' columns from the electricity table.
electricity = electricity.drop(columns=electricity.columns[electricity.columns.str.startswith('Unnamed:')])

# Stock tables: columns prefixed 'AVG_' are exposed as 'Price_'.
energystocks = energystocks.rename(columns=lambda name: name.replace('AVG_', 'Price_'))
powerstocks = powerstocks.rename(columns=lambda name: name.replace('AVG_', 'Price_'))

# Index electricity by date. The guard lets the cell be re-run after 'Date' has
# already been moved into the index; on a first run a missing 'Date' column
# still raises KeyError.
if electricity.index.name != 'Date':
    electricity['Date'] = pd.to_datetime(electricity['Date'])
    electricity = electricity.set_index('Date')

# Zeros are treated as missing readings. They are converted to NaN BEFORE the
# single interpolation pass; interpolating first (as before) used the zero
# placeholders as anchor points and pulled neighbouring filled values towards 0.
electricity = electricity.replace(0, np.nan).interpolate(method='linear')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def normalize(data):
    """Min-max scale ``data`` onto the range [0, 1].

    Values are first converted with ``pd.to_numeric(..., errors='coerce')``, so
    entries that cannot be parsed become NaN and stay NaN in the result.

    Parameters
    ----------
    data : array-like or pandas.Series
        Values to scale.

    Returns
    -------
    Same container type as ``pd.to_numeric`` returns for ``data``: the minimum
    maps to 0 and the maximum to 1. Constant input (max == min) yields zeros
    instead of the all-NaN result a 0/0 division would give.
    """
    numeric_data = pd.to_numeric(data, errors='coerce')
    min_val = numeric_data.min()
    max_val = numeric_data.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Every valid value equals the minimum: report them all as 0.0.
        return (numeric_data - min_val).astype(float)
    return (numeric_data - min_val) / value_range


def plot_mean_and_variance_and_electricity(filtered_result, electricity, start_date=None, end_date=None):
    """Plot per-state weather statistics against scaled electricity consumption.

    For every state one two-panel figure is drawn and shown: mean and
    +/- one standard deviation (labelled "Variance") of temperature (left) and
    rainfall (right) across the state's grid points, each overlaid on a twin
    y-axis with the state's electricity series scaled by ``normalize``.

    Parameters
    ----------
    filtered_result : dict
        Mapping ``(lat, lon, state_name) -> DataFrame`` with 'Temperature' and
        'Rainfall' columns.
    electricity : pandas.DataFrame
        One column per state name; a state missing from it raises KeyError.
    start_date, end_date : optional
        When BOTH are given, the weather frames and the electricity series are
        restricted with ``.loc[start_date:end_date]``.

    Notes
    -----
    The electricity series is drawn against the state's weather dates, so both
    must have the same number of points after filtering.
    TODO: the Pearson-correlation report that used to be sketched in comments
    here is not implemented.
    """
    state_data = {}

    for (lat, lon, state_name), df in filtered_result.items():
        if start_date and end_date:
            df = df.loc[start_date:end_date]
        if state_name not in state_data:
            # The electricity series is per state, so prepare it once per state
            # rather than once per grid point.
            electricity_state = pd.to_numeric(electricity[state_name], errors='coerce')
            if start_date and end_date:
                electricity_state = electricity_state.loc[start_date:end_date]
            state_data[state_name] = {
                'Temperature': [],
                'Rainfall': [],
                'Electricity': normalize(electricity_state),
                # Keep this state's own date axis; do not rely on the loop
                # variable still holding a suitable frame after the loop.
                'Dates': df.index,
                'Count': 0,
            }
        entry = state_data[state_name]
        entry['Temperature'].append(df['Temperature'])
        entry['Rainfall'].append(df['Rainfall'])
        entry['Count'] += 1

    for state, entry in state_data.items():
        # Create a new figure for each state
        plt.figure(figsize=(20, 5))
        dates = entry['Dates']

        # Statistics across locations (axis 0), one value per day.
        mean_temp = np.mean(entry['Temperature'], axis=0)
        mean_rainfall = np.mean(entry['Rainfall'], axis=0)
        std_temp = np.std(entry['Temperature'], axis=0)
        std_rainfall = np.std(entry['Rainfall'], axis=0)

        # Left panel: temperature
        plt.subplot(1, 2, 1)
        plt.plot(dates, mean_temp, label='Mean Temperature', color='blue')
        plt.fill_between(dates, mean_temp - std_temp, mean_temp + std_temp,
                         color='lightblue', label='Variance (Temperature)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Temperature (°C)', fontsize=12, fontweight='bold')
        plt.title(f'Electricity Consumption vs Temperature for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Electricity on a twin y-axis; the twin becomes the current axes, so the
        # label, tick and legend calls that follow apply to it.
        plt.twinx().plot(dates, entry['Electricity'], label='Electricity', color='red')
        plt.ylabel('Electricity Consumption (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Right panel: rainfall
        plt.subplot(1, 2, 2)
        plt.plot(dates, mean_rainfall, label='Mean Rainfall', color='green')
        plt.fill_between(dates, mean_rainfall - std_rainfall, mean_rainfall + std_rainfall,
                         color='lightgreen', label='Variance (Rainfall)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Rainfall (mm)', fontsize=12, fontweight='bold')
        plt.title(f'Electricity Consumption vs Rainfall for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Electricity again, on the rainfall panel's twin axis.
        plt.twinx().plot(dates, entry['Electricity'], label='Electricity', color='red')
        plt.ylabel('Electricity Consumption (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Adjust layout and show; saving is currently disabled.
        plt.tight_layout()
        # plt.savefig(f'{state}_mean_and_variance_and_electricity.png')
        plt.show()

# Example usage with custom start and end dates.
# `weather` holds the {(lat, lon, state): DataFrame} mapping; `data` is reused for
# a plain list by the required-rainfall cell and cannot be iterated with .items().
start_date = '2019-11-15'
end_date = '2019-12-31'
plot_mean_and_variance_and_electricity(weather, electricity, start_date, end_date)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def normalize(data):
    """Min-max scale ``data`` onto the range [0, 1].

    Values are first converted with ``pd.to_numeric(..., errors='coerce')``, so
    entries that cannot be parsed become NaN and stay NaN in the result.

    Parameters
    ----------
    data : array-like or pandas.Series
        Values to scale.

    Returns
    -------
    Same container type as ``pd.to_numeric`` returns for ``data``: the minimum
    maps to 0 and the maximum to 1. Constant input (max == min) yields zeros
    instead of the all-NaN result a 0/0 division would give.
    """
    numeric_data = pd.to_numeric(data, errors='coerce')
    min_val = numeric_data.min()
    max_val = numeric_data.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Every valid value equals the minimum: report them all as 0.0.
        return (numeric_data - min_val).astype(float)
    return (numeric_data - min_val) / value_range


def plot_mean_and_variance_and_electricity(filtered_result, windenergy, start_date=None, end_date=None):
    """Plot per-state weather statistics against scaled wind-energy production.

    For every state one two-panel figure is drawn, saved as
    ``'<state>_<start year>.png'`` and shown: mean and +/- one standard
    deviation (labelled "Variance") of temperature (left) and rainfall (right)
    across the state's grid points, each overlaid on a twin y-axis with the
    state's wind-energy series scaled by ``normalize``.

    Parameters
    ----------
    filtered_result : dict
        Mapping ``(lat, lon, state_name) -> DataFrame`` with 'Temperature' and
        'Rainfall' columns.
    windenergy : pandas.DataFrame
        One column per state name; a state missing from it raises KeyError.
        Missing values are replaced by 0 IN PLACE, so the caller's frame is
        modified.
    start_date, end_date : optional
        When BOTH are given, the weather frames and the energy series are
        restricted with ``.loc[start_date:end_date]``. ``start_date`` also
        provides the year used in the output file name.

    Notes
    -----
    The energy series is drawn against the state's weather dates, so both must
    have the same number of points after filtering.
    TODO: the Pearson-correlation report that used to be sketched in comments
    here is not implemented.
    """
    state_data = {}

    # Loop-invariant: fill gaps once instead of once per grid point.
    windenergy.fillna(0, inplace=True)

    for (lat, lon, state_name), df in filtered_result.items():
        if start_date and end_date:
            df = df.loc[start_date:end_date]
        if state_name not in state_data:
            # The energy series is per state, so prepare it once per state.
            windenergy_state = pd.to_numeric(windenergy[state_name], errors='coerce')
            if start_date and end_date:
                windenergy_state = windenergy_state.loc[start_date:end_date]
            state_data[state_name] = {
                'Temperature': [],
                'Rainfall': [],
                'Wind Energy': normalize(windenergy_state),
                # Keep this state's own date axis; do not rely on the loop
                # variable still holding a suitable frame after the loop.
                'Dates': df.index,
                'Count': 0,
            }
        entry = state_data[state_name]
        entry['Temperature'].append(df['Temperature'])
        entry['Rainfall'].append(df['Rainfall'])
        entry['Count'] += 1

    # Year used in the file names; identical for every state.
    start_year = pd.to_datetime(start_date).year

    for state, entry in state_data.items():
        # Create a new figure for each state
        plt.figure(figsize=(20, 5))
        dates = entry['Dates']

        # Statistics across locations (axis 0), one value per day.
        mean_temp = np.mean(entry['Temperature'], axis=0)
        mean_rainfall = np.mean(entry['Rainfall'], axis=0)
        std_temp = np.std(entry['Temperature'], axis=0)
        std_rainfall = np.std(entry['Rainfall'], axis=0)

        # Left panel: temperature
        plt.subplot(1, 2, 1)
        plt.plot(dates, mean_temp, label='Mean Temperature', color='blue')
        plt.fill_between(dates, mean_temp - std_temp, mean_temp + std_temp,
                         color='lightblue', label='Variance (Temperature)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Temperature (°C)', fontsize=12, fontweight='bold')
        plt.title(f'Wind Energy Production vs Temperature for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Wind energy on a twin y-axis; the twin becomes the current axes, so the
        # label, tick and legend calls that follow apply to it.
        plt.twinx().plot(dates, entry['Wind Energy'], label='Wind Energy', color='red')
        plt.ylabel('Wind Energy Production (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Right panel: rainfall
        plt.subplot(1, 2, 2)
        plt.plot(dates, mean_rainfall, label='Mean Rainfall', color='green')
        plt.fill_between(dates, mean_rainfall - std_rainfall, mean_rainfall + std_rainfall,
                         color='lightgreen', label='Variance (Rainfall)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Rainfall (mm)', fontsize=12, fontweight='bold')
        plt.title(f'Wind Energy Production vs Rainfall for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Wind energy again, on the rainfall panel's twin axis.
        plt.twinx().plot(dates, entry['Wind Energy'], label='Wind Energy', color='red')
        plt.ylabel('Wind Energy Production (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Adjust layout and save each state's plot separately
        plt.tight_layout()
        plt.savefig(f'{state}_{start_year}.png')
        plt.show()

# Example usage with custom start and end dates.
# `weather` holds the {(lat, lon, state): DataFrame} mapping; `data` is reused for
# a plain list by the required-rainfall cell and cannot be iterated with .items().
start_date = '2023-01-01'
end_date = '2023-12-31'
plot_mean_and_variance_and_electricity(weather, windenergy, start_date, end_date)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

def normalize(data):
    """Min-max scale ``data`` onto the range [0, 1].

    Values are first converted with ``pd.to_numeric(..., errors='coerce')``, so
    entries that cannot be parsed become NaN and stay NaN in the result.

    Parameters
    ----------
    data : array-like or pandas.Series
        Values to scale.

    Returns
    -------
    Same container type as ``pd.to_numeric`` returns for ``data``: the minimum
    maps to 0 and the maximum to 1. Constant input (max == min) yields zeros
    instead of the all-NaN result a 0/0 division would give.
    """
    numeric_data = pd.to_numeric(data, errors='coerce')
    min_val = numeric_data.min()
    max_val = numeric_data.max()
    value_range = max_val - min_val
    if value_range == 0:
        # Every valid value equals the minimum: report them all as 0.0.
        return (numeric_data - min_val).astype(float)
    return (numeric_data - min_val) / value_range


def plot_mean_and_variance_and_electricity(filtered_result, windenergy, start_date=None, end_date=None):
    """Plot per-state weather statistics against scaled solar-energy production.

    For every state one two-panel figure is drawn, saved as
    ``'<state>_<start year>.png'`` and shown: mean and +/- one standard
    deviation (labelled "Variance") of temperature (left) and rainfall (right)
    across the state's grid points, each overlaid on a twin y-axis with the
    state's energy series scaled by ``normalize``. All titles and labels say
    "Solar Energy".

    Parameters
    ----------
    filtered_result : dict
        Mapping ``(lat, lon, state_name) -> DataFrame`` with 'Temperature' and
        'Rainfall' columns.
    windenergy : pandas.DataFrame
        Energy table with one column per state name (the parameter keeps its
        historical name although the plots are labelled as solar energy). A
        state missing from it raises KeyError. Missing values are replaced by 0
        IN PLACE, so the caller's frame is modified.
    start_date, end_date : optional
        When BOTH are given, the weather frames and the energy series are
        restricted with ``.loc[start_date:end_date]``. ``start_date`` also
        provides the year used in the output file name.

    Notes
    -----
    ``plt.rcParams`` is updated globally (white text and edges, transparent
    backgrounds); the change stays in effect for figures created afterwards.
    The energy series is drawn against the state's weather dates, so both must
    have the same number of points after filtering.
    TODO: the Pearson-correlation report that used to be sketched in comments
    here is not implemented.
    """
    state_data = {}

    # Apply plotting conditions before creating figures
    plt.rcParams.update({
        'text.color': 'white',
        'axes.labelcolor': 'white',
        'axes.edgecolor': 'white',
        'xtick.color': 'white',
        'ytick.color': 'white',
        'legend.facecolor': 'none',
        'legend.edgecolor': 'white',
        'legend.fontsize': 'large',
        'legend.labelspacing': 0.5,
        'legend.title_fontsize': 'large',
        'figure.facecolor': 'none',
        'axes.facecolor': 'none',
        'axes.grid': False,
    })

    # Loop-invariant: fill gaps once instead of once per grid point.
    windenergy.fillna(0, inplace=True)

    for (lat, lon, state_name), df in filtered_result.items():
        if start_date and end_date:
            df = df.loc[start_date:end_date]
        if state_name not in state_data:
            # The energy series is per state, so prepare it once per state.
            energy_state = pd.to_numeric(windenergy[state_name], errors='coerce')
            if start_date and end_date:
                energy_state = energy_state.loc[start_date:end_date]
            state_data[state_name] = {
                'Temperature': [],
                'Rainfall': [],
                'Energy': normalize(energy_state),
                # Keep this state's own date axis; do not rely on the loop
                # variable still holding a suitable frame after the loop.
                'Dates': df.index,
                'Count': 0,
            }
        entry = state_data[state_name]
        entry['Temperature'].append(df['Temperature'])
        entry['Rainfall'].append(df['Rainfall'])
        entry['Count'] += 1

    # Year used in the file names; identical for every state.
    start_year = pd.to_datetime(start_date).year

    for state, entry in state_data.items():
        # Create a new figure for each state
        plt.figure(figsize=(20, 5))
        dates = entry['Dates']

        # Statistics across locations (axis 0), one value per day.
        mean_temp = np.mean(entry['Temperature'], axis=0)
        mean_rainfall = np.mean(entry['Rainfall'], axis=0)
        std_temp = np.std(entry['Temperature'], axis=0)
        std_rainfall = np.std(entry['Rainfall'], axis=0)

        # Left panel: temperature
        plt.subplot(1, 2, 1)
        plt.plot(dates, mean_temp, label='Mean Temperature', color='blue')
        plt.fill_between(dates, mean_temp - std_temp, mean_temp + std_temp,
                         color='lightblue', label='Variance (Temperature)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Temperature (°C)', fontsize=12, fontweight='bold')
        plt.title(f'Solar Energy Production vs Temperature for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Energy on a twin y-axis; the twin becomes the current axes, so the
        # label, tick and legend calls that follow apply to it.
        plt.twinx().plot(dates, entry['Energy'], label='Solar Energy', color='red')
        plt.ylabel('Solar Energy Production (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Right panel: rainfall
        plt.subplot(1, 2, 2)
        plt.plot(dates, mean_rainfall, label='Mean Rainfall', color='green')
        plt.fill_between(dates, mean_rainfall - std_rainfall, mean_rainfall + std_rainfall,
                         color='lightgreen', label='Variance (Rainfall)')
        plt.xlabel('Date', fontsize=12, fontweight='bold')
        plt.ylabel('Rainfall (mm)', fontsize=12, fontweight='bold')
        plt.title(f'Solar Energy Production vs Rainfall for {state.title()} (Cities={entry["Count"]})', fontsize=14, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend()

        # Energy again, on the rainfall panel's twin axis.
        plt.twinx().plot(dates, entry['Energy'], label='Solar Energy', color='red')
        plt.ylabel('Solar Energy Production (Scaled MU)', fontsize=12, fontweight='bold')
        plt.xticks(fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold')
        plt.legend(loc='upper left')

        # Adjust layout and save each state's plot separately
        plt.tight_layout()
        plt.savefig(f'{state}_{start_year}.png')
        plt.show()

# Example usage with custom start and end dates.
# `weather` holds the {(lat, lon, state): DataFrame} mapping; `data` is reused for
# a plain list by the required-rainfall cell and cannot be iterated with .items().
start_date = '2023-04-01'
end_date = '2023-12-30'
plot_mean_and_variance_and_electricity(weather, solarenergy, start_date, end_date)

In [None]:
import matplotlib.pyplot as plt

# Resampling by month needs a datetime-like index, so parse it first.
solarenergy.index = pd.to_datetime(solarenergy.index)

# Per-month totals of the GUJARAT column, and the average of those totals.
monthly_aggregate = solarenergy['GUJARAT'].resample('M').sum()
mean_monthly_aggregate = monthly_aggregate.mean()

# Draw the monthly totals with a dashed horizontal reference line at their mean.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_aggregate.plot(ax=ax, label='Monthly Aggregate Solar Energy', marker='o')
ax.axhline(y=mean_monthly_aggregate, color='green', linestyle='--', label='Mean of Monthly Aggregates')
ax.set_title('Monthly Aggregate Solar Energy in Gujarat')
ax.set_xlabel('Date')
ax.set_ylabel('Solar Energy (MU)')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Resampling by month needs a datetime-like index, so parse it first.
windenergy.index = pd.to_datetime(windenergy.index)

# Per-month totals of the GUJARAT column, and the average of those totals.
monthly_aggregate = windenergy['GUJARAT'].resample('M').sum()
mean_monthly_aggregate = monthly_aggregate.mean()

# Draw the monthly wind totals with a dashed horizontal reference line at their mean.
fig, ax = plt.subplots(figsize=(10, 6))
monthly_aggregate.plot(ax=ax, label='Monthly Aggregate Wind Energy', marker='o')
ax.axhline(y=mean_monthly_aggregate, color='green', linestyle='--', label='Mean of Monthly Aggregates')
ax.set_title('Monthly Aggregate Wind Energy in Gujarat')
ax.set_xlabel('Date')
ax.set_ylabel('Wind Energy (MU)')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Date-string slicing and monthly resampling both need a datetime-like index.
solarenergy.index = pd.to_datetime(solarenergy.index)

# Analysis window (both bounds are inclusive in the label slice below).
start_date = '2022-01-01'
end_date = '2023-12-31'

# Per-month totals of the GUJARAT column inside the window, and their average.
gujarat_window = solarenergy.loc[start_date:end_date, 'GUJARAT']
monthly_aggregate = gujarat_window.resample('M').sum()
mean_monthly_aggregate = monthly_aggregate.mean()

# Monthly totals in red; the mean ("strike") as a dashed blue horizontal line.
fig, ax = plt.subplots(figsize=(8, 5))
monthly_aggregate.plot(ax=ax, label='Solar Energy', marker='o', color='red')
ax.axhline(y=mean_monthly_aggregate, color='blue', linestyle='--', label='Solar Monthly Mean (Strike)')

# One blue marker per month, placed on the mean line.
for month in monthly_aggregate.index:
    ax.plot(month, mean_monthly_aggregate, marker='o', color='blue')

ax.set_title('Monthly Aggregate Solar Energy Production in Gujarat', weight='bold')
ax.set_xlabel('Date', weight='bold')
ax.set_ylabel('Solar Energy (MU)', weight='bold')
ax.legend()

plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Date-string slicing and monthly resampling both need a datetime-like index.
windenergy.index = pd.to_datetime(windenergy.index)

# Analysis window (both bounds are inclusive in the label slice below).
start_date = '2022-01-01'
end_date = '2023-12-31'

# Per-month totals of the KARNATAKA column inside the window, and their average.
karnataka_window = windenergy.loc[start_date:end_date, 'KARNATAKA']
monthly_aggregate = karnataka_window.resample('M').sum()
mean_monthly_aggregate = monthly_aggregate.mean()

# Monthly wind totals in red; the mean ("strike") as a dashed blue horizontal line.
fig, ax = plt.subplots(figsize=(8, 5))
monthly_aggregate.plot(ax=ax, label='Wind Energy', marker='o', color='red')
ax.axhline(y=mean_monthly_aggregate, color='blue', linestyle='--', label='Wind Monthly Mean (Strike)')

# One blue marker per month, placed on the mean line.
for month in monthly_aggregate.index:
    ax.plot(month, mean_monthly_aggregate, marker='o', color='blue')

ax.set_title('Monthly Aggregate Wind Energy Production in Karnataka', weight='bold')
ax.set_xlabel('Date', weight='bold')
ax.set_ylabel('Wind Energy (MU)', weight='bold')
ax.legend()

plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

def process_and_plot_data_for_commodities(folder_path, commodities, start_date, end_date):
    """Load and plot spot price and trading volume for each requested commodity.

    For every name in ``commodities`` the CSV files under ``folder_path`` are
    aggregated with ``process_files_for_commodity`` and the two resulting
    series are drawn with ``plot_dataframes`` for the given date window.

    Parameters
    ----------
    folder_path : str
        Directory handed to ``process_files_for_commodity``.
    commodities : iterable of str
        Commodity names to process, one figure each.
    start_date, end_date
        Date bounds forwarded unchanged to ``plot_dataframes``.
    """
    for name in commodities:
        prices, volumes = process_files_for_commodity(folder_path, name)
        # One figure per commodity, restricted to the requested window
        plot_dataframes(prices, volumes, start_date, end_date, name)

def process_files_for_commodity(folder_path, commodity_name):
    """Collect one commodity's spot price and traded volume from every CSV in a folder.

    Each file directly inside ``folder_path`` whose name ends in ``.csv`` is
    passed to ``process_file_for_commodity``. The per-file results are
    concatenated, their index is parsed with ``pd.to_datetime`` and the series
    are sorted by date.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).
    commodity_name : str
        Forwarded to ``process_file_for_commodity``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(spot_price, trading_volume)`` indexed by date in ascending order.
        Both series are empty (with a ``DatetimeIndex``) when the folder
        contains no CSV files, instead of ``pd.concat`` raising on an empty list.
    """
    spot_price_data = []
    trading_volume_data = []

    # os.listdir returns names in arbitrary order; sorting makes the
    # concatenation order (and therefore tie order for equal dates) repeatable.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, filename)
        spot_price_df, trading_volume_df = process_file_for_commodity(file_path, commodity_name)
        spot_price_data.append(spot_price_df)
        trading_volume_data.append(trading_volume_df)

    if not spot_price_data:
        # Nothing to concatenate: hand back empty, correctly typed series.
        empty_index = pd.DatetimeIndex([], name='Date')
        return (
            pd.Series(dtype='float64', index=empty_index, name='Spot Price'),
            pd.Series(dtype='int64', index=empty_index, name='Trading Volume Lots'),
        )

    spot_price_df = pd.concat(spot_price_data)
    trading_volume_df = pd.concat(trading_volume_data)

    spot_price_df.index = pd.to_datetime(spot_price_df.index)
    trading_volume_df.index = pd.to_datetime(trading_volume_df.index)

    # Stable sort keeps rows sharing a date in file order.
    spot_price_df = spot_price_df.sort_index(kind='mergesort')
    trading_volume_df = trading_volume_df.sort_index(kind='mergesort')

    return spot_price_df, trading_volume_df

def process_file_for_commodity(file_path, commodity_name):
    """Extract per-date spot price and total traded lots for one commodity from a CSV.

    The file must provide the columns ``'Commodity Name'``, ``'Date'``,
    ``'Spot Price'`` and ``'Trading Volume Lots'``. Cells that cannot be read
    as numbers (the ``'-'`` placeholder, blanks) are treated as missing.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    commodity_name : str
        Exact value of ``'Commodity Name'`` to keep.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``spot_price``: first non-missing numeric spot price per date (NaN when
        a date has none). ``trading_volume``: integer sum of lots per date,
        with missing cells counted as 0. Both are indexed by the raw ``'Date'``
        values of the file.
    """
    df = pd.read_csv(file_path)
    commodity_df = df[df['Commodity Name'] == commodity_name]
    dates = commodity_df['Date']

    def _as_number(column):
        # Go through str so the same path handles numeric and text columns;
        # drop thousands separators, then let unparseable cells become NaN.
        text = column.astype(str).str.replace(',', '', regex=False)
        return pd.to_numeric(text, errors='coerce')

    # Convert before grouping: first() skips NaN, so a '-' placeholder on the
    # first row of a date no longer hides a real price further down.
    spot_price_df = _as_number(commodity_df['Spot Price']).groupby(dates).first()

    # The previous astype(str) -> astype(int) chain raised on blank cells
    # ('nan') and on float-typed columns ('10.0'); count those cells as 0.
    volume = _as_number(commodity_df['Trading Volume Lots']).fillna(0)
    trading_volume_df = volume.groupby(dates).sum().astype('int64')

    return spot_price_df, trading_volume_df


def plot_dataframes(spot_price_df, trading_volume_df, start_date, end_date, commodity_name):
    """Plot a commodity's spot price and trading volume side by side.

    Parameters
    ----------
    spot_price_df : pandas.Series
        Spot prices indexed by date; any ``'-'`` entries are treated as NaN.
    trading_volume_df : pandas.Series
        Trading volumes indexed by date.
    start_date, end_date
        Inclusive bounds compared against the index. Each bound is applied on
        its own, so passing only one of them still filters; a falsy value
        (``None``, ``''``) leaves that side open.
    commodity_name : str
        Used, title-cased, in both panel titles.
    """
    spot_price_df = spot_price_df.replace('-', np.nan).astype(float)

    # Apply each bound independently instead of ignoring the filter
    # whenever one of the two is missing.
    if start_date:
        spot_price_df = spot_price_df[spot_price_df.index >= start_date]
        trading_volume_df = trading_volume_df[trading_volume_df.index >= start_date]
    if end_date:
        spot_price_df = spot_price_df[spot_price_df.index <= end_date]
        trading_volume_df = trading_volume_df[trading_volume_df.index <= end_date]

    # Only two panels are drawn, so use a single row rather than a 2x2 grid
    # that leaves two empty axes in the figure.
    fig, (price_ax, volume_ax) = plt.subplots(1, 2, figsize=(14, 6))

    price_ax.plot(spot_price_df.index, spot_price_df.values, label='Spot Price', color='blue')
    price_ax.set_title(f'{commodity_name.title()} Spot Price over Time')
    price_ax.set_xlabel('Date')
    price_ax.set_ylabel('Spot Price')
    price_ax.legend()

    volume_ax.plot(trading_volume_df.index, trading_volume_df.values, label='Trading Volume', color='green')
    volume_ax.set_title(f'{commodity_name.title()} Trading Volume over Time')
    volume_ax.set_xlabel('Date')
    volume_ax.set_ylabel('Trading Volume')
    volume_ax.legend()

    # Adjust layout
    fig.tight_layout()
    plt.show()


# Run configuration for the MCX commodity plots.
# NOTE(review): absolute, machine-specific path; `folder_path` is also reused as a
# global name by the weather-loading cell at the top of the notebook.
start_date, end_date = '2023-01-01', '2023-12-31'
commodities = ['COTTON', 'CRUDEOIL', 'NATURALGAS']
folder_path = r"/Users/soumilhooda/Desktop/WD/Data-WD/MCX-Commodity-Trading-Statistics/Cleaned"

process_and_plot_data_for_commodities(
    folder_path=folder_path,
    commodities=commodities,
    start_date=start_date,
    end_date=end_date,
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def plot_temp_price_time(filtered_result, energystocks, start_date=None, end_date=None):
    """Plot each state's mean temperature against energy-stock prices and chart their correlations.

    One subplot is drawn per state: the mean of that state's temperature
    series on the left axis and every energy-stock price column on a twin
    right axis. A second figure shows the temperature/price correlation
    matrix (price columns x states).

    Parameters
    ----------
    filtered_result : dict
        Maps ``(lat, lon, state_name)`` to a DataFrame with a ``'Temperature'``
        column. Assumes a daily datetime index covering the same dates for
        every location — TODO confirm against the cell that builds it.
    energystocks : pandas.DataFrame
        Must contain the ``Price_*`` columns listed below and be sliceable by
        date label.
    start_date, end_date : optional
        Inclusive window bounds. A missing bound is taken from the earliest /
        latest date found in the temperature series.

    Raises
    ------
    ValueError
        If ``filtered_result`` is empty.
    """
    # Treat '' like None so both mean "no bound"
    start_date = start_date or None
    end_date = end_date or None

    # Group the temperature series by state
    state_data = {}
    for (lat, lon, state_name), df in filtered_result.items():
        # A None bound leaves that side of the label slice open
        df = df.loc[start_date:end_date]
        entry = state_data.setdefault(state_name, {'Temperature': [], 'Count': 0})
        entry['Temperature'].append(df['Temperature'])
        entry['Count'] += 1

    if not state_data:
        raise ValueError("filtered_result is empty; there is nothing to plot")

    # Fill in missing bounds from the data so pd.date_range gets both ends
    all_series = [series for entry in state_data.values() for series in entry['Temperature']]
    if start_date is None:
        start_date = min(series.index.min() for series in all_series)
    if end_date is None:
        end_date = max(series.index.max() for series in all_series)

    # Relevant energy stock columns
    energy_columns = ['Price_South-ONGCEnergy', 'Volume_South-ONGCEnergy',
                      'Price_East-COALIndia', 'Volume_East-COALIndia',
                      'Price_West-BPCLEnergy', 'Volume_West-BPCLEnergy',
                      'Price_West-RelianceEnergy', 'Volume_West-RelianceEnergy',
                      'Price_East-OILEnergy', 'Volume_East-OILEnergy',
                      'Price_South-HPCLEnergy', 'Volume_South-HPCLEnergy',
                      'Price_North-GAILEnergy', 'Volume_North-GAILEnergy',
                      'Price_North-IOCLEnergy', 'Volume_North-IOCLEnergy']
    # Only prices are plotted and correlated; keeping the volume rows in the
    # matrix would just add rows that are never filled.
    price_columns = [col for col in energy_columns if 'Price' in col]

    date_range = pd.date_range(start=start_date, end=end_date)

    energystocks_filtered = energystocks.loc[start_date:end_date].copy()
    if len(energystocks_filtered) == len(date_range):
        # One row per calendar day: relabel positionally
        energystocks_filtered.index = date_range
    else:
        # Row count differs from the number of calendar days (e.g. no rows on
        # non-trading days): align by date and leave the gaps as NaN instead
        # of failing with a length mismatch.
        energystocks_filtered.index = pd.to_datetime(energystocks_filtered.index).normalize()
        energystocks_filtered = energystocks_filtered.reindex(date_range)

    correlation_df = pd.DataFrame(index=price_columns, columns=list(state_data.keys()))

    # squeeze=False always returns a 2-D array, so one state works as well as many
    n_states = len(state_data)
    fig, axes = plt.subplots(n_states, 1, figsize=(15, 8 * n_states), sharex=True, squeeze=False)
    axes = axes[:, 0]

    for ax, (state, info) in zip(axes, state_data.items()):
        # Element-wise mean across the state's locations (positional)
        mean_temp = np.mean(info['Temperature'], axis=0)
        ax.plot(date_range, mean_temp, label=f'{state} Temp', color='blue')
        ax.set_xlabel('Date')
        ax.set_ylabel('Temperature', color='blue')
        ax.tick_params(axis='y', labelcolor='blue')
        ax.set_title(f'Mean Temperature for {state}')

        # Twin y-axis for the stock prices, one colour per stock so the
        # lines can be told apart (C0 is skipped: too close to the blue above)
        ax2 = ax.twinx()
        for k, col in enumerate(price_columns):
            price_data = energystocks_filtered[col].dropna()
            ax2.plot(price_data.index, price_data, label=f'{col} ({state})', color=f'C{k + 1}')
        ax2.set_ylabel('Price')

        # Correlation between the state's mean temperature and each price
        feature_data = pd.Series(mean_temp, index=date_range).dropna()
        for col in price_columns:
            correlation_df.loc[col, state] = feature_data.corr(energystocks_filtered[col])

        # A single legend holding the lines of both y-axes
        temp_handles, temp_labels = ax.get_legend_handles_labels()
        price_handles, price_labels = ax2.get_legend_handles_labels()
        ax2.legend(temp_handles + price_handles, temp_labels + price_labels, loc='upper left')

    plt.tight_layout()
    plt.show()

    # Correlation matrix; fixed -1..1 limits keep the diverging colormap centred on 0
    fig, ax = plt.subplots(figsize=(12, 8))
    cax = ax.matshow(correlation_df.astype(float).to_numpy(), cmap='coolwarm', vmin=-1, vmax=1)
    fig.colorbar(cax)
    # Pin tick positions before labelling so every row/column gets its own label
    ax.set_xticks(range(len(correlation_df.columns)))
    ax.set_xticklabels(list(correlation_df.columns), rotation=45, ha='left')
    ax.set_yticks(range(len(correlation_df.index)))
    ax.set_yticklabels(list(correlation_df.index))
    ax.set_title('Correlation Matrix')
    plt.show()

# Example run of plot_temp_price_time over calendar year 2023.
# NOTE(review): the function unpacks keys as (lat, lon, state_name); confirm that
# `data` holds that mapping at this point and not the (data_type, year) dict
# built in the first cell.
start_date, end_date = '2023-01-01', '2023-12-31'
plot_temp_price_time(filtered_result=data, energystocks=energystocks,
                     start_date=start_date, end_date=end_date)