<a href="https://colab.research.google.com/github/ptsnbkht/DI-Bootcamp/blob/main/w_i2d2dailychallengepynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import pandas as pd

# Read the raw power-plant table from the CSV sitting next to the notebook.
df = pd.read_csv('gb.csv')

# Keep just the numeric fields; mean/median/std are undefined for text columns.
numerical_cols = df.select_dtypes(include='number')

# Build a 3-row table (mean, median, std) with one column per numeric field.
summary_stats = numerical_cols.aggregate(['mean', 'median', 'std'])

# Final expression of the cell, so the notebook renders it as a rich table.
summary_stats


Unnamed: 0,capacity_mw,latitude,longitude,other_fuel3,commissioning_year,year_of_capacity_data,generation_gwh_2013,generation_gwh_2014,generation_gwh_2015,generation_gwh_2016,generation_gwh_2017,generation_gwh_2018,generation_gwh_2019,estimated_generation_gwh_2013,estimated_generation_gwh_2014,estimated_generation_gwh_2015,estimated_generation_gwh_2016,estimated_generation_gwh_2017
mean,197.089265,18.6108,29.323991,,2002.516369,2016.453236,889.706032,1166.690811,1140.812168,1088.958303,1157.189555,767.414528,,325.497627,369.478814,358.410634,370.064882,896.858128
median,27.41,31.94565,23.43455,,2008.0,2017.0,112.210278,154.885556,186.818333,173.931111,214.876389,98.441528,,39.29,43.63,44.505,46.37,76.91
std,587.58131,30.49611,83.121842,,16.343837,2.088585,2410.273786,2640.639529,2647.8855,2806.296634,2870.515615,2383.010753,,1588.089135,1888.098258,1824.399369,1904.324886,2808.491807


In [2]:

# Count plants per (country, primary fuel) pair, then pivot the fuel level into
# columns so each row is a country and each column a fuel type. Combinations
# that never occur are filled with 0 instead of NaN.
country_fuel_distribution = (
    df.groupby(by=['country', 'primary_fuel'])
    .size()
    .unstack(level='primary_fuel', fill_value=0)
)

# Final expression of the cell: render the country x fuel count table.
country_fuel_distribution

primary_fuel,Biomass,Coal,Gas,Geothermal,Hydro,Nuclear,Oil,Other,Petcoke,Solar,Waste,Wave and Tidal,Wind
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AFG,0,0,1,0,6,0,0,0,0,2,0,0,0
AGO,0,0,3,0,5,0,6,0,0,0,0,0,0
ALB,0,0,0,0,7,0,0,1,0,0,0,0,0
ARG,0,9,57,0,50,3,96,2,0,7,0,0,12
ARM,0,0,3,0,4,1,0,0,0,0,0,0,0
ATA,0,0,0,0,0,0,1,0,0,0,0,0,1
AUS,25,28,134,0,73,0,42,0,0,69,50,0,65
AUT,0,0,3,0,96,0,0,0,0,0,0,0,4
AZE,0,0,8,0,5,0,1,0,0,0,0,0,0
BDI,0,0,0,0,3,0,0,0,0,1,0,0,0


In [5]:

power_output_col = 'capacity_mw'
fuel_type_col = 'primary_fuel'

# Fail fast when a required column is absent. The previous version only printed
# a message and then still evaluated the trailing `fuel_stats` expression, which
# raised NameError on a fresh kernel (or silently displayed a stale table left
# over from an earlier run).
missing_cols = [
    col for col in (power_output_col, fuel_type_col) if col not in df.columns
]
if missing_cols:
    raise KeyError(
        "Error: Required columns not found in the DataFrame."
        f" Please ensure '{power_output_col}' and '{fuel_type_col}' are present."
    )

# Per-fuel-type descriptive statistics of plant capacity. Named aggregation
# ties every output label to its statistic, so the labels cannot drift out of
# sync with the order of the aggregations. The standard deviation is NaN for a
# fuel type that has a single plant.
fuel_stats = df.groupby(fuel_type_col)[power_output_col].agg(
    **{
        'Mean Power Output (MW)': 'mean',
        'Median Power Output (MW)': 'median',
        'Standard Deviation (MW)': 'std',
        'Minimum Power Output (MW)': 'min',
        'Maximum Power Output (MW)': 'max',
    }
)

# Print the heading, then leave the table as the cell's final expression so the
# notebook renders it richly.
print("\nStatistical Analysis of Power Output by Fuel Type:")
fuel_stats


Statistical Analysis of Power Output by Fuel Type:


Unnamed: 0_level_0,Mean Power Output (MW),Median Power Output (MW),Standard Deviation (MW),Minimum Power Output (MW),Maximum Power Output (MW)
primary_fuel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Biomass,29.460474,14.0,40.003234,1.0,330.0
Coal,937.969507,660.0,895.698283,1.2,7000.0
Gas,279.497023,100.0,407.988343,1.0,2754.0
Geothermal,40.940909,25.1,43.462069,1.0,115.0
Hydro,181.114456,20.0,747.274417,1.0,22500.0
Nuclear,2082.3375,2000.0,1301.908631,20.0,4183.0
Oil,36.058766,3.38,122.131593,1.0,2400.0
Other,186.926,99.25,268.71016,6.0,845.26
Petcoke,62.9775,62.9775,,62.9775,62.9775
Solar,32.269609,20.0,63.40189,1.0,1000.0


In [6]:

from scipy import stats
import numpy as np

def _report_significance(p_value, groups_description, alpha=0.05):
    """Print the verdict of a significance test on mean power output.

    Args:
        p_value: p-value returned by the statistical test.
        groups_description: Text naming the compared groups; it is inserted
            after "between" in the conclusion sentence.
        alpha: Significance level the p-value is compared against.
    """
    if np.isnan(p_value):
        # NaN compares False against alpha, which would otherwise be reported
        # as "fail to reject" even though no valid test result exists.
        print("The p-value could not be computed (NaN); no conclusion can be drawn.")
        return

    if p_value < alpha:
        print(
            f"The p-value is less than alpha ({alpha})."
            " Reject the null hypothesis."
        )
        print(
            "Conclusion: There is a statistically significant difference in mean"
            f" power output between {groups_description}."
        )
    else:
        print(
            f"The p-value is greater than alpha ({alpha}). Fail to reject the"
            " null hypothesis."
        )
        print(
            "Conclusion: There is no statistically significant difference in mean"
            f" power output between {groups_description}."
        )


# Keep only rows where both the fuel type and the power output are known.
# Counting fuel types on the raw column let a NaN label through `unique()`,
# which produced an empty sample and a NaN test statistic.
known_output = df[[fuel_type_col, power_output_col]].dropna()

# sort=False keeps the fuel types in order of first appearance, so the sign of
# the t-statistic and the wording of the two-group message stay stable.
samples = list(known_output.groupby(fuel_type_col, sort=False)[power_output_col])
fuel_types = [fuel for fuel, _ in samples]
data_by_fuel = [values for _, values in samples]

if len(fuel_types) < 2:
    print("Not enough fuel types to perform hypothesis testing.")
elif len(fuel_types) == 2:
    # Exactly two groups: Welch's t-test (no equal-variance assumption).
    t_statistic, p_value = stats.ttest_ind(
        data_by_fuel[0], data_by_fuel[1], equal_var=False
    )

    print(
        f"\nIndependent Samples t-test for Mean Power Output between"
        f" {fuel_types[0]} and {fuel_types[1]}:"
    )
    print(f"T-statistic: {t_statistic}")
    print(f"P-value: {p_value}")

    _report_significance(p_value, f"{fuel_types[0]} and {fuel_types[1]}")
else:
    # More than two groups: one-way ANOVA across all fuel types.
    f_statistic, p_value = stats.f_oneway(*data_by_fuel)

    print("\nANOVA Test for Mean Power Output across Fuel Types:")
    print(f"F-statistic: {f_statistic}")
    print(f"P-value: {p_value}")

    _report_significance(p_value, "different fuel types")



ANOVA Test for Mean Power Output across Fuel Types:
F-statistic: 274.89413760194014
P-value: 0.0
The p-value is less than alpha (0.05). Reject the null hypothesis.
Conclusion: There is a statistically significant difference in mean power output between different fuel types.


In [8]:

!pip install folium

import folium

from html import escape

from folium.plugins import FastMarkerCluster

# Drop rows with missing latitude or longitude; they cannot be placed on a map.
df_geo = df.dropna(subset=['latitude', 'longitude']).copy()

# Create a base map centered around the mean coordinates of the plotted plants.
mean_lat = df_geo['latitude'].mean()
mean_lon = df_geo['longitude'].mean()
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=4)

# One [lat, lon, popup_html, tooltip_html] record per plant. The previous
# version created a separate folium.Marker for every row via iterrows(), which
# was too slow to finish (the recorded run ended in KeyboardInterrupt). Text
# taken from the dataset is HTML-escaped because it is embedded in markup.
marker_rows = [
    [
        float(lat),
        float(lon),
        f"{escape(str(name))}<br>Fuel Type: {escape(str(fuel))}<br>Capacity: {capacity} MW",
        escape(str(name)),
    ]
    for lat, lon, name, fuel, capacity in df_geo[
        ['latitude', 'longitude', 'name', 'primary_fuel', 'capacity_mw']
    ].itertuples(index=False, name=None)
]

# JavaScript run in the browser for each record: builds the Leaflet marker and
# attaches the popup and tooltip prepared above.
marker_callback = """
function (row) {
    var marker = L.marker(new L.LatLng(row[0], row[1]));
    marker.bindPopup(row[2]);
    marker.bindTooltip(row[3]);
    return marker;
}
"""

# Ship all records to the browser as a single data array and cluster nearby
# markers there, instead of emitting one Python-side Marker object per plant.
FastMarkerCluster(data=marker_rows, callback=marker_callback).add_to(m)

# Display the map
m



KeyboardInterrupt: 