### Import Dataset

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
# Load the two input tables from the shared data folder (paths are relative
# to the notebook's working directory).
predict_outcome = pd.read_csv(filepath_or_buffer='../data/predict_outcome.csv')
nearterm_cleaned = pd.read_csv(filepath_or_buffer='../data/nearterm_cleaned.csv')


  from pandas.core import (


In [2]:
# Preview the first five rows of the prediction table.
predict_outcome.head(n=5)

Unnamed: 0,long,lat,year,TimePeriod,RCP,scenario,DrySoilDays_Summer_whole,Evap_Summer,ExtremeShortTermDryStress_Summer_whole,FrostDays_Winter,...,PPT_Winter,PPT_Summer,T_Winter,T_Summer,Tmax_Summer,Tmin_Winter,VWC_Winter_whole,VWC_Spring_whole,VWC_Summer_whole,VWC_Fall_whole
0,-110.047,37.604,2021,NT,4.5,sc22,0.0,2.14072,35.247,69.0,...,2.67,4.48,2.131,24.55,35.95,-12.15,0.04704,0.0439,0.04214,0.09343
1,-110.047,37.604,2021,NT,4.5,sc23,0.0,2.14072,35.247,69.0,...,2.67,4.48,2.131,24.55,35.95,-12.15,0.04704,0.0439,0.04214,0.09343
2,-110.047,37.604,2021,NT,4.5,sc24,0.0,2.14072,35.247,69.0,...,2.67,4.48,2.131,24.55,35.95,-12.15,0.04704,0.0439,0.04214,0.09343
3,-110.047,37.604,2021,NT,4.5,sc25,0.0,2.14072,35.247,69.0,...,2.67,4.48,2.131,24.55,35.95,-12.15,0.04704,0.0439,0.04214,0.09343
4,-110.047,37.604,2021,NT,4.5,sc26,0.0,2.14072,35.247,69.0,...,2.67,4.48,2.131,24.55,35.95,-12.15,0.04704,0.0439,0.04214,0.09343


### Q1: Which Scenario Best Fits the Prediction?

In [3]:
# Pair every prediction row with the near-term row that shares the same
# location, year, time period, RCP and scenario. Columns present in both
# frames get the '_pred' / '_actual' suffixes.
common_columns = ['long', 'lat', 'year', 'TimePeriod', 'RCP', 'scenario']
merged_data = pd.merge(predict_outcome, nearterm_cleaned, on=common_columns, suffixes=('_pred', '_actual'))

# Variables compared between the two tables.
prediction_columns = ['DrySoilDays_Summer_whole', 'Evap_Summer', 'ExtremeShortTermDryStress_Summer_whole',
                      'FrostDays_Winter', 'PPT_Winter', 'PPT_Summer', 'T_Winter', 'T_Summer',
                      'Tmax_Summer', 'Tmin_Winter', 'VWC_Winter_whole', 'VWC_Spring_whole',
                      'VWC_Summer_whole', 'VWC_Fall_whole']

# RMSE per scenario for each variable. The squared error is computed as a
# standalone Series and grouped by the scenario column, so merged_data is not
# mutated with a scratch 'error' column on every iteration.
errors_by_scenario = {
    column: np.sqrt(
        ((merged_data[column + '_actual'] - merged_data[column + '_pred']) ** 2)
        .groupby(merged_data['scenario'])
        .mean()
    )
    for column in prediction_columns
}

# Rows: scenarios, columns: variables.
combined_errors = pd.DataFrame(errors_by_scenario)

# NOTE(review): each RMSE is in its own variable's units, so this unweighted
# mean is dominated by large-magnitude variables (e.g. FrostDays_Winter vs.
# the VWC_* columns). Consider normalising per variable before averaging.
combined_errors['mean_error'] = combined_errors.mean(axis=1)
lowest_error_scenario = combined_errors['mean_error'].idxmin()


In [4]:
# Bar chart of the mean RMSE per scenario. plotly.express is already imported
# as `px` in the notebook's import cell, so it is not re-imported here.
fig = px.bar(
    combined_errors.reset_index(),
    x='scenario',
    y='mean_error',
    title='Mean RMSE by Scenario',
    labels={'mean_error': 'Mean RMSE', 'scenario': 'Scenario'},
    text='mean_error',
)

# Show the bar labels with three decimals instead of the raw float repr.
fig.update_traces(texttemplate='%{text:.3f}')

# Mark the best (lowest mean RMSE) scenario. Bars are drawn in index order,
# so the scenario's position in the index is its position on the x axis.
fig.add_vline(
    x=combined_errors.index.get_loc(lowest_error_scenario),
    line_width=3,
    line_dash="dash",
    line_color="green",
)

fig.show()


**TODO:** Write up the interpretation of the mean-RMSE-by-scenario comparison above.

### Q2: How will temperature change over the next 50 years based on scenario 22?

In [5]:
# Historic observations, read from the shared data folder.
historic_data = pd.read_csv(filepath_or_buffer='../data/historic_cleaned.csv')

# Keep only the near-term rows that belong to scenario 'sc22'.
is_sc22 = nearterm_cleaned['scenario'].eq('sc22')
nearterm_data = nearterm_cleaned.loc[is_sc22]

# Stack historic and near-term rows into one frame with a fresh 0..n-1 index.
combined_data = pd.concat(objs=[historic_data, nearterm_data], ignore_index=True)

In [6]:
from sklearn.linear_model import LinearRegression

# Only the rows tagged 'sc1' are used to fit the trends.
# NOTE(review): the section title refers to scenario 22, but the 'sc22' rows
# concatenated into combined_data are filtered out by this selection, so they
# never reach the models. Confirm this is intended.
historical_data = combined_data[combined_data['scenario'] == 'sc1']

# Forecast window: 50 consecutive years starting at 2025 (2025..2074).
# np.arange excludes its stop value, so the stop is start + horizon.
FIRST_FORECAST_YEAR = 2025
FORECAST_HORIZON_YEARS = 50
future_years = np.arange(
    FIRST_FORECAST_YEAR, FIRST_FORECAST_YEAR + FORECAST_HORIZON_YEARS
).reshape(-1, 1)  # column vector, the shape scikit-learn expects for X

models = {}       # (long, lat) -> {'winter': model, 'summer': model}
predictions = []  # one record per (location, future year)

# Distinct coordinates; kept as a named variable for any later cell that uses it.
locations = historical_data[['long', 'lat']].drop_duplicates()

# One pass over the data: groupby hands over each location's rows directly
# instead of re-filtering the whole frame per location. sort=False keeps the
# locations in first-appearance order.
for (long, lat), location_data in historical_data.groupby(['long', 'lat'], sort=False):
    X = location_data[['year']].values
    y_winter = location_data['T_Winter'].values
    y_summer = location_data['T_Summer'].values

    # Independent linear trend (temperature ~ year) per season.
    model_winter = LinearRegression().fit(X, y_winter)
    model_summer = LinearRegression().fit(X, y_summer)

    models[(long, lat)] = {
        'winter': model_winter,
        'summer': model_summer
    }

    predicted_winter = model_winter.predict(future_years)
    predicted_summer = model_summer.predict(future_years)

    for year, winter_temp, summer_temp in zip(future_years.ravel(), predicted_winter, predicted_summer):
        predictions.append({
            'long': long,
            'lat': lat,
            'year': year,
            'T_Winter': winter_temp,
            'T_Summer': summer_temp
        })

# Build the frame once from the collected records.
predictions_df = pd.DataFrame(predictions)


In [7]:
# Take the year span for the title from the data, so the title cannot drift
# from the years that were actually predicted.
first_year = int(predictions_df['year'].min())
last_year = int(predictions_df['year'].max())

# Animated map: one frame per predicted year, points coloured by the
# predicted summer temperature.
fig = px.scatter_geo(
    predictions_df,
    lon='long',
    lat='lat',
    color='T_Summer',
    animation_frame='year',
    projection="natural earth",
    title=f'Yearly Predicted Summer Temperatures ({first_year}-{last_year})',
    range_color=[22, 24],  # fixed colour range so frames are comparable across years
    color_continuous_scale=px.colors.diverging.RdBu_r
)

fig.update_geos(
    showcountries=True,
    showcoastlines=True,
    showland=True,
    # The original set "lightgray" here and then overrode it with 'white' in
    # update_layout; the effective colour is set once instead.
    landcolor="white",
    fitbounds="locations",
    lonaxis=dict(
        showgrid=True,
        gridwidth=1,
        range=[-180, 180],
        dtick=10
    ),
    lataxis=dict(
        showgrid=True,
        gridwidth=1,
        range=[-90, 90],
        dtick=10
    )
)
fig.update_layout(template='simple_white')
fig.show()


**TODO:** Write up the interpretation of the predicted temperature map above.

### Q3: Why does the eastern part have lower temperatures than the western part?

In [None]:
# Bare-ground cover against latitude, with a marginal histogram on each axis
# and an OLS trend line. The stray `historic_data.head()` that used to open
# this cell was removed: it was not the cell's last expression, so it never
# displayed anything.
scatter_hist = px.scatter(
    historic_data,
    x='lat',
    y='Bare_percent',
    hover_name='lat',
    marginal_x='histogram',
    marginal_y='histogram',
    title='Scatter Plot with Marginal Histograms: Bare Ground Coverage Percent with Latitude',
    labels={'lat': 'Latitude', 'Bare_percent': 'Bare Ground Coverage Percent (%)'},
    trendline="ols",
    template='simple_white'
)
scatter_hist.show()

### Q4: What kind of plant helps prevent global warming most efficiently?

In [None]:
import plotly.figure_factory as ff

# Seasonal temperatures alongside the vegetation / ground-cover percentages.
# Note: this reads `historical_data`, the 'sc1' subset built in the Q2 cell.
correlation_data = historical_data[['T_Summer', 'T_Winter', 'treecanopy_percent', 'Ann_Herb_percent',
                                    'Herb_percent', 'Litter_percent', 'Shrub_percent']]

# Pairwise correlation between all selected columns.
correlation_matrix = correlation_data.corr()

fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    annotation_text=correlation_matrix.round(2).values,
    colorscale='RdBu',
    showscale=True,
    # Pin the diverging scale to the full correlation range so that zero
    # correlation sits on the neutral midpoint of the colour scale.
    zmin=-1,
    zmax=1
)

fig.update_layout(
    title='Correlation Matrix of Climate Factors and Vegetation Types',
    xaxis_title='Variables',
    yaxis_title='Variables',
    xaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.columns))), ticktext=list(correlation_matrix.columns)),
    yaxis=dict(tickmode='array', tickvals=list(range(len(correlation_matrix.index))), ticktext=list(correlation_matrix.index))
)

# Render explicitly, as the other plotting cells do.
fig.show()

**TODO:** Add a write-up interpreting the correlation matrix above.

### Q5: Will temperatures become more extreme over the next 50 years?