In [1]:
# Enable the %%R cell magic (rpy2) used by the R cells in this notebook.
%load_ext rpy2.ipython
# Auto-reload imported modules before each cell runs (mode 2 = all modules).
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
from matplotlib import rcParams
# Default figure size in inches (width, height).
# NOTE(review): a height of 100 is unusually tall — confirm it is intended.
rcParams['figure.figsize'] = (16, 100)

import altair as alt


import warnings
from rpy2.rinterface import RRuntimeWarning
# Silence every warning category for the rest of the session; the commented
# line below is the narrower alternative that only hides rpy2 runtime warnings.
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Keep text as text in SVG output instead of converting glyphs to paths.
plt.rcParams['svg.fonttype'] = 'none'


In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# Load the tidyverse (dplyr, tidyr, lubridate, ...) for the R cells below.
# library() stops with an error when the package is not installed, whereas
# require() only warns and returns FALSE, which would let later cells fail
# with unrelated "could not find function" errors.

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Loading required package: tidyverse


In [4]:
# Load the merged dataset. Line 0 of the file holds only the literal title
# 'merged_data', so it is skipped and the following line is used as the header.
data = pd.read_csv(
    'merged_data.csv',
    skiprows=[0],
)

# Preview the first rows to confirm the columns were parsed as expected.
data.head(n=5)

Unnamed: 0,Month,2023_temp,2022_temp,2021_temp,2020_temp,2023_humid,2022_humid,2021_humid,2020_humid,date,class,focuses,uf,2020_rain,2021_rain,2022_rain,2023_rain
0,4.0,26.4,25.7,26.0,26.2,87.6,90.3,89.1,88.2,2023/04,Desmatamento Consolidado,8.0,AMAZONAS,974.8,1115.5,1240.2,349.6
1,4.0,26.4,25.7,26.0,26.2,87.6,90.3,89.1,88.2,2023/04,Desmatamento Consolidado,9.0,MARANHÃO,974.8,1115.5,1240.2,349.6
2,4.0,26.4,25.7,26.0,26.2,87.6,90.3,89.1,88.2,2023/04,Desmatamento Consolidado,271.0,MATO GROSSO,974.8,1115.5,1240.2,349.6
3,4.0,26.4,25.7,26.0,26.2,87.6,90.3,89.1,88.2,2023/04,Desmatamento Consolidado,37.0,PARÁ,974.8,1115.5,1240.2,349.6
4,4.0,26.4,25.7,26.0,26.2,87.6,90.3,89.1,88.2,2023/04,Desmatamento Consolidado,15.0,RONDÔNIA,974.8,1115.5,1240.2,349.6


In [7]:
%%R -i data

# Regress fire-focus counts ('focuses') on calendar year and month.
#
# `-i data` copies the pandas DataFrame `data` from the Python kernel into this
# R session. Without it the cell only works if some earlier cell (not present
# in the notebook) already pushed `data` into R, so Restart & Run All fails.

# Derive calendar month and year from the 'YYYY/MM' strings in `date`;
# truncated = 2 lets lubridate's ymd() parse values with the day missing.
data <- mutate(data, month = month(ymd(date, truncated = 2)), year = year(ymd(date, truncated = 2)))

# Reshape every column except the identifiers below into long (variable, value) pairs
data_long <- gather(data, key = "variable", value = "value", -focuses, -uf, -class, -date, -month, -year)

# Split `variable` on "_" and keep only the first piece.
# NOTE(review): for columns such as `2023_temp` the first piece is the year
# prefix ("2023"), not the measure name, so `measure` actually holds year
# labels. The regression below does not use these columns, so its results are
# unaffected, but confirm the intent before relying on data_wide's reshaped
# columns.
data_long <- separate(data_long, variable, into = c("measure"), sep = "_", extra = "drop")

# Back to wide format; rows sharing the same identifiers are collapsed by
# averaging their values
data_wide <- pivot_wider(data_long, names_from = measure, values_from = value, values_fn = list(value = mean))

# Treat year and month as categorical predictors (one coefficient per level)
data_wide$year <- as.factor(data_wide$year)
data_wide$month <- as.factor(data_wide$month)

# Linear model: focuses explained by year and month effects
model <- lm(focuses ~ year + month, data = data_wide)
summary(model)



Call:
lm(formula = focuses ~ year + month, data = data_wide)

Residuals:
   Min     1Q Median     3Q    Max 
-709.5 -174.0  -52.3   11.6 7092.5 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -16.271     66.120  -0.246  0.80564    
year2020      53.260     52.549   1.014  0.31093    
year2021      -4.623     52.835  -0.087  0.93028    
year2022      80.621     52.496   1.536  0.12475    
year2023      35.780     52.282   0.684  0.49382    
year2024      90.182     82.149   1.098  0.27243    
month2         7.634     66.823   0.114  0.90906    
month3        12.451     67.483   0.185  0.85364    
month4         1.753     75.570   0.023  0.98150    
month5        19.881     70.272   0.283  0.77727    
month6        46.849     68.653   0.682  0.49506    
month7       117.316     67.907   1.728  0.08421 .  
month8       646.176     63.259  10.215  < 2e-16 ***
month9       613.853     62.905   9.758  < 2e-16 ***
month10      319.421     62.903   5.078 4.16

In [10]:
%%R
# Regress 'focuses' on every yearly weather column (temp, humid, rain for 2020-2023).

# Column names in the order 2020_temp, 2020_humid, 2020_rain, 2021_temp, ...
weather_cols <- paste0(rep(2020:2023, each = 3), "_", c("temp", "humid", "rain"))

# Keep the response plus the weather predictors, then drop incomplete rows
data_selected <- data %>%
  select(focuses, all_of(weather_cols)) %>%
  na.omit()

# '.' in the formula expands to every remaining column of data_selected
model <- lm(focuses ~ ., data = data_selected)

# Coefficient table and fit statistics
summary(model)


Call:
lm(formula = focuses ~ ., data = data_selected)

Residuals:
   Min     1Q Median     3Q    Max 
-662.9 -180.5  -42.7   -8.1 7139.1 

Coefficients: (1 not defined because of singularities)
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -18411.289   9181.261  -2.005 0.045062 *  
`2020_temp`     669.843    325.694   2.057 0.039846 *  
`2020_humid`     40.285     53.174   0.758 0.448776    
`2020_rain`      -1.937      1.540  -1.258 0.208601    
`2021_temp`    1147.698    383.641   2.992 0.002809 ** 
`2021_humid`    174.469     50.650   3.445 0.000584 ***
`2021_rain`       1.145      0.946   1.211 0.226184    
`2022_temp`       4.954    114.117   0.043 0.965375    
`2022_humid`    -88.688     65.027  -1.364 0.172758    
`2022_rain`       1.427      1.116   1.279 0.201167    
`2023_temp`   -1087.430    729.889  -1.490 0.136418    
`2023_humid`   -144.076     80.297  -1.794 0.072915 .  
`2023_rain`          NA         NA      NA       NA    
---
Signif. codes:  0