In [1]:
# Import libraries
import os

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns  # for nicer plots

from sklearn.decomposition import PCA

import tensorflow as tf
from tensorflow import keras
from keras import metrics

sns.set(style="darkgrid")  # default style

In [2]:
# Read data
# The CSV location defaults to the original author's local path; set the
# TRAIN_MONTHLY_CSV environment variable to load the file from somewhere else
# without editing this cell.
train_csv_path = os.environ.get(
    'TRAIN_MONTHLY_CSV',
    '/Users/achen1/MIDS-W207-DeforestationAndClimate/data/train_monthly.csv',
)
# The file is parsed with a tab delimiter.
df = pd.read_csv(train_csv_path, sep='\t')
print(df)

     Unnamed: 0     date  cdsd  cldd  dp01  dp10  dp1x  dsnd  dsnw  dt00  ...  \
0             0  2001-01   0.0   0.0  10.0   5.0   0.0  28.0   6.0   0.0  ...   
1             1  2001-02   0.0   0.0  13.0  10.0   1.0  28.0  10.0   0.0  ...   
2             2  2001-03   0.0   0.0   7.0   6.0   0.0  21.0   3.0   0.0  ...   
3             3  2001-04   0.0   0.0   8.0   6.0   0.0  12.0   6.0   0.0  ...   
4             4  2001-05   0.0   0.0   1.0   1.0   0.0   0.0   0.0   0.0  ...   
..          ...      ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
162         162  2014-08  45.8   5.3   5.0   2.0   0.0   0.0   0.0   0.0  ...   
163         163  2014-09  46.7   0.9   4.0   2.0   0.0   0.0   0.0   0.0  ...   
164         164  2014-10  46.7   0.0   4.0   2.0   0.0   0.0   0.0   0.0  ...   
165         165  2014-11  46.7   0.0   9.0   6.0   1.0   1.0   1.0   0.0  ...   
166         166  2014-12  46.7   0.0  10.0   9.0   1.0  14.0   4.0   0.0  ...   

     tc_loss_ha_Uruguay  tc

In [3]:
# Boolean mask of missing cells, computed once and reused below.
nan_mask = df.isna()

# Number of NaN cells in each row (one count per row index).
nan_counts = nan_mask.sum(axis=1)
print(nan_counts)

# Columns in which every single value is NaN (not merely "some NaN").
all_nan_columns = df.columns[nan_mask.all()]
print(all_nan_columns)

0      6
1      6
2      6
3      6
4      6
      ..
162    6
163    6
164    6
165    6
166    6
Length: 167, dtype: int64
Index(['dyfg', 'dyts'], dtype='object')


In [4]:
# Show only the columns whose name contains the substring 'average'.
average_columns = [name for name in df.columns if 'average' in name]
print(df[average_columns])

     carbon_average  tc_loss_ha_average
0      2.056476e+07        56587.483051
1      2.056476e+07        56587.483051
2      2.056476e+07        56587.483051
3      2.056476e+07        56587.483051
4      2.056476e+07        56587.483051
..              ...                 ...
162    4.359131e+07       100562.338983
163    4.359131e+07       100562.338983
164    4.359131e+07       100562.338983
165    4.359131e+07       100562.338983
166    4.359131e+07       100562.338983

[167 rows x 2 columns]


In [5]:
# Rich preview of the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,cdsd,cldd,dp01,dp10,dp1x,dsnd,dsnw,dt00,...,tc_loss_ha_Uruguay,tc_loss_ha_Uzbekistan,tc_loss_ha_Vanuatu,tc_loss_ha_Venezuela,tc_loss_ha_Vietnam,"tc_loss_ha_Virgin Islands, U.S.",tc_loss_ha_Zambia,tc_loss_ha_Zimbabwe,tc_loss_ha_Åland,tc_loss_ha_average
0,0,2001-01,0.0,0.0,10.0,5.0,0.0,28.0,6.0,0.0,...,7647,295,124,123881,47433,123,30124,8206,397,56587.483051
1,1,2001-02,0.0,0.0,13.0,10.0,1.0,28.0,10.0,0.0,...,7647,295,124,123881,47433,123,30124,8206,397,56587.483051
2,2,2001-03,0.0,0.0,7.0,6.0,0.0,21.0,3.0,0.0,...,7647,295,124,123881,47433,123,30124,8206,397,56587.483051
3,3,2001-04,0.0,0.0,8.0,6.0,0.0,12.0,6.0,0.0,...,7647,295,124,123881,47433,123,30124,8206,397,56587.483051
4,4,2001-05,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,7647,295,124,123881,47433,123,30124,8206,397,56587.483051


In [14]:
# Feature selection
tf.random.set_seed(1234)
np.random.seed(1234)

# Columns kept for modelling: the month label, the snow target and the two
# averaged series.
feature_columns = ['date', 'snow', 'carbon_average', 'tc_loss_ha_average']

print(df.loc[:, feature_columns][:5])

# Take an explicit copy so that later cells can add columns to `df` without
# pandas raising SettingWithCopyWarning on a column-subset selection.
df = df[feature_columns].copy()

# `df2` was printed here but never defined anywhere in the notebook, so the
# cell failed with NameError on a fresh kernel. It is rebuilt as the
# date-indexed view of the selected columns that the recorded output shows.
# NOTE(review): reconstructed from the saved output -- confirm this matches the
# original definition of `df2`.
df2 = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
print(df2.head())


      date    snow  carbon_average  tc_loss_ha_average
0  2001-01   661.0    2.056476e+07        56587.483051
1  2001-02  1589.0    2.056476e+07        56587.483051
2  2001-03   178.0    2.056476e+07        56587.483051
3  2001-04   580.0    2.056476e+07        56587.483051
4  2001-05     0.0    2.056476e+07        56587.483051
              snow  carbon_average  tc_loss_ha_average
date                                                  
2001-01-01   661.0    2.056476e+07        56587.483051
2001-02-01  1589.0    2.056476e+07        56587.483051
2001-03-01   178.0    2.056476e+07        56587.483051
2001-04-01   580.0    2.056476e+07        56587.483051
2001-05-01     0.0    2.056476e+07        56587.483051


In [15]:
# Additional imports: scikit-learn helpers for splitting, linear regression and
# the MSE metric, plus the statsmodels ARIMA model class.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA

# Seed TensorFlow's and NumPy's global random generators with the same fixed value.
tf.random.set_seed(1234)
np.random.seed(1234)



In [16]:
tf.random.set_seed(1234)
np.random.seed(1234)

# Parse the month label into a timestamp and derive calendar features.
# `assign` builds a new frame instead of writing columns into `df` in place:
# `df` comes from a column-subset selection, and in-place column assignment on
# such a frame triggers pandas' SettingWithCopyWarning. The resulting columns
# and their order are the same as with in-place assignment.
df = df.assign(date=pd.to_datetime(df['date']))
df = df.assign(year=df['date'].dt.year, month=df['date'].dt.month)

# Predictors: calendar position plus the two averaged series; target: snow.
X = df[['year', 'month', 'carbon_average', 'tc_loss_ha_average']]
y = df['snow']


In [17]:
tf.random.set_seed(1234)
np.random.seed(1234)

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
# NOTE(review): the rows are monthly observations and this split is random
# rather than chronological -- confirm that is intended.
split_parts = train_test_split(X, y, test_size=0.2, random_state=1234)
X_train, X_test, y_train, y_test = split_parts


In [20]:
tf.random.set_seed(1234)
np.random.seed(1234)

# Fit ordinary least squares on the training rows (`fit` returns the estimator
# itself, so `model` is the fitted LinearRegression instance).
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean Squared Error:', mse)



Mean Squared Error: 348138.3478853962


In [21]:
from sklearn.metrics import r2_score

# Coefficient of determination of the held-out predictions from the fitted model.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print('R-squared:', r_squared)

R-squared: 0.0547199334136127
