<a href="https://colab.research.google.com/github/rfdornelles/mds_ML_project/blob/main/baseline_linear_regression_karon2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/rfdornelles/mds_ML_project/main/data/karon2.csv")

In [2]:
df.head()

Unnamed: 0,year,country,temp,population,qnt_death_heat_cold_exposure,temp_diff
0,1991,Albania,6.219891,3266790,5,0.050951
1,1992,Albania,6.28493,3247039,5,0.06504
2,1993,Albania,6.324316,3227287,5,0.039385
3,1994,Albania,6.357706,3207536,5,0.03339
4,1995,Albania,6.402805,3187784,6,0.045099


In [3]:
df.columns

Index(['year', 'country', 'temp', 'population', 'qnt_death_heat_cold_exposure',
       'temp_diff'],
      dtype='object')

In [4]:
X = df.drop(["qnt_death_heat_cold_exposure", "country"], axis = 1)
y = df["qnt_death_heat_cold_exposure"]
X.head()

Unnamed: 0,year,temp,population,temp_diff
0,1991,6.219891,3266790,0.050951
1,1992,6.28493,3247039,0.06504
2,1993,6.324316,3227287,0.039385
3,1994,6.357706,3207536,0.03339
4,1995,6.402805,3187784,0.045099


In [5]:
from sklearn.model_selection import train_test_split

# We need to set the name of the 4 objects that will be created
# As arguments we add our X and our y, it will separate it into 4 
# What are those 4 groups that we need? 

#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30)

# We can move the size of the test size
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.20)

In [6]:
# Let's build a simple classification pipeline. First import pipeline

from sklearn.pipeline import Pipeline

# Our data has different dimesions, so let's scale it. Import Scaler

from sklearn.preprocessing import StandardScaler

# Now let's chose our model 

from sklearn.linear_model import LinearRegression

# Two step pipeline. First preprocess the data with the Standard Scaler,
# then add the model, in this case a Logistic Regression 

pipe = Pipeline([('scaler', StandardScaler()),     # Step 1
                 ('model', LinearRegression())     # Step 2
                 ])

In [7]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('model', LinearRegression())])

In [8]:
pred = pipe.predict(X_test)

In [9]:
# CROSS VALIDATION

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

cv = KFold(n_splits = 5, random_state = 7, shuffle = True)

scores = cross_val_score(pipe, X_train, y_train, cv = cv, n_jobs = -1)

In [10]:
scores

array([0.18058597, 0.16564681, 0.17676849, 0.09373092, 0.16747166])

In [11]:
from sklearn.metrics import r2_score, mean_squared_error


R2 = r2_score(y_test, pred)


n = X.shape[0]
p = X.shape[1]
Adj_r2 = 1-(1-R2)*(n-1)/(n-p-1)

rmse = mean_squared_error(y_test, pred, squared = False)
mse = mean_squared_error(y_test, pred, squared = True)

print("coef:", pipe['model'].coef_)
print("intercept:", pipe['model'].intercept_)

print("R2:", R2)
print("AdjR2:", Adj_r2)
print("RMSE:", rmse)
print("MSE:", mse)




coef: [-19.21209408 -97.295012   247.33095264  38.60868589]
intercept: 182.96766743648985
R2: 0.17539829023497766
AdjR2: 0.17336097903111103
RMSE: 492.0574640058648
MSE: 242120.54788388294
