<a href="https://colab.research.google.com/github/rahul-nauni/mlis1-poject/blob/main/arctic_ice_extent_linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing libraries
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# Columns to load from the CSV. The extent header carries a leading space in the
# file, so the dtype mapping must use the same spelling: the previous 'extent' key
# matched no loaded column, which meant the float64 dtype was never actually applied.
columns = ['year', ' extent']
columns_type = {'year': 'int64', ' extent': 'float64'}

In [None]:
# Read only the columns named in `columns`, with the dtypes given in `columns_type`;
# any cell equal to -9999.00 is loaded as NaN (na_values).
# NOTE(review): the path lives under /content/drive, so this presumably needs Google
# Drive to be mounted in Colab before the cell runs — confirm.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/N_01_extent_v3.0.csv",na_values = -9999.00,usecols=columns, dtype=columns_type)

In [None]:
df.head(10)

Unnamed: 0,year,extent
0,1979,15.41
1,1980,14.86
2,1981,14.91
3,1982,15.18
4,1983,14.94
5,1984,14.47
6,1985,14.72
7,1986,14.89
8,1987,14.97
9,1988,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   year     44 non-null     int64  
 1    extent  43 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 832.0 bytes


In [None]:
# drop the stray spaces from the column names (' extent' -> 'extent')
df.columns = df.columns.str.replace(' ', '', regex=False)

In [None]:
# mean of the observed (non-missing) extent values
df['extent'].mean()

14.207441860465115

In [None]:
def impute_nan(df, decimals=2):
  """Fill missing values of every numeric column with that column's mean, in place.

  The mean is computed over the non-missing entries and rounded to `decimals`
  places before being used as the fill value.

  Args:
    df: DataFrame to modify. It is mutated in place.
    decimals: number of decimal places the fill value is rounded to (default 2).

  Returns:
    None. The result is visible through `df` itself.
  """
  for col in df.columns:
    column = df.loc[:, col]
    # A non-numeric column has no mean (np.mean would raise on it), and a column
    # without any NaN needs no rewriting, so both are left untouched.
    if not pd.api.types.is_numeric_dtype(column) or not column.isna().any():
      continue
    df.loc[:, col] = column.fillna(round(column.mean(), decimals))

In [None]:
impute_nan(df)

In [None]:
# 'year' is the predictor (independent variable); 'extent' is the target we model
X = df['year']
y = df['extent']

In [None]:
X

0     1979
1     1980
2     1981
3     1982
4     1983
5     1984
6     1985
7     1986
8     1987
9     1988
10    1989
11    1990
12    1991
13    1992
14    1993
15    1994
16    1995
17    1996
18    1997
19    1998
20    1999
21    2000
22    2001
23    2002
24    2003
25    2004
26    2005
27    2006
28    2007
29    2008
30    2009
31    2010
32    2011
33    2012
34    2013
35    2014
36    2015
37    2016
38    2017
39    2018
40    2019
41    2020
42    2021
43    2022
Name: year, dtype: int64

In [None]:
X.shape

(44,)

In [None]:
# turn X into a single-column 2-D array of shape (n_samples, 1)
X = np.asarray(X).reshape(-1, 1)

In [None]:
X

array([[1979],
       [1980],
       [1981],
       [1982],
       [1983],
       [1984],
       [1985],
       [1986],
       [1987],
       [1988],
       [1989],
       [1990],
       [1991],
       [1992],
       [1993],
       [1994],
       [1995],
       [1996],
       [1997],
       [1998],
       [1999],
       [2000],
       [2001],
       [2002],
       [2003],
       [2004],
       [2005],
       [2006],
       [2007],
       [2008],
       [2009],
       [2010],
       [2011],
       [2012],
       [2013],
       [2014],
       [2015],
       [2016],
       [2017],
       [2018],
       [2019],
       [2020],
       [2021],
       [2022]])

In [None]:
# the column of 1's is the bias term, i.e. the input multiplied by the 'y intercept'
# weight of the regression model; here it is stacked on top of X along axis 0,
# giving one long column (ones first, then the X values)
X_des_mat = np.concatenate((np.ones_like(X, dtype='int64'), X), axis=0)

In [None]:
# Final design matrix of shape (n_samples, 2): a bias column of 1's next to X.
# It is built straight from X rather than by reshaping X_des_mat onto itself,
# because that self-referential reshape silently scrambles the rows whenever
# this cell is executed a second time.
X_des_mat = np.concatenate([np.ones_like(X, dtype='int64'), X], axis=1)

In [None]:
X_des_mat

array([[   1, 1979],
       [   1, 1980],
       [   1, 1981],
       [   1, 1982],
       [   1, 1983],
       [   1, 1984],
       [   1, 1985],
       [   1, 1986],
       [   1, 1987],
       [   1, 1988],
       [   1, 1989],
       [   1, 1990],
       [   1, 1991],
       [   1, 1992],
       [   1, 1993],
       [   1, 1994],
       [   1, 1995],
       [   1, 1996],
       [   1, 1997],
       [   1, 1998],
       [   1, 1999],
       [   1, 2000],
       [   1, 2001],
       [   1, 2002],
       [   1, 2003],
       [   1, 2004],
       [   1, 2005],
       [   1, 2006],
       [   1, 2007],
       [   1, 2008],
       [   1, 2009],
       [   1, 2010],
       [   1, 2011],
       [   1, 2012],
       [   1, 2013],
       [   1, 2014],
       [   1, 2015],
       [   1, 2016],
       [   1, 2017],
       [   1, 2018],
       [   1, 2019],
       [   1, 2020],
       [   1, 2021],
       [   1, 2022]])

We'll calculate the weights for our model using the following formula (the normal equation):

$w = (X^T X)^{-1} X^T y$

**Note:** this method requires $X$ to have full column rank, i.e., the columns of $X$ must be linearly independent; otherwise $X^T X$ is singular and cannot be inverted.

In [None]:
# number of rows and number of columns of the design matrix
nrows, ncols = np.shape(X_des_mat)

In [None]:
# rank of the design matrix; compared against its column count in the full-rank check
X_des_mat_rank = np.linalg.matrix_rank(X_des_mat)

In [None]:
# The normal equation only has a unique solution when the design matrix has full
# column rank. Fail loudly here instead of leaving `weights` undefined, which would
# only surface later as a NameError in the prediction cell.
if not (nrows >= ncols == X_des_mat_rank):
  raise ValueError('design matrix does not have full column rank; X^T X is singular')

# Solve (X^T X) w = X^T y as a linear system rather than forming the explicit
# inverse: same formula, but cheaper and numerically better behaved.
weights = np.linalg.solve(
    np.matmul(X_des_mat.transpose(), X_des_mat),
    np.matmul(X_des_mat.transpose(), np.asarray(y, dtype='float64')),
)
# keep the weights as a column vector: row 0 is the intercept, row 1 the slope
weights = np.reshape(weights, (-1,1))
print(weights)

[[ 9.78126103e+01]
 [-4.17921071e-02]]


In [None]:
# years for which the fitted model should predict the ice extent
X_new = np.array([2023, 2025, 2026, 2027, 2150])

In [None]:
X_new.shape

(5,)

In [None]:
# make X_new a single-column 2-D array, matching the layout of the training X
X_new = X_new.reshape(-1, 1)

In [None]:
X_new

array([[2023],
       [2025],
       [2026],
       [2027],
       [2150]])

In [None]:
# stack a column of 1's (bias term) on top of X_new along axis 0
X_new_des_mat = np.concatenate((np.ones_like(X_new, dtype='int64'), X_new), axis=0)

In [None]:
X_new_des_mat

array([[   1],
       [   1],
       [   1],
       [   1],
       [   1],
       [2023],
       [2025],
       [2026],
       [2027],
       [2150]])

In [None]:
# Design matrix for the new inputs, shape (n_new, 2): bias column of 1's next to X_new.
# It is built straight from X_new rather than by reshaping X_new_des_mat onto itself,
# because that self-referential reshape scrambles the rows if this cell runs twice.
X_new_des_mat = np.concatenate([np.ones_like(X_new, dtype='int64'), X_new], axis=1)

In [None]:
X_new_des_mat

array([[   1, 2023],
       [   1, 2025],
       [   1, 2026],
       [   1, 2027],
       [   1, 2150]])

In [None]:
# predicted extent = design matrix times the learned weight column
predictions = X_new_des_mat @ weights

In [None]:
# show the predictions as a one-column table
print(pd.DataFrame(predictions, columns=['extent predictions']))

   extent predictions
0           13.267178
1           13.183593
2           13.141801
3           13.100009
4            7.959580


In [None]:
class LinearRegression:
  """Ordinary least-squares linear regression with an intercept term.

  The model is y ~ w0 + w1*x1 + ... + wk*xk. After `fit`, `weights` holds the
  learned parameters as a 1-D array with the intercept first, followed by one
  coefficient per feature column.
  """

  def __init__(self):
    # None until fit() has been called
    self.weights = None

  @staticmethod
  def isfullrank(X):
    """Return True when the 2-D matrix X has full column rank.

    Full column rank means there are at least as many rows as columns and the
    columns are linearly independent.

    Raises:
      ValueError: if X is not 2-dimensional.
    """
    X = np.asarray(X)
    if X.ndim != 2:
      raise ValueError('X must be a 2-D matrix')
    nrows, ncols = X.shape
    return bool(nrows >= ncols and np.linalg.matrix_rank(X) == ncols)

  @staticmethod
  def _design_matrix(X):
    """Return X as a float matrix with a leading column of 1's (the bias term).

    A 1-D input is treated as n samples of a single feature.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
      X = X.reshape(-1, 1)
    if X.ndim != 2:
      raise ValueError('X must be 1-D or 2-D')
    return np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)

  def fit(self, X, y):
    """Learn the weights from training data by least squares.

    Args:
      X: feature values, shape (n_samples,) or (n_samples, n_features).
      y: target values, one per sample.

    Returns:
      self, so calls can be chained.

    Raises:
      ValueError: if X and y disagree on the number of samples, or if the
        design matrix (X plus the bias column) lacks full column rank.
    """
    design = self._design_matrix(X)
    targets = np.asarray(y, dtype=float).reshape(-1)
    if targets.shape[0] != design.shape[0]:
      raise ValueError('X and y must contain the same number of samples')
    if not self.isfullrank(design):
      raise ValueError('design matrix does not have full column rank')
    # For a full-rank design matrix the least-squares solution equals
    # (X^T X)^{-1} X^T y, without explicitly inverting X^T X.
    self.weights, *_ = np.linalg.lstsq(design, targets, rcond=None)
    return self

  def predict(self, X):
    """Return the predicted target for every sample in X as a 1-D array.

    Raises:
      RuntimeError: if the model has not been fitted yet.
      ValueError: if X has a different number of features than the training data.
    """
    if self.weights is None:
      raise RuntimeError('call fit() before predict()')
    design = self._design_matrix(X)
    if design.shape[1] != self.weights.shape[0]:
      raise ValueError('X has a different number of features than the training data')
    return design @ self.weights

  def evaluate(self, X, y):
    """Return the mean squared error of the predictions for X against y."""
    residuals = np.asarray(y, dtype=float).reshape(-1) - self.predict(X)
    return float(np.mean(residuals ** 2))