
## Multiple linear regression with Statsmodels

### GLM modelling using statsmodels package:

```
import statsmodels.api as sm
import pandas as pd
import numpy as np

# Synthetic data set: two normally distributed predictors and a Poisson count
# response. The seed is fixed so the draws are reproducible.
np.random.seed(0)
x1_draws = np.random.normal(size=100)
x2_draws = np.random.normal(size=100)
count_draws = np.random.poisson(lam=2, size=100)
data = pd.DataFrame({'x1': x1_draws, 'x2': x2_draws, 'y': count_draws})

# An explicit column of ones serves as the intercept term of the design matrix
data['const'] = 1
design_columns = ['const', 'x1', 'x2']

# Poisson-family GLM: response 'y' regressed on the intercept, x1 and x2
model = sm.GLM(
    endog=data['y'],
    exog=data[design_columns],
    family=sm.families.Poisson(),
)

# Estimate the coefficients and report the fit
results = model.fit()
print(results.summary())


```

### Time series modelling
The statsmodels package has good support for both GLM and time series modelling.

```
import pandas as pd
import matplotlib.pyplot as plt

# Read the monthly airline-passenger CSV, using the parsed 'Month' column as index
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv'
data = pd.read_csv(url, parse_dates=['Month'], index_col='Month')

# Draw the raw series on a single 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data)
ax.set_title('Monthly International Airline Passengers')
ax.set_xlabel('Date')
ax.set_ylabel('Passengers')
plt.show()


```

```
from statsmodels.tsa.arima.model import ARIMA

# ARMA(2, 2) expressed as ARIMA(p=2, d=0, q=2): d=0 means no differencing
arma_order = (2, 0, 2)
model = ARIMA(endog=data, order=arma_order)
model_fit = model.fit()

# Report the estimated coefficients and fit statistics
print(model_fit.summary())


```
The order parameter is set to (2, 0, 2), which means:

- p=2: The model includes the last two lagged values of the time series.
- d=0: No differencing is applied, i.e. the model does not use the first difference of the time series to make it stationary (this is what makes it an ARMA model).
- q=2: The model includes the last two lagged forecast errors.

```
# Project the fitted model 12 steps (months) past the end of the sample
forecast = model_fit.forecast(steps=12)

# Overlay the forecast (red) on the observed series
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data, label='Original')
ax.plot(forecast, label='Forecast', color='red')
ax.set_title('ARMA Model Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Passengers')
ax.legend()
plt.show()




```



### Mixed effects modelling
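A linear mixed-effects model has the form $y = X\beta + Zu + \varepsilon$, where $X\beta$ are the fixed effects, $Zu$ are the random effects and $\varepsilon$ is the residual error.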
```
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd

# Sample data: four observations in each of three groups.
# The response is roughly 2*x plus a group-specific offset plus a little noise.
# A response that is an exact linear function of x leaves zero residual
# variance, which makes the mixed-model likelihood degenerate, so the toy data
# deliberately contains both between-group and within-group variation.
data = pd.DataFrame({
    'group': [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
    'x': [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
    'y': [2.7, 4.3, 6.8, 8.4, 1.2, 2.9, 5.1, 6.8, 3.4, 5.8, 7.3, 9.6],
})

# Random-intercept model: fixed effect for x, one random intercept per group
model = smf.mixedlm("y ~ x", data, groups=data["group"])
result = model.fit()
print(result.summary())


```

### Logistic regression for image discrimination

```
import cv2
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read every image in a folder and turn it into a flat grayscale feature vector
def load_images_from_folder(folder, label):
    """Load all readable images in *folder* as flattened 64x64 grayscale arrays.

    Args:
        folder: Directory whose entries are passed to ``cv2.imread``.
        label: Value recorded once for every image that was loaded.

    Returns:
        A tuple ``(images, labels)``: a list of flattened arrays and a list
        of the same length holding *label* for each of them. Entries for
        which ``cv2.imread`` returns ``None`` are skipped.
    """
    vectors = []
    for entry in os.listdir(folder):
        picture = cv2.imread(os.path.join(folder, entry))
        if picture is None:
            # Not readable as an image: leave it out
            continue
        thumbnail = cv2.resize(
            cv2.cvtColor(picture, cv2.COLOR_BGR2GRAY),
            (64, 64),
        )
        vectors.append(thumbnail.flatten())
    # One label per image that was actually loaded
    return vectors, [label] * len(vectors)

# Class 0 = cats, class 1 = dogs; each call returns (feature vectors, labels)
cat_images, cat_labels = load_images_from_folder('path_to_cat_images', 0)
dog_images, dog_labels = load_images_from_folder('path_to_dog_images', 1)

# Stack both classes into one feature matrix and one label vector
all_images = cat_images + dog_images
all_labels = cat_labels + dog_labels
X = np.array(all_images)
y = np.array(all_labels)

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the classifier on the training portion
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out portion
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')




```


### Reduced Major Axis (RMA) regression

Reduced Major Axis (RMA) regression, also known as Geometric Mean Regression,
is used when both Y and X are subject to variation (i.e. the X values are not known exactly).
It is not directly available in scikit-learn. However, you can implement it using
the numpy and scipy libraries. Here's an example:
```
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Synthetic data: X uniform on [0, 10), y = 2.5 * X plus Gaussian noise
np.random.seed(0)
X = np.random.rand(100) * 10
y = 2.5 * X + np.random.randn(100) * 2

# Location and spread of both variables
X_mean, y_mean = X.mean(), y.mean()
X_std, y_std = X.std(), y.std()

# Pearson correlation between X and y; only its sign is used below
r = np.corrcoef(X, y)[0, 1]

# RMA slope is the ratio of standard deviations, signed like the correlation;
# the intercept makes the line pass through the point of means
slope = np.sign(r) * (y_std / X_std)
intercept = y_mean - slope * X_mean

# Fitted values on the RMA line
y_pred = intercept + slope * X

# Scatter of the observations with the fitted RMA line on top
fig, ax = plt.subplots()
ax.scatter(X, y, color='blue', alpha=0.5, label='Data')
ax.plot(X, y_pred, color='red', label='RMA Regression Line')
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.legend()
plt.show()
```

### Reduced Major Axis (RMA) regression with multiple predictor variables:

```
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Synthetic data: two uniform predictors and a noisy linear response
np.random.seed(0)
X1 = np.random.rand(100) * 10
X2 = np.random.rand(100) * 5
y = 2.5 * X1 + 1.5 * X2 + np.random.randn(100) * 2

# Design matrix with one column per predictor, shape (100, 2)
X = np.array([X1, X2]).T

# Column-wise location and spread of the predictors, and of the response
X_mean, X_std = X.mean(axis=0), X.std(axis=0)
y_mean, y_std = y.mean(), y.std()

# Correlation of each predictor with y: last column of the full correlation
# matrix, without the y-vs-y entry
full_corr = np.corrcoef(X.T, y)
R = full_corr[:-1, -1]

# One RMA slope per predictor: sd(y) / sd(x_j), signed like corr(x_j, y).
# NOTE(review): each slope is computed independently from that predictor's own
# standard deviation; confirm this is the intended multi-predictor RMA.
slopes = np.sign(R) * (y_std / X_std)

# Intercept places the fitted plane through the point of means
intercept = y_mean - slopes.dot(X_mean)

# Fitted values
y_pred = X.dot(slopes) + intercept

# 3D view: observations as points, fitted values as a triangulated surface
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(X1, X2, y, color='blue', alpha=0.5, label='Data')
ax.plot_trisurf(X1, X2, y_pred, color='red', alpha=0.5, label='RMA Regression Plane')
for set_axis_label, axis_name in (
    (ax.set_xlabel, 'X1'),
    (ax.set_ylabel, 'X2'),
    (ax.set_zlabel, 'y'),
):
    set_axis_label(axis_name)
plt.legend()
plt.show()
```


### Best subsets regression

```
pip install abess
```
Courtesy of Copilot:
```
import numpy as np
import pandas as pd
from abess.linear import LinearRegression

# Synthetic regression problem: 10 candidate features, of which only
# columns 0, 1 and 5 have non-zero true coefficients
np.random.seed(0)
X = np.random.randn(100, 10)
beta = np.zeros(10)
beta[[0, 1, 5]] = 1.5, -2, 3
noise = 0.5 * np.random.randn(100)
y = X @ beta + noise

# Best-subset fit; support_size is the number of non-zero coefficients
model = LinearRegression(support_size=3)
model.fit(X, y)

# Boolean mask of the features kept by the model, and the fitted coefficients
selected_features = model.coef_ != 0
print("Selected features:", selected_features)
print("Coefficients:", model.coef_)


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Evaluate the best-subset model fitted above on the data it was trained on.
# This is an in-sample fit; substitute a held-out X / y pair to estimate
# out-of-sample error.
y_true = y
y_pred = model.predict(X)

# Calculate metrics (RMSE is the square root of MSE, in the units of y)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R^2):", r2)


```


### Recording scaling factors of variables for later use in prediction

Scikit-learn example of saving and reusing the scale factors used to normalise variables:

```
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import joblib
import numpy as np

# --- Training time -----------------------------------------------------------

# Simulated training set: 100 samples with 3 features each
X_train = np.random.rand(100, 3)
y_train = np.random.rand(100)

# Learn the scaling parameters on the training data and apply them
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Persist the fitted scaler so the same scaling can be re-applied later
joblib.dump(scaler, 'scaler.pkl')

# Fit the regression on the scaled features and persist it as well
model = LinearRegression().fit(X_train_scaled, y_train)
joblib.dump(model, 'model.pkl')

# --- Prediction time ---------------------------------------------------------

# Restore the fitted scaler and model from disk
scaler = joblib.load('scaler.pkl')
model = joblib.load('model.pkl')

# Simulated new observations with the same 3 features
X_new = np.random.rand(5, 3)

# Re-use the stored scaling (transform only, no refit) before predicting
X_new_scaled = scaler.transform(X_new)
predictions = model.predict(X_new_scaled)
print(predictions)


```

### Other scalers in scikit-learn:

```
from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler, QuantileTransformer, PowerTransformer

import numpy as np

# Small sample matrix (4 samples x 2 features, with one outlier in column 0).
# The attributes printed below are set by fit(), so each scaler is fitted
# before they are read.
X_sample = np.array([
    [1.0, -2.0],
    [2.0, 0.0],
    [3.0, 4.0],
    [100.0, 6.0],
])

# Robust to outliers (median and interquartile range).
scaler = RobustScaler().fit(X_sample)
# Print the center (median) and scale (IQR)
print("Center (Median):", scaler.center_)
print("Scale (IQR):", scaler.scale_)

# Scales features to a specified range, typically [0, 1]
scaler = MinMaxScaler().fit(X_sample)
# Print the data min and data max
print("Data Min:", scaler.data_min_)
print("Data Max:", scaler.data_max_)

# Scales each feature by its maximum absolute value, preserving the sign
scaler = MaxAbsScaler().fit(X_sample)
print("Max Absolute Values:", scaler.max_abs_)

# Transforms features to follow a uniform or normal distribution.
scaler = QuantileTransformer(output_distribution='normal')

# Applies a power transformation to make data more Gaussian-like
scaler = PowerTransformer(method='yeo-johnson')


# Scale along samples (rows)
from sklearn.preprocessing import Normalizer
# Each sample has unit norm (e.g., L2 norm).
scaler = Normalizer()


```
