In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Prepare our data

In [None]:
# Location of the competition's training table, relative to the notebook.
train_path = "../input/tabular-playground-series-aug-2021/train.csv"

# Read the CSV and use its "id" column as the DataFrame index.
train = pd.read_csv(filepath_or_buffer=train_path, index_col="id")

In [None]:
# Display pandas' built-in summary of the training frame.
train.info()

In [None]:
# Separate the "loss" target from the remaining predictor columns.
y = train["loss"]
X = train.drop(columns="loss")

# Keep the predictor column names as a plain Python list.
feature_names = X.columns.tolist()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictors. The scaler is always fitted on the untouched
# columns of `train` rather than on the current value of `X`: this cell
# rebinds `X` to the scaled output, so fitting on `X` would make a second
# run of the cell refit the scaler on already-standardized data, and the
# later `standizer.transform(test)` call would then leave the test features
# effectively unscaled. Fitting from `train` keeps the cell idempotent.
standizer = StandardScaler()
X = standizer.fit_transform(train[feature_names])

In [None]:
# Count the distinct values in each integer-typed column of the training frame.
train.select_dtypes("int").nunique()

# Fit a Poisson Regression model to the data

In [None]:
import statsmodels.api as sm

# Build the design matrix for the full-data fit. `X` comes out of the scaler
# without column labels, so wrap it in a DataFrame that carries the original
# feature names (and y's index); the fitted coefficients are then reported
# under their real column names instead of generic positional labels.
exog = sm.add_constant(pd.DataFrame(X, columns=feature_names, index=y.index))

# Poisson GLM of the loss target on the intercept plus standardized features.
poission_model = sm.GLM(y, exog, family=sm.families.Poisson())
result = poission_model.fit()

In [None]:
# Display the summary table of the fitted model currently held in `result`.
result.summary()

# Make predictions on the test set

In [None]:
# Location of the competition's test table, relative to the notebook.
test_path = "../input/tabular-playground-series-aug-2021/test.csv"

# Read the CSV with its "id" column as the index, then preview the first rows.
test = pd.read_csv(filepath_or_buffer=test_path, index_col="id")
test.head()

In [None]:
# Show the (rows, columns) dimensions of the test frame.
test.shape

In [None]:
from sklearn.model_selection import KFold

# Scale the test features with the scaler fitted on the training data and add
# the intercept column, so the matrix matches the per-fold design matrices.
X_test = sm.add_constant(standizer.transform(test))

# One vector of test-set predictions per fold; averaged in a later cell.
prediction_test = []

# Shuffled K-fold split (KFold's default number of folds), seeded for
# reproducibility. Only the training part of each split is used here.
kfold = KFold(shuffle=True, random_state=42)
for train_idx, _valid_idx in kfold.split(X, y):

    features = sm.add_constant(X[train_idx])

    # KFold yields positional row numbers, while `y` is a Series labelled by
    # the "id" column. Plain `y[train_idx]` would look the numbers up as
    # labels, which is only right if the ids happen to be 0..n-1 in order;
    # `.iloc` selects by position unconditionally.
    model = sm.GLM(
        y.iloc[train_idx],
        features,
        family=sm.families.Poisson(),
    )

    # Use a fold-specific name so the full-data fit stored in `result` is not
    # overwritten by the last fold's model.
    fold_result = model.fit()

    prediction_test.append(fold_result.predict(X_test))

In [None]:
# Put each fold's test predictions side by side (one column per fold), then
# average across the columns to get a single prediction per test row.
fold_matrix = np.column_stack(prediction_test)
preds = fold_matrix.mean(axis=1)

In [None]:
# Start from the provided submission template and replace its "loss" column
# with the averaged predictions.
# NOTE(review): this assigns by position, so it assumes the template rows are
# in the same order as test.csv — confirm.
sub = pd.read_csv(
    "../input/tabular-playground-series-aug-2021/sample_submission.csv"
).assign(loss=preds)

# Write the submission file without the DataFrame's row index.
sub.to_csv("submission.csv", index=False)