# Generalized Linear Model

In [1]:
# If additional packages are needed but are not installed by default, uncomment the last two lines of this cell
# and replace <package list> with a list of additional packages.
# This will ensure the notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [1]:
# Libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm

# Raise pandas' display limits (up to 101 columns, 100 characters per cell)
# so the frame previews in this notebook are not truncated.
pd.options.display.max_columns = 101
pd.options.display.max_colwidth = 100

## Data Description

Column | Description
:---|:---
`feat_0` | Feature 0
`feat_1` | Feature 1
`feat_2` | Feature 2
`feat_3` | Feature 3
`feat_4` | Feature 4
`feat_5` | Feature 5
`feat_6` | Feature 6
`label`  | Label

In [2]:
# Load the training set from train.csv and preview its first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,label
0,355.0,34.3973,15.91837,14.70646,59.15732,4.445207,50.009913,0.560099
1,40.0,17.36507,13.63636,16.08324,59.50397,5.267598,83.26087,0.782609
2,234.0,32.64324,28.83436,14.59559,60.56992,5.482922,51.019264,0.590193
3,178.0,11.90953,11.11111,14.38939,58.33411,4.165093,64.935428,0.689354
4,57.0,36.88889,43.58974,13.90568,63.15364,4.324902,16.307692,0.123077


In [3]:
# Count missing values in each column of the training frame.
data.isna().sum()

feat_0    0
feat_1    0
feat_2    0
feat_3    0
feat_4    0
feat_5    0
feat_6    0
label     0
dtype: int64

In [5]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   feat_0  284 non-null    float64
 1   feat_1  284 non-null    float64
 2   feat_2  284 non-null    float64
 3   feat_3  284 non-null    float64
 4   feat_4  284 non-null    float64
 5   feat_5  284 non-null    float64
 6   feat_6  284 non-null    float64
 7   label   284 non-null    float64
dtypes: float64(8)
memory usage: 17.9 KB


## Machine Learning

Build a GLM that can predict the label.
- **The model's performance will be evaluated on the basis of R-squared.**

In [9]:
# Predictor columns. The formula is generated from this list so the feature
# set is declared in exactly one place and the two cannot drift apart.
col_names = ['feat_0', 'feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6']
# Resulting formula: 'label ~ feat_0 + feat_1 + feat_2 + feat_3 + feat_4 + feat_5 + feat_6'
formula = 'label ~ ' + ' + '.join(col_names)

# Fit a GLM with a Binomial family (its default link is the logit, as the
# summary reports). The label values previewed above lie between 0 and 1,
# so the response is presumably a proportion -- NOTE(review): confirm that
# every label is within [0, 1], which the Binomial family requires.
model = smf.glm(formula=formula, data=data, family=sm.families.Binomial()).fit()

In [13]:
# Per-coefficient p-values of the fitted model (intercept and each feature).
model.pvalues

Intercept    0.276536
feat_0       0.932139
feat_1       0.580894
feat_2       0.956091
feat_3       0.907029
feat_4       0.948074
feat_5       0.852023
feat_6       0.002803
dtype: float64

In [15]:
# Full fit report: coefficients, standard errors, z-statistics, confidence
# intervals, and overall fit statistics (log-likelihood, deviance).
model.summary()

0,1,2,3
Dep. Variable:,label,No. Observations:,284.0
Model:,GLM,Df Residuals:,276.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-116.39
Date:,"Sat, 19 Jun 2021",Deviance:,2.5442
Time:,18:22:44,Pearson chi2:,2.52
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.0729,1.905,-1.088,0.277,-5.807,1.661
feat_0,6.859e-06,8.05e-05,0.085,0.932,-0.000,0.000
feat_1,-0.0062,0.011,-0.552,0.581,-0.028,0.016
feat_2,-0.0008,0.014,-0.055,0.956,-0.029,0.027
feat_3,0.0092,0.079,0.117,0.907,-0.146,0.164
feat_4,0.0019,0.029,0.065,0.948,-0.055,0.059
feat_5,0.0423,0.227,0.187,0.852,-0.403,0.487
feat_6,0.0367,0.012,2.989,0.003,0.013,0.061


> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    Submit a CSV file with a header row plus each of the test entries, each on its own line. 

The file (`submissions.csv`) should have exactly 1 column:

Column | Description
:---|:---
`label`  | Label

In [16]:
# Load the test set from test.csv and preview its first rows. Per the preview
# it carries the same feat_0..feat_6 columns as the training data, without `label`.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6
0,637.0,21.01388,11.39742,15.15302,63.10153,4.172242,65.026667
1,17.0,55.9322,0.0,12.756,48.85596,6.406862,49.263158
2,420.0,57.00249,23.48066,14.00443,52.15526,4.382061,31.913043
3,889.0,69.00829,63.43907,15.01983,63.04099,3.74833,12.981998
4,154.0,23.35644,6.617647,13.59359,49.51303,4.638098,52.98324


In [17]:
# Score the test features with the fitted GLM.
preds = model.predict(test)

In [18]:
# Wrap the predictions in a one-column frame named `label`, the layout the
# submission file requires.
submission_df = pd.DataFrame(dict(label=preds))

In [19]:
# Preview only the first rows instead of dumping the whole frame, so the
# cell output stays short regardless of the size of the test set.
submission_df.head(10)

Unnamed: 0,label
0,0.648998
1,0.467634
2,0.298052
3,0.161023
4,0.534509
5,0.516332
6,0.533015
7,0.372849
8,0.437824
9,0.365212


In [20]:
# Write the submission file. index=False keeps the row index out of the CSV,
# so the file contains a header row and exactly one column (`label`).
submission_df.to_csv('submissions.csv', index=False)