## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE


import statsmodels.api as sm

## Load Data

In [2]:
train = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv')

## Data Exploration

In [3]:
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [4]:
train.describe()

Unnamed: 0,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
count,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0,90615.0
mean,45307.0,0.517098,0.401679,0.135464,0.789035,0.340778,0.169422,0.225898,9.696794
std,26158.441658,0.118217,0.098026,0.038008,0.457671,0.204428,0.100909,0.130203,3.176221
min,0.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,22653.5,0.445,0.345,0.11,0.419,0.1775,0.0865,0.12,8.0
50%,45307.0,0.545,0.425,0.14,0.7995,0.33,0.166,0.225,9.0
75%,67960.5,0.6,0.47,0.16,1.0675,0.463,0.2325,0.305,11.0
max,90614.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB


In [6]:
train = pd.get_dummies(train, columns=['Sex'], prefix='Sex', drop_first=True)
test = pd.get_dummies(test, columns=['Sex'], prefix='Sex', drop_first=True)

In [7]:
train[['Sex_M', 'Sex_I']] = train[['Sex_M', 'Sex_I']].astype(int)
test[['Sex_M', 'Sex_I']] = test[['Sex_M', 'Sex_I']].astype(int)

In [8]:
columns = ['Length', 'Diameter', 'Height', 'Whole weight', 'Sex_M', 'Sex_I']
y = train['Rings']
#X = train.drop(['id', 'Rings'], axis=1)
X = train[columns]
X_tests = test[columns]
#X_tests = test.drop(['id', ], axis=1)
X = sm.add_constant(X)
X_tests = sm.add_constant(X_tests)

In [9]:
X

Unnamed: 0,const,Length,Diameter,Height,Whole weight,Sex_M,Sex_I
0,1.0,0.550,0.430,0.150,0.7715,0,0
1,1.0,0.630,0.490,0.145,1.1300,0,0
2,1.0,0.160,0.110,0.025,0.0210,0,1
3,1.0,0.595,0.475,0.150,0.9145,1,0
4,1.0,0.555,0.425,0.130,0.7820,0,1
...,...,...,...,...,...,...,...
90610,1.0,0.335,0.235,0.075,0.1585,1,0
90611,1.0,0.555,0.425,0.150,0.8790,1,0
90612,1.0,0.435,0.330,0.095,0.3215,0,1
90613,1.0,0.345,0.270,0.075,0.2000,0,1


## Split Data Set

In [10]:
X_train, X_test_test, y_train, y_test_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Train Model

In [11]:
X

Unnamed: 0,const,Length,Diameter,Height,Whole weight,Sex_M,Sex_I
0,1.0,0.550,0.430,0.150,0.7715,0,0
1,1.0,0.630,0.490,0.145,1.1300,0,0
2,1.0,0.160,0.110,0.025,0.0210,0,1
3,1.0,0.595,0.475,0.150,0.9145,1,0
4,1.0,0.555,0.425,0.130,0.7820,0,1
...,...,...,...,...,...,...,...
90610,1.0,0.335,0.235,0.075,0.1585,1,0
90611,1.0,0.555,0.425,0.150,0.8790,1,0
90612,1.0,0.435,0.330,0.095,0.3215,0,1
90613,1.0,0.345,0.270,0.075,0.2000,0,1


In [12]:
# Fit the multiple linear regression model
model = sm.OLS(y_train, X_train).fit()

In [13]:
# Display the summary of the regression model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Rings   R-squared:                       0.461
Model:                            OLS   Adj. R-squared:                  0.461
Method:                 Least Squares   F-statistic:                 1.034e+04
Date:                Sun, 19 Jan 2025   Prob (F-statistic):               0.00
Time:                        21:42:30   Log-Likelihood:            -1.6407e+05
No. Observations:               72492   AIC:                         3.281e+05
Df Residuals:                   72485   BIC:                         3.282e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            3.7811      0.078     48.295   

In [14]:
y_train_pred = model.predict(X_train)

In [15]:
def myf(y, yhat):
  ME = np.round(np.mean(y-yhat), 10)
  MPE = np.round(np.mean((y-yhat)/y), 3)
  myMAE = np.round(MAE(y, yhat), 3)
  myMSE = np.round(MSE(y, yhat), 3)
  myMAPE = np.round(MAPE(y, yhat), 3)
  print(f'\tME: {ME}\n\tMPE: {MPE}\n\tMAE: {myMAE}\n\tMSE: {myMSE}\n\tMAPE: {myMAPE}')

In [16]:
myf(y_train,y_train_pred)

	ME: 0.0
	MPE: -0.044
	MAE: 1.606
	MSE: 5.412
	MAPE: 0.16


## Making the actual predictions

In [17]:
y_test_preds = model.predict(X_tests)
submission = pd.DataFrame({'id':test['id'], 'Rings':y_test_preds})
submission

Unnamed: 0,id,Rings
0,90615,10.517202
1,90616,11.218664
2,90617,9.953834
3,90618,11.232937
4,90619,7.970285
...,...,...
60406,151021,6.661290
60407,151022,10.533615
60408,151023,9.732570
60409,151024,12.760962


In [18]:
submission.to_csv('submission.csv', index=False)


In [19]:
submission.describe()

Unnamed: 0,id,Rings
count,60411.0,60411.0
mean,120820.0,9.705056
std,17439.297893,2.151302
min,90615.0,2.973309
25%,105717.5,8.0822
50%,120820.0,10.066266
75%,135922.5,11.2554
max,151025.0,47.929123
