<a href="https://colab.research.google.com/github/saheedniyi02/Saheed-articles-codes/blob/main/Crab_age_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import numpy as np
import pandas as pd

In [3]:
train=pd.read_csv("/content/drive/MyDrive/Playground/train.csv",index_col=0)
test=pd.read_csv("/content/drive/MyDrive/Playground/test.csv",index_col=0)

## Data cleaning

In [4]:
#check for missing values
print(train.isna().sum())
print(test.isna().sum())

Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
dtype: int64


In [5]:
#Separate the target column and drop it from the train dataframe
target=train["Age"]
train.drop("Age",axis=1,inplace=True)

In [6]:
#Check for categorical columns
print(train.dtypes)
print(test.dtypes)

Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
dtype: object
Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
dtype: object


In [7]:
#Check the unique values in the Sex column
print(train["Sex"].value_counts())

M    27084
I    23957
F    23010
Name: Sex, dtype: int64


In [8]:
train["Sex"]=train["Sex"].map({"M":0,"I":1,"F":2})
test["Sex"]=test["Sex"].map({"M":0,"I":1,"F":2})

In [9]:
print(train.shape,test.shape)

(74051, 8) (49368, 8)


In [10]:
#splitting the train data into [X and val]
from sklearn.model_selection import train_test_split
X,val,y,y_val=train_test_split(train,target,test_size=0.15,random_state=0)

In [11]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

### Catboost

In [12]:
#import the regressor
from catboost import CatBoostRegressor

#create the model
model_catboost=CatBoostRegressor(verbose=0,random_state=0)

#fit the model on the X,y data
model_catboost.fit(X,y)

#predictions on the X,val and test dataframe
pred_x_catboost=model_catboost.predict(X)
pred_val_catboost=model_catboost.predict(val)
pred_test_catboost=model_catboost.predict(test)

#import mean_absolute_error
from sklearn.metrics import mean_absolute_error

#check the mean absolute error of the validation and X dataframe
print(mean_absolute_error(pred_val_catboost,y_val,))
print(mean_absolute_error(pred_x_catboost,y))

1.3934372745397485
1.3182671076008703


In [13]:
#creating a submision file
sub_catboost=pd.DataFrame({"id":test.index,"Age":pred_test_catboost}).set_index("id")
sub_catboost.to_csv("submission_catboost.csv")

### Lightgbm

In [14]:
model_lightgbm=LGBMRegressor(random_state=0)
model_lightgbm.fit(X,y)
pred_x_lightgbm=model_lightgbm.predict(X)
pred_val_lightgbm=model_lightgbm.predict(val)
pred_test_lightgbm=model_lightgbm.predict(test)
print(mean_absolute_error(pred_val_lightgbm,y_val,))
print(mean_absolute_error(pred_x_lightgbm,y))

1.392688087431402
1.3565696388540662


In [15]:
#creating a submisiion file
sub_lightboost=pd.DataFrame({"id":test.index,"Age":pred_test_lightgbm}).set_index("id")
sub_lightboost.to_csv("submission_lgb.csv")

### XGBoost

In [16]:
model_xgboost=XGBRegressor(random_state=0)
model_xgboost.fit(X,y)
pred_x_xgboost=model_xgboost.predict(X)
pred_val_xgboost=model_xgboost.predict(val)
pred_test_xgboost=model_xgboost.predict(test)
print(mean_absolute_error(pred_val_xgboost,y_val))
print(mean_absolute_error(pred_x_xgboost,y))

1.4087671341980195
1.2560569550215572


In [17]:
#creating a submision file
sub_xgboost=pd.DataFrame({"id":test.index,"Age":pred_test_xgboost}).set_index("id")
sub_xgboost.to_csv("submission_xgb.csv")

## Mean

In [18]:
mean_predictions=(pred_test_catboost+pred_test_lightgbm+pred_test_xgboost)/3

#create a submission file
sub_mode=pd.DataFrame({"id":test.index,"Age":mean_predictions}).set_index("id")
sub_mode.to_csv("sub_mean.csv")

## voting

In [19]:
#import voting regressor
from sklearn.ensemble import VotingRegressor#,StackingRegressor,BaggingRegressor

#sub_models
estimators=[("catboost",model_catboost),
           ("lightboost",model_lightgbm),
           ("xgboost",model_xgboost)]
model_voting=VotingRegressor(estimators=estimators)
model_voting.fit(X,y)
pred_x_voting=model_voting.predict(X)
pred_val_voting=model_voting.predict(val)
pred_test_voting=model_voting.predict(test)
print(mean_absolute_error(pred_val_voting,y_val))
print(mean_absolute_error(pred_x_voting,y))

1.389825948313698
1.303598908807795


In [20]:
sub_voting=pd.DataFrame({"id":test.index,"Age":pred_test_voting}).set_index("id")
sub_voting.to_csv("submission_voting.csv")

## Stacking

In [21]:
#Build a dataframe of the model's predictions
X_predictions_dataframe=pd.DataFrame({"catboost":pred_x_catboost,
                             "lightboost":pred_x_lightgbm,
                             "xgboost":pred_x_xgboost})
test_predictions_dataframe=pd.DataFrame({"catboost":pred_test_catboost,
                             "lightboost":pred_test_lightgbm,
                             "xgboost":pred_test_xgboost})
val_predictions_dataframe=pd.DataFrame({"catboost":pred_val_catboost,
                             "lightboost":pred_val_lightgbm,
                             "xgboost":pred_val_xgboost})

In [22]:
from sklearn.linear_model import LinearRegression

final_model=LinearRegression()
final_model.fit(X_predictions_dataframe,y)
pred_x_final=final_model.predict(X_predictions_dataframe)
pred_val_final=final_model.predict(val_predictions_dataframe)
pred_test_final=final_model.predict(test_predictions_dataframe)
print(mean_absolute_error(pred_val_final,y_val))
print(mean_absolute_error(pred_x_final,y))

1.4605724666855617
1.228132885830056


In [23]:
sub_final=pd.DataFrame({"id":test.index,"Age":pred_test_final}).set_index("id")
sub_final.to_csv("submission_final.csv")

## sklearn stacking

In [24]:
from sklearn.ensemble import StackingRegressor

model_stack=StackingRegressor(estimators=estimators,final_estimator=final_model)
model_stack.fit(X,y)
pred_x_stack=model_stack.predict(X)
pred_val_stack=model_stack.predict(val)
pred_test_stack=model_stack.predict(test)
print(mean_absolute_error(pred_val_stack,y_val))
print(mean_absolute_error(pred_x_stack,y))

1.3887467695231086
1.3220347992279269


In [25]:
sub_stack=pd.DataFrame({"id":test.index,"Age":pred_test_stack}).set_index("id")
sub_stack.to_csv("submission_stack.csv")

**KFold**

In [26]:
from sklearn.model_selection import KFold

folds=KFold(n_splits=8,shuffle=True,random_state=0)
predictions_df=pd.DataFrame()
mae_val=[]
mae_X=[]
model=CatBoostRegressor(verbose=0,random_state=0)
for i,(train_index,test_index) in enumerate(folds.split(train,target)):
    train_fold=train.iloc[train_index]
    val_fold=train.iloc[test_index]
    y_fold=target.iloc[train_index]
    y_val_fold=target.iloc[test_index]
    model.fit(train_fold,y_fold)
    print(i)
    prediction=model.predict(test)
    predictions_df[i]=prediction
    mae_val.append(mean_absolute_error(model.predict(val_fold),y_val_fold))
    mae_X.append(mean_absolute_error(model.predict(train_fold),y_fold))
print(mae_val)
print(mae_X)

0
1
2
3
4
5
6
7
[1.3922511761786873, 1.3909964991332031, 1.3934623088549272, 1.4107690728085112, 1.4151607187555988, 1.3847063919516438, 1.4042467715806202, 1.4264112538946645]
[1.3199484306562965, 1.3191876700977663, 1.3171518826156348, 1.3125035449401412, 1.3125368920323623, 1.3203393612128353, 1.3174578797588379, 1.3122300529158002]


In [27]:
predictions=predictions_df.mean(axis=1)
sub_kfold_mean=pd.DataFrame({"id":test.index,"Age":predictions}).set_index("id")
sub_kfold_mean.to_csv("submission_kfold_mean.csv")

In [28]:
predictions=predictions_df.median(axis=1)
sub_kfold_median=pd.DataFrame({"id":test.index,"Age":predictions}).set_index("id")
sub_kfold_median.to_csv("submission_kfold_median.csv")

## Bagging

In [None]:
from sklearn.ensemble import BaggingRegressor

model_bagging=BaggingRegressor(estimator=CatBoostRegressor(random_state=0,verbose=0),n_estimators=8,max_samples=0.8)
model_bagging.fit(train,target)

pred_test_bagging=model_bagging.predict(test)

In [None]:
sub_bagging=pd.DataFrame({"id":test.index,"Age":pred_test_bagging}).set_index("id")
sub_bagging.to_csv("submission_bagging.csv")