In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [3]:

# sub_df=pd.read_csv('/kaggle/input/playground-series-s3e16/sample_submission.csv')
train_df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/train.csv')
test_df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/test.csv')

In [4]:
train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [5]:
train_df.isnull().any()

id                False
Sex               False
Length            False
Diameter          False
Height            False
Weight            False
Shucked Weight    False
Viscera Weight    False
Shell Weight      False
Age               False
dtype: bool

## handling the sex column

In [6]:
train_sex_col = pd.get_dummies(train_df['Sex'], prefix='Sex', prefix_sep='_', drop_first=True)
test_sex_col = pd.get_dummies(test_df['Sex'], prefix='Sex', prefix_sep='_', drop_first=True)

In [7]:
train_sex_col.head()

Unnamed: 0,Sex_I,Sex_M
0,1,0
1,1,0
2,0,1
3,0,0
4,1,0


In [8]:
train_df=pd.concat([train_df,train_sex_col],axis=1)
test_df=pd.concat([test_df,test_sex_col],axis=1)

    

In [9]:
train_df.drop(['id','Sex'],axis=1, inplace=True)
test_df.drop(['id','Sex'],axis=1, inplace=True)


In [10]:
train_df.head()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_I,Sex_M
0,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9,1,0
1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8,1,0
2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9,0,1
3,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11,0,0
4,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8,1,0


In [11]:
train_df.columns

Index(['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight', 'Age', 'Sex_I', 'Sex_M'],
      dtype='object')

## Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, chi2

numerical_features = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
                      'Viscera Weight', 'Shell Weight']

scaler = StandardScaler()
train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])


In [13]:
train_df.head()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_I,Sex_M
0,0.721238,0.633982,0.2924,0.441804,0.467188,0.569186,0.453376,9,1,0
1,-0.755712,-0.840356,-0.794163,-1.025198,-0.993688,-0.97888,-0.926788,8,1,0
2,0.243401,0.370707,0.2924,0.110076,0.219924,0.178363,-0.017224,9,0,1
3,1.329394,1.634426,1.650603,2.156483,1.824616,2.124622,2.308095,11,0,0
4,-0.234435,-0.050532,-0.115061,-0.007598,0.333464,-0.197233,-0.214955,8,1,0


In [14]:
X = train_df.drop('Age', axis=1)  # Input features
y = train_df['Age']  # Target variable

# Perform feature selection using SelectKBest and f_regression (for regression task)
selector = SelectKBest(score_func=f_regression, k=6)  # Select top 5 features based on f_regression
X_selected = selector.fit_transform(X, y)

# Get the selected feature names
selected_feature_names = X.columns[selector.get_support()]

# Print the selected feature names
print("Selected features:")
for feature_name in selected_feature_names:
    print(feature_name)

Selected features:
Length
Diameter
Height
Weight
Viscera Weight
Shell Weight


# Modeling

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import mean_absolute_error

In [16]:
X=train_df.drop('Age', axis=1)
y=train_df['Age']

In [17]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=42)

In [18]:
#model function
def modeling(models):
    for model in models:
        regressor=model.__class__.__name__
        print(f"fitting {model}")
        model.fit(X_train,y_train)
        preds=model.predict(X_test)
        mae=mean_absolute_error(preds,y_test)
        print(f"MAE of {regressor} is {mae}")
    

In [19]:
regressors = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    RandomForestRegressor(),

]


In [20]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(59240, 9)
(59240,)
(14811, 9)
(14811,)


In [21]:
# modeling(regressors)

In [28]:
svr=SVR()
model=svr.fit(X_train,y_train)
preds=model.predict(X_test)
mae=mean_absolute_error(preds,y_test)
mae
    

1.3819680043295206

# Prediction

**Based on the above result svr yeilds the better results**

In [32]:
test_df[numerical_features] = scaler.fit_transform(test_df[numerical_features])


In [33]:
test_df.head()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M
0,-0.940227,-1.115249,-0.798762,-1.174409,-1.153502,-1.197655,-1.125492,1,0
1,-0.548169,-0.58691,-0.798762,-0.629467,-0.552334,-0.654334,-0.776635,1,0
2,-0.112548,-0.164238,-0.256807,-0.703471,-0.815029,-0.425835,-0.538778,0,0
3,0.802256,-0.164238,0.420637,0.388656,0.579278,0.528784,0.079651,0,0
4,-0.722417,-0.745411,-0.934251,-0.925485,-0.820081,-0.933611,-0.955027,1,0


In [34]:

# Make predictions on the scaled test data using the best model
y_test_pred = model.predict(test_df)

# Print the predicted values
print("Predicted Age for Test Data:")
print(y_test_pred)

Predicted Age for Test Data:
[ 7.36829897  7.69959176 10.09080481 ... 12.08621319  9.81810818
 11.64396324]


In [30]:
sub=pd.read_csv(('/content/drive/MyDrive/Colab Notebooks/kaggle/sample_submission.csv'))
sub.head()

In [35]:
test_df.head()

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M
0,-0.940227,-1.115249,-0.798762,-1.174409,-1.153502,-1.197655,-1.125492,1,0
1,-0.548169,-0.58691,-0.798762,-0.629467,-0.552334,-0.654334,-0.776635,1,0
2,-0.112548,-0.164238,-0.256807,-0.703471,-0.815029,-0.425835,-0.538778,0,0
3,0.802256,-0.164238,0.420637,0.388656,0.579278,0.528784,0.079651,0,0
4,-0.722417,-0.745411,-0.934251,-0.925485,-0.820081,-0.933611,-0.955027,1,0


In [36]:
test=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/kaggle/test.csv')


In [37]:
test.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,74051,I,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552
1,74052,I,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893
2,74053,F,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415
3,74054,F,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676
4,74055,I,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066


In [40]:
submission_df = pd.DataFrame({'id': test['id'], 'Age': y_test_pred})
submission_df['Age'] = submission_df['Age'].astype(int)


# Step 9: Save the submission dataframe to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [41]:
submission_df

Unnamed: 0,id,Age
0,74051,7
1,74052,7
2,74053,10
3,74054,9
4,74055,7
...,...,...
49363,123414,8
49364,123415,7
49365,123416,12
49366,123417,9
