In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training set from disk and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='data/train.csv')
train.head(n=5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [3]:
# Per-column count of missing values in the training frame.
train.isna().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
Age               0
dtype: int64

In [4]:
# (rows, columns) of the raw training frame.
train.shape

(74051, 10)

In [5]:
# Target is the 'Age' column; predictors are everything except the row
# identifier and the target itself. `train` is left unmodified.
y = train['Age']
X = train.drop(columns=['id', 'Age'])

In [6]:
# Inspect predictor dtypes to see which columns still need encoding.
X.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Weight            float64
Shucked Weight    float64
Viscera Weight    float64
Shell Weight      float64
dtype: object

In [7]:
# One-hot encode the non-numeric predictors, dropping the first level of
# each so the indicator columns are not perfectly collinear.
X = pd.get_dummies(data=X, drop_first=True)
X.head(n=5)

Unnamed: 0,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M
0,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,1,0
1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,1,0
2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,0,1
3,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,0,0
4,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,1,0


In [8]:
# Learn per-column scaling statistics from the training predictors, then
# apply them. The fitted `scaler` is reused later for the test set.
# NOTE: this overwrites X with the scaled values, so running the cell a
# second time would refit the scaler on already-scaled data.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X

array([[ 0.72123811,  0.63398203,  0.29239968, ...,  0.45337609,
         1.44602779, -0.75938146],
       [-0.75571183, -0.84035601, -0.79416309, ..., -0.92678786,
         1.44602779, -0.75938146],
       [ 0.24340136,  0.37070738,  0.29239968, ..., -0.01722423,
        -0.69154964,  1.31686122],
       ...,
       [ 0.590919  ,  0.73929189,  0.69986072, ...,  0.37823822,
        -0.69154964, -0.75938146],
       [-0.36475449, -0.26115178, -0.38670205, ..., -0.68160114,
         1.44602779, -0.75938146],
       [-1.40730739, -1.47221517, -1.60908517, ..., -1.4013428 ,
         1.44602779, -0.75938146]])

In [9]:
# Unfitted linear-regression estimator with default settings.
# (LinearRegression is imported in the notebook's top import cell so the
# notebook's dependencies are declared in one place.)
lr = LinearRegression()

In [10]:
# Fit the regressor on the scaled training predictors and the Age target.
# NOTE(review): scikit-learn's fit() is documented to return the estimator
# itself, so model_0 is presumably the same object as lr — confirm if both
# names are relied on separately.
model_0 = lr.fit(X,y)

In [11]:
# Read the test set from disk.
test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [12]:
# Per-column count of missing values in the test frame.
test.isna().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Weight            0
Shucked Weight    0
Viscera Weight    0
Shell Weight      0
dtype: int64

In [13]:
# Encode the test set against the TRAINING feature layout rather than in
# isolation. Encoding test on its own with drop_first=True picks the dropped
# level (and the set of indicator columns) from whatever categories happen
# to be present in test, which can silently disagree with the columns the
# scaler and model were fitted on.
#
# The training layout is recomputed from the untouched `train` frame using
# the same steps as the training cells (drop id/Age, one-hot, drop first).
train_feature_cols = pd.get_dummies(
    train.drop(['id', 'Age'], axis=1), drop_first=True
).columns

# Full one-hot encoding of test, then reindex to exactly the training
# columns: indicator columns absent from test are created as all-zero and
# columns the model never saw are discarded. 'id' is kept here because it
# is dropped in a later cell.
test = pd.get_dummies(test).reindex(
    columns=['id', *train_feature_cols], fill_value=0
)
test.head()

Unnamed: 0,id,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Sex_I,Sex_M
0,74051,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552,1,0
1,74052,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893,1,0
2,74053,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415,0,0
3,74054,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676,0,0
4,74055,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066,1,0


In [14]:
# Remove the row identifier so only model features remain.
test = test.drop(columns='id')

In [15]:
# Apply the scaler that was fitted on the training predictors; transform()
# is used (not fit_transform) so no statistics are learned from test.
# `test` is overwritten with the transformed result.
test = scaler.transform(test)
test

array([[-0.92947065, -1.10363066, -0.79416309, ..., -1.11660983,
         1.44602779, -0.75938146],
       [-0.53851331, -0.57708136, -0.79416309, ..., -0.76860288,
         1.44602779, -0.75938146],
       [-0.10411627, -0.15584192, -0.2508817 , ..., -0.53132541,
        -0.69154964, -0.75938146],
       ...,
       [ 0.590919  ,  0.5813271 ,  0.15657934, ...,  0.57596945,
        -0.69154964, -0.75938146],
       [-0.27787508, -0.31380671, -0.65834274, ..., -0.49177916,
        -0.69154964, -0.75938146],
       [ 1.19907486,  1.16053133,  0.97150142, ...,  1.05052439,
        -0.69154964,  1.31686122]])

In [16]:
# Predictions on X, the same scaled training matrix the model was fitted
# on — i.e. these are in-sample predictions, not a hold-out estimate.
y_pred = model_0.predict(X)

In [20]:
# Mean absolute error of the rounded predictions against the target.
# y_pred was produced from the training matrix itself, so this is an
# in-sample score and will tend to be optimistic.
# (mean_absolute_error is imported in the notebook's top import cell.)
mae_0 = mean_absolute_error(y, np.round(y_pred))
mae_0

1.4597237039337754

In [19]:
# Build the submission: take the sample file as the template, fill its
# 'Age' column with the model's predictions for the test matrix, and save.
# The predictions are attached by row position, so this relies on the
# sample file listing rows in the same order as the test file.
sample_submission_df = pd.read_csv('data/sample_submission.csv').assign(
    Age=model_0.predict(test)
)
sample_submission_df.to_csv('data/submission0.csv', index=False)
sample_submission_df.head()

Unnamed: 0,id,Age
0,74051,7.735472
1,74052,7.682365
2,74053,10.432966
3,74054,9.556131
4,74055,7.504141
