<a href="https://colab.research.google.com/github/subramanya4shenoy/MachineLearningNbs/blob/main/Kaggle/Competitions/Kaggle_PSs03E16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# prompt: connect to a kaggle problem

!pip install kaggle

# Upload your kaggle.json file (contains API key and secret)
from google.colab import files
files.upload()

# Create the .kaggle directory and move the file
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/

# Set permissions for the file
!chmod 600 ~/kaggle.json

# Download the dataset (replace with the actual competition name and dataset)
!kaggle competitions download -c playground-series-s3e16 # example competition name

# Unzip the downloaded dataset
!unzip playground-series-s3e16.zip # example dataset file name




Saving kaggle.json to kaggle.json
chmod: cannot access '/root/kaggle.json': No such file or directory
Downloading playground-series-s3e16.zip to /content
  0% 0.00/2.65M [00:00<?, ?B/s]
100% 2.65M/2.65M [00:00<00:00, 224MB/s]
Archive:  playground-series-s3e16.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [15]:
### import the requiered libraries for Feature engineering
import pandas as pd
import numpy as np

## Reading the data into a data frame
* Identify non significant columns
* Identify the independant and dependant columns (target)
* check the data type of the columns and identify the preprocessing needed for every columns

In [60]:
data = pd.read_csv('train.csv', index_col='id')

In [18]:
data.columns

Index(['id', 'Sex', 'Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight',
       'Viscera Weight', 'Shell Weight', 'Age'],
      dtype='object')

In [19]:
data.shape

(74051, 10)

In [20]:
data.dtypes

Unnamed: 0,0
id,int64
Sex,object
Length,float64
Diameter,float64
Height,float64
Weight,float64
Shucked Weight,float64
Viscera Weight,float64
Shell Weight,float64
Age,int64


* Label encoder for sex
* looks like scalar encoder for others (first thought)

## Checking for the missing values

In [21]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74051 entries, 0 to 74050
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              74051 non-null  int64  
 1   Sex             74051 non-null  object 
 2   Length          74051 non-null  float64
 3   Diameter        74051 non-null  float64
 4   Height          74051 non-null  float64
 5   Weight          74051 non-null  float64
 6   Shucked Weight  74051 non-null  float64
 7   Viscera Weight  74051 non-null  float64
 8   Shell Weight    74051 non-null  float64
 9   Age             74051 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 5.6+ MB


In [22]:
data.nunique()

Unnamed: 0,0
id,74051
Sex,3
Length,144
Diameter,122
Height,65
Weight,3096
Shucked Weight,1766
Viscera Weight,967
Shell Weight,1048
Age,28


In [23]:
data['Sex'].values.reshape(-1, 1)

array([['I'],
       ['I'],
       ['M'],
       ...,
       ['F'],
       ['I'],
       ['I']], dtype=object)

In [24]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder();
onehot_encoded = onehot_encoder.fit_transform(data['Sex'].values.reshape(-1, 1)).toarray()
onehot_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [25]:
onehot_encoder.get_feature_names_out(['Sex'])

array(['Sex_F', 'Sex_I', 'Sex_M'], dtype=object)

In [26]:
onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=onehot_encoder.get_feature_names_out(['Sex']))

In [27]:
data = pd.concat([data, onehot_encoded_df], axis=1)

In [28]:
data = data.drop('Sex', axis=1)

### Applying scaler encoding
* before that, lets split the data into independant and dependant
* split the train set and test set

In [29]:
X = data.drop(['Age'], axis=1)
y = data['Age']

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### applying scaler

In [31]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### saving all the encoder for future use

In [32]:
import pickle

with open('onehot_encoder_sex.pkl', 'wb') as file:
    pickle.dump(onehot_encoder, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Training ANN for regression

In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import tensorboard

In [34]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [35]:
model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
model.summary()

In [36]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime

In [37]:
log_dir= 'logs/fit/'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir, histogram_freq=1)

In [38]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [39]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks = [early_stopping_callback, tensorboard_callback]
)

Epoch 1/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 2.4739 - mae: 2.4739 - val_loss: 1.4274 - val_mae: 1.4274
Epoch 2/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 1.4369 - mae: 1.4369 - val_loss: 1.4271 - val_mae: 1.4271
Epoch 3/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 1.4191 - mae: 1.4191 - val_loss: 1.4043 - val_mae: 1.4043
Epoch 4/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 1.4154 - mae: 1.4154 - val_loss: 1.4032 - val_mae: 1.4032
Epoch 5/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - loss: 1.3992 - mae: 1.3992 - val_loss: 1.3959 - val_mae: 1.3959
Epoch 6/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 1.4021 - mae: 1.4021 - val_loss: 1.3884 - val_mae: 1.3884
Epoch 7/100
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [40]:
model.save('reg_model_1.h5')



+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# **Trying out new models with hyper param tuning**

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [41]:
!pip install scikeras



In [42]:
from scikeras.wrappers import KerasRegressor

In [43]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
import pickle

In [44]:
## dynamically creating a model
def create_model(neurons=32, layers=1):
    model = Sequential()    #took the Ann
    model.add(Dense(neurons, activation='relu', input_shape=(X_train.shape[1],)))   #initializing the first layer

    for _ in range(layers):
        model.add(Dense(neurons, activation="relu")) #In-between layers

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
    return model

In [47]:
param_grid = {
    'neurons': [32, 64, 128],
    'layers': [1, 2],
    'epochs': [50, 100]
}

In [46]:
model = KerasRegressor(layers=1, neurons=32, build_fn=create_model, verbose=1)

In [48]:
# perform grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Epoch 1/50


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 9.0090 - mae: 9.0090
Epoch 2/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 8.9514 - mae: 8.9514
Epoch 3/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 8.9756 - mae: 8.9756
Epoch 4/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 8.9587 - mae: 8.9587
Epoch 5/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 8.9892 - mae: 8.9892
Epoch 6/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 8.9771 - mae: 8.9771
Epoch 7/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 8.9771 - mae: 8.9771
Epoch 8/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 8.9774 - mae: 8.9774
Epoch 9/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s

-------------------
From Above lets train our model
--------------------


In [50]:
new_model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [52]:
new_model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
new_model.summary()

In [53]:
log_dir= 'logs/fit/'+datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir, histogram_freq=1)

In [54]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [55]:
history = new_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    callbacks = [early_stopping_callback, tensorboard_callback]
)

Epoch 1/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - loss: 3.0655 - mae: 3.0655 - val_loss: 1.4277 - val_mae: 1.4277
Epoch 2/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4281 - mae: 1.4281 - val_loss: 1.4116 - val_mae: 1.4116
Epoch 3/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 1.4266 - mae: 1.4266 - val_loss: 1.4023 - val_mae: 1.4023
Epoch 4/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 1.3974 - mae: 1.3974 - val_loss: 1.3997 - val_mae: 1.3997
Epoch 5/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.4011 - mae: 1.4011 - val_loss: 1.4072 - val_mae: 1.4072
Epoch 6/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 1.4051 - mae: 1.4051 - val_loss: 1.3971 - val_mae: 1.3971
Epoch 7/50
[1m1852/1852[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[

In [56]:
new_model.save('reg_model_1.h5')



+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# **Regenerating the results**
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

In [57]:
model_name = 'reg_model_1.h5'

In [61]:
new_model = tf.keras.models.load_model(model_name) # make sure to change

test_df = pd.read_csv('test.csv', index_col='id')
sub = pd.DataFrame(columns=['id', 'Age'])

# Applying all my encoders to test
onehot_encoded_test_df = onehot_encoder.fit_transform(test_df['Sex'].values.reshape(-1, 1)).toarray()
onehot_encoded_df_test_df = pd.DataFrame(onehot_encoded_test_df, columns=onehot_encoder.get_feature_names_out(['Sex']))
test_df = pd.concat([test_df, onehot_encoded_df_test_df], axis=1)
test_df = test_df.drop('Sex', axis=1)

sub['id'] = test_df['id']
test_df=test_df.drop('id', axis=1)

test_df = scaler.transform(test_df)



KeyError: 'id'

In [40]:
### prediction block
predictions = new_model.predict(test_df)
prediction_flat = predictions.flatten()
sub['Age'] = prediction_flat

[1m1543/1543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step


In [42]:
### Submission block
sub.to_csv(f"""submission_{model_name}.csv""", index=False)
print(f"""Submission saved in submission_{model_name}.csv""")

Submission saved in submission_reg_model_0.h5.csv
