In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_excel("data-train.xlsx")
validation = pd.read_excel("data-valid.xlsx")
test = pd.read_excel("data-test-without.xlsx")

In [3]:
# rename columns
train = train.rename(columns = {'Weight\nin Kg': 'weight', 'Height\nin Cm': 'height'})
validation = validation.rename(columns = {'Weight\nin Kg': 'weight', 'Height\nin Cm': 'height'})
test = test.rename(columns = {'Weight\nin Kg': 'weight', 'Height\nin Cm': 'height'})

In [4]:
# Encode gender column
train['Encode_Gender'] = train['Gender'].map({'Male': 0, 'Female': 1})
train.head()

Unnamed: 0,Gender,weight,height,Encode_Gender
0,Female,69.4,166,1
1,Female,63.0,167,1
2,Male,73.7,167,0
3,Female,59.1,163,1
4,Female,70.9,172,1


In [5]:
# Encode gender column
validation['Encode_Gender'] = validation['Gender'].map({'Male': 0, 'Female': 1})
validation.head()

Unnamed: 0,Gender,weight,height,Encode_Gender
0,Female,49.0,156,1
1,Male,76.2,167,0
2,Male,89.2,178,0
3,Female,71.6,165,1
4,Female,60.1,165,1


In [6]:
X_train = train[['weight','height']]
Y_train = train[['Encode_Gender']]

In [12]:
# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42)
# # Train the model on the training data
model.fit(X_train, Y_train.values.ravel())

In [13]:
X_Val = validation[['weight', 'height']]
Y_Val = validation[['Encode_Gender']]
# Validate the model on the validation set
y_val_pred = model.predict(X_Val)
# Evaluate the model on the validation set
accuracy = accuracy_score(Y_Val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2%}')

Validation Accuracy: 91.30%


## LogisticRegressionCV

In [20]:
# Initialize the Logistic Regression model
#8, 12
modelCV = LogisticRegressionCV(cv = 8,random_state=42)
# Train the model on the training data
modelCV.fit(X_train, Y_train.values.ravel())
# validation data
X_Val = validation[['weight', 'height']]
Y_Val = validation[['Encode_Gender']]
# Validate the model on the validation set
y_val_pred = modelCV.predict(X_Val)
# Evaluate the model on the validation set
accuracy = accuracy_score(Y_Val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2%}')

Validation Accuracy: 91.40%


## Adding New Feature

In [21]:
train['Body_Mass_Index']= train['weight'] /((train['height'] / 100) )**2
validation['Body_Mass_Index']= validation['weight'] /((validation['height'] / 100) )**2
test['Body_Mass_Index']= validation['weight'] /((validation['height'] / 100) )**2

In [23]:
X_train = train[['weight','height', 'Body_Mass_Index']]
Y_train = train[['Encode_Gender']].values.ravel()

X_Val = validation[['weight','height', 'Body_Mass_Index']]
Y_Val = validation[['Encode_Gender']].values.ravel()

X_test = test[['weight','height', 'Body_Mass_Index']]

## Fist Model

In [24]:
model = LogisticRegressionCV(cv=10, random_state=42)

# Train the model on the training data
model.fit(X_train, Y_train)

# Evaluate the model on the validation set
accuracy = accuracy_score(Y_Val, y_val_pred)

print(f'Validation Accuracy: {accuracy:.2%}')

Validation Accuracy: 91.40%


##  Logistic Regression using Hyperparameters

In [25]:
logModel = LogisticRegression()

In [26]:
# Hyperparameters
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [27]:
clf = GridSearchCV(logModel, param_grid = param_grid, cv = 8, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, Y_train)
best_clf.best_estimator_
print (f'Accuracy: {best_clf.score(X_train, Y_train):.2%}')

Fitting 8 folds for each of 1600 candidates, totalling 12800 fits
Accuracy: 92.05%


5760 fits failed out of a total of 12800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
640 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py"

In [28]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

logModel = LogisticRegression()
param_grid = [    
    {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
     'C': np.logspace(-4, 4, 20),
     'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
     'max_iter': [100, 1000, 2500, 5000]
    }
]

X_train = train[['weight', 'height', 'Body_Mass_Index']]
Y_train = train['Encode_Gender'].values.ravel()

X_Val = validation[['weight', 'height', 'Body_Mass_Index']]
Y_Val = validation['Encode_Gender'].values.ravel()

X_test = test[['weight', 'height', 'Body_Mass_Index']]

# Use cv=1 or cv=None to fit on the entire dataset
clf = GridSearchCV(logModel, param_grid=param_grid, cv=4, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, Y_train)

# Get the best estimator
best_estimator = best_clf.best_estimator_

# Now you can use the best model for predictions
#y_test_pred = best_estimator.predict(X_test)

# Get the best estimator
best_estimator = best_clf.best_estimator_

# Validate the best model on the validation set
y_val_pred = best_estimator.predict(X_Val)

# Evaluate the best model on the validation set
accuracy = accuracy_score(Y_Val, y_val_pred)
print(f'Validation Accuracy: {accuracy:.2%}')

Fitting 4 folds for each of 1600 candidates, totalling 6400 fits
Validation Accuracy: 91.30%


2880 fits failed out of a total of 6400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py",

### Increasing Cross-Validation  

In [None]:
clf = GridSearchCV(logModel, param_grid = param_grid, cv = 6, verbose=True, n_jobs=-1)
best_clf = clf.fit(X_train, Y_train)
best_clf.best_estimator_
print (f'Accuracy: {best_clf.score(X_train, Y_train):.2%}')

Fitting 6 folds for each of 1600 candidates, totalling 9600 fits


4320 fits failed out of a total of 9600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sohil\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py",

In [None]:
best_clf.best_estimator_

In [None]:
print (f'Accuracy: {best_clf.score(X_train, Y_train):.2%}')

Accuracy: 91.99%


In [None]:
accuracy_validation = best_clf.score(X_Val, Y_Val)
print(f'Accuracy on Validation Set: {accuracy_validation:.2%}')

Accuracy on Validation Set: 91.40%


In [26]:
# Check the shape of X_train and X_test
print("X_train shape:", X_train.shape)
print("X_test shape:", Y_train.shape)
print("X_train shape:", X_Val.shape)
print("X_test shape:", Y_Val.shape)
print("X_train shape:", X_test.shape)


X_train shape: (8700, 3)
X_test shape: (8700,)
X_train shape: (1000, 3)
X_test shape: (1000,)
X_train shape: (100, 3)


In [27]:
X_test = test[['weight', 'height', 'Body_Mass_Index']]
# Use the trained model to predict labels for the new data
y_test_pred = best_clf.best_estimator_.predict(X_test)

# Now 'y_pred' contains the predicted labels for the test data
y_test_pred

array([0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0], dtype=int64)

In [28]:
I apologize for the confusion. It seems like the error is related to the DataFrame object. In this case, you can convert the DataFrame column to a numpy array before using `ravel()`. Here's the updated code:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score

# Set up the parameter grid for grid search
param_grid = {'Cs': [1, 10, 150], 'cv': [3, 5, 15]}

# Initialize LogisticRegressionCV model
model = LogisticRegressionCV()

# Initialize GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=10)

# Train the model on the training data
grid_search.fit(X_train, Y_train['Encode_Gender'].values.ravel())  # Convert DataFrame column to numpy array

# Get the best parameters from grid search
best_params = grid_search.best_params_

# Initialize a new LogisticRegressionCV model with the best parameters
best_model = LogisticRegressionCV(Cs=best_params['Cs'], cv=best_params['cv'])

# Train the best model on the training data
best_model.fit(X_train, Y_train['Encode_Gender'].values.ravel())

# Validate the best model on the validation set
y_val_pred = best_model.predict(X_Val)

# Evaluate the best model on the validation set
accuracy = accuracy_score(Y_Val['Encode_Gender'], y_val_pred)
print(f'Validation Accuracy: {accuracy:.2%}')
```

This should resolve the `ravel()` issue.

SyntaxError: unterminated string literal (detected at line 1) (3559494672.py, line 1)