# General Machine Learning / Data Science Workflow Example

## Step 1: Import Required Libraries
First, import all the necessary libraries.

In [None]:
import pandas as pd
from word2number import w2n
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from category_encoders import CountEncoder

## Step 2: Load the Train and Test Datasets
Load the `train.csv` and `test.csv` files into DataFrames.

In [None]:
# Read the training and test sets from CSV (point these at your own files)
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Preview the top five rows of the training set
display(train_data.head(n=5))

## Step 3: Define a Function to Convert Words to Numbers
Define a function `convert_to_float` to handle word-to-number conversion and remove negative signs.

In [None]:
# Function to convert numbers or number words to a non-negative float
def convert_to_float(value):
    """Convert a number, numeric string, or number words to a non-negative float.

    Parameters
    ----------
    value
        The raw value, e.g. ``-66.0``, ``'90'`` or ``'sixty-two'``.
        ``None`` is treated as a missing value.

    Returns
    -------
    float
        The absolute value of the parsed number, or NaN when ``value`` is
        ``None``.

    Notes
    -----
    Text that ``float()`` rejects is handed to ``w2n.word_to_num``; whatever
    that call raises for unparseable text propagates to the caller.
    """
    # float(None) raises TypeError (not ValueError), which would bypass the
    # fallback below and crash; report a missing value as NaN instead.
    if value is None:
        return float('nan')

    try:
        num = float(value)  # Numeric values and numeric strings convert directly
    except ValueError:
        num = float(w2n.word_to_num(value))  # Fall back to number words
    return abs(num)  # Drop the sign

## Step 4: Preprocess the column in Train and Test Data
Split the `column` field on its delimiter (`;`, `:`, `,` or `_`) into two new columns (`col1` and `col2`), convert their values to floats, and remove negative signs.
##### Sample DataFrame
`data_col = {'column': ['-66.0; 2', 'sixty-two, -3', '-seventy-four: 4', '90_5']}`


`df_col = pd.DataFrame(data_col)`

In [None]:
# Function to preprocess the 'column'
def preprocess_column(df):
    """Split ``df['column']`` into float columns ``col1`` and ``col2``.

    Each value is split once on the first ``;``, ``:``, ``,`` or ``_``
    (plus any whitespace after it). Both parts are converted with
    ``convert_to_float``, so digits and number words are accepted and the
    sign is removed. Missing parts become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'column'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``col1``/``col2`` added and ``'column'``
        dropped.
    """
    # n=1 caps the split at two parts, and reindex guarantees that both parts
    # exist even when no row contains a delimiter. Without these, the
    # two-column assignment fails whenever the split yields a different
    # number of columns.
    parts = (
        df['column']
        .str.split(r'[;:,_]\s*', n=1, expand=True)
        .reindex(columns=[0, 1])
    )

    # Convert both parts the same way; na_action='ignore' leaves missing parts
    # untouched and astype(float) then turns them into NaN.
    for new_col, position in (('col1', 0), ('col2', 1)):
        df[new_col] = (
            parts[position]
            .map(convert_to_float, na_action='ignore')
            .astype(float)
        )

    # Drop the original column
    df.drop(columns=['column'], inplace=True)
    return df

# Run the same 'column' clean-up on both datasets
train_data, test_data = (
    preprocess_column(train_data),
    preprocess_column(test_data),
)

# Preview the processed training set
train_data.head(n=5)

## Step 5: Separate Features and Target
Separate the features (X) and target (y) for the training dataset. For the test dataset, only features are needed.

In [None]:
# Training labels (assumes the label column is named 'Target')
y_train = train_data['Target']
# Training features: every column except the label
X_train = train_data.drop('Target', axis=1)

# Test features: 'Target' is removed only if the test file happens to include it
X_test = test_data.drop('Target', axis=1, errors='ignore')

# Preview the top five rows of the training features
X_train.head(n=5)

## Step 6: Automatically Identify Numerical and Categorical Features
Identify numerical and categorical features in the dataset.

In [None]:
# Group the feature columns by dtype: numeric vs. object (string-like)
numeric_view = X_train.select_dtypes(include='number')
object_view = X_train.select_dtypes(include='object')
numerical_features = numeric_view.columns.to_list()
categorical_features = object_view.columns.to_list()

print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")


## Step 7: Create a Preprocessing Pipeline
Create a preprocessing pipeline for numerical and categorical features.

In [None]:
# Build the transformer that prepares numerical and categorical inputs
def create_column_transformer(numerical_features, categorical_features):
    """Return an unfitted transformer for the given feature lists.

    Numerical columns go through median imputation followed by
    ``RobustScaler``. Categorical columns are encoded twice, once with a
    normalized ``CountEncoder`` (frequency encoding) and once with a
    ``OneHotEncoder`` that ignores unknown categories. Columns in neither
    list are dropped.

    Parameters
    ----------
    numerical_features : list
        Names of the numerical columns.
    categorical_features : list
        Names of the categorical columns.
    """
    # Numerical branch: impute with the median, then scale
    numeric_steps = make_pipeline(
        SimpleImputer(strategy='median'),
        RobustScaler(),
    )

    # Categorical branch: frequency encoding and one-hot encoding side by side
    categorical_encoders = make_column_transformer(
        (CountEncoder(normalize=True), categorical_features),
        (OneHotEncoder(handle_unknown='ignore'), categorical_features),
        remainder="drop",
    )

    # Route each group of columns to its branch; anything else is dropped
    return make_column_transformer(
        (numeric_steps, numerical_features),
        (categorical_encoders, categorical_features),
        remainder="drop",
    )

# Build the (still unfitted) transformer for the detected feature lists
column_transformer = create_column_transformer(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
)

## Step 8: Transform the Training and Test Data
Apply the preprocessing pipeline to the training and test datasets.

In [None]:
# Fit and transform the training data.
# ColumnTransformer returns a SciPy sparse matrix only when the stacked output
# is sparse enough; otherwise it returns a dense ndarray, which has no
# .toarray(). Densify only when the method is available.
encoded_train = column_transformer.fit_transform(X_train)
input_features_train_array = (
    encoded_train.toarray() if hasattr(encoded_train, 'toarray') else encoded_train
)
print("Encoded and Scaled Train dataset:")
# display() is needed here: a bare expression in the middle of a cell is not shown
display(pd.DataFrame(input_features_train_array).head())

# Transform the test data with the transformer fitted on the training data
encoded_test = column_transformer.transform(X_test)
input_features_test_array = (
    encoded_test.toarray() if hasattr(encoded_test, 'toarray') else encoded_test
)
print("Encoded and Scaled Test dataset:")
display(pd.DataFrame(input_features_test_array).head())

## Step 9: Define and Evaluate Models
Define three models (RandomForest, LogisticRegression, and XGBoost) and evaluate them using cross-validation.

In [None]:
# Define models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42)
}

# Evaluate models using 5-fold cross-validation.
# The preprocessing is cross-validated together with each model on the raw
# features, so the imputer, scaler and encoders are fitted on each fold's
# training split only. Scoring the array that was preprocessed on the full
# training set would leak validation-fold statistics into training and
# inflate the scores. cross_val_score clones the pipeline for every fold, so
# the already-fitted `column_transformer` and the estimators in `models` are
# left untouched.
results = {}
for name, model in models.items():
    cv_pipeline = make_pipeline(column_transformer, model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    mean_cv_score = cv_scores.mean()
    results[name] = mean_cv_score
    print(f'{name} Cross-Validation Accuracy: {mean_cv_score:.4f}')

## Step 10: Train the Best Model and Make Predictions
Select the best model, train it on the full training set, and make predictions on the test set.

In [None]:
# Pick the model with the highest mean cross-validation accuracy
best_model_name = max(results.keys(), key=lambda model_name: results[model_name])
best_model = models[best_model_name]

# Refit the winner on all of the training data
best_model.fit(input_features_train_array, y_train)

# Predict the outcome for every test row
test_predictions = best_model.predict(input_features_test_array)

# Pair each test id with its predicted outcome
submission = pd.DataFrame(
    data={
        'id': test_data['id'],
        'outcome': test_predictions,
    }
)

# Write the submission file without the index column
submission.to_csv(path_or_buf='submissions.csv', index=False)

## Step 11: Hyperparameter Tuning (Optional)
To try different hyperparameters for a specific model (e.g., RandomForest), set them manually as shown below; for a systematic search, consider scikit-learn's `GridSearchCV` or `RandomizedSearchCV`.

In [None]:
# RandomForest with manually chosen hyperparameters
tuned_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
tuned_model.fit(input_features_train_array, y_train)

# Make predictions with the tuned model
tuned_predictions = tuned_model.predict(input_features_test_array)

# Evaluate accuracy only when the test set carries true labels.
# `y_test` was never defined earlier in the notebook, so it is taken from the
# test set's 'Target' column when that column exists.
if 'Target' in test_data.columns:
    y_test = test_data['Target']
    accuracy = accuracy_score(y_test, tuned_predictions)
    print(f'Tuned Random Forest Accuracy: {accuracy:.4f}')
else:
    print('Test labels are not available; skipping accuracy evaluation.')