# In [None]:  (notebook cell marker left over from the Jupyter export)
import pandas as pd
from word2number import w2n
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from category_encoders import CountEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np  # Import numpy

# Step 2: Read the training and test sets.
# Both CSV files are looked up relative to the current working directory;
# change the file names below to point at your own data.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')

# Notebook-style preview of the first training rows (only rendered when this
# expression is the last one in a notebook cell).
train_data.head()

# Step 3: Helper that converts number words to floats (disabled: this dataset contains no number words, so it is left commented out)
#def convert_to_float(value):
#    try:
#        num = float(value)  # Convert directly if it's a number
#    except ValueError:
#        num = float(w2n.word_to_num(value))  # Convert word to number
#    return abs(num)  # Remove negative sign

# Step 4: Preprocess the `column` field in the train and test data.
# Assumes the raw data may carry a delimited text column literally named
# 'column'; adjust the name and parsing to your actual data.
def preprocess_column(df):
    """Split the delimited ``'column'`` field into numeric ``col1``/``col2``.

    Every value of ``df['column']`` is split on ``;``, ``:``, ``,`` or ``_``
    (together with any whitespace that follows the delimiter). The first two
    pieces are converted to float, their sign is removed with ``abs()``, and
    they are stored as ``col1`` and ``col2``. The source column is then
    dropped. ``df`` is modified in place.

    The split result is normalised to exactly two pieces, so the function no
    longer fails when the data does not split into exactly two columns:

    * a value without any delimiter yields NaN in ``col2``;
    * pieces after the second one (for example the empty piece produced by a
      trailing delimiter) are ignored.

    If ``df`` has no ``'column'`` field, a notice is printed and ``df`` is
    returned untouched.

    Args:
        df: DataFrame to preprocess.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If one of the first two pieces cannot be parsed as a
            float (for example an empty or non-numeric string).
    """
    if 'column' not in df.columns:
        print("'column' not found in the dataframe.  Skipping preprocessing.")
        return df

    # expand=True yields one column per piece, labelled 0, 1, 2, ...;
    # reindex pads a missing second piece with NaN and discards extras so the
    # assignments below always see exactly two columns.
    pieces = (
        df['column']
        .str.split(r'[;:,_]\s*', expand=True)
        .reindex(columns=[0, 1])
    )

    # Convert to float and remove negative signs.
    df['col1'] = pieces[0].astype(float).abs()
    df['col2'] = pieces[1].astype(float).abs()

    # The raw text column is no longer needed once it has been parsed.
    df.drop(columns=['column'], inplace=True)
    return df

# Preprocess the train and test datasets with the same column-splitting step
# so both end up with the same feature layout.
train_data = preprocess_column(train_data)
test_data = preprocess_column(test_data)

# Display the processed train dataset (rendered only as the last expression
# of a notebook cell).
train_data.head()

# Step 5: Separate Features and Target
# 'label' is assumed to be the name of the target column in the training data.
y_train = train_data['label']
X_train = train_data.drop(columns=['label'])

# Test data normally has no target column. Drop it when it is present so the
# test features match the columns the preprocessing is fitted on;
# errors='ignore' makes this a no-op when 'label' is absent.
X_test = test_data.drop(columns=['label'], errors='ignore')

# Display the first few rows of X_train
X_train.head()

# Step 6: Automatically Identify Numerical and Categorical Features
# Columns are classified by dtype: any numeric dtype counts as numerical and
# 'object' columns count as categorical. Other dtypes fall in neither list.
numerical_features = X_train.select_dtypes(include=['number']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

# Step 7: Create a Preprocessing Pipeline
def create_column_transformer(numerical_features, categorical_features):
    """Assemble the preprocessing transformer for the given feature lists.

    Numerical columns are imputed with the median and then scaled with
    ``RobustScaler``. When ``categorical_features`` is non-empty, those
    columns are encoded twice -- once with a normalized ``CountEncoder``
    (frequency encoding) and once with ``OneHotEncoder`` (unknown categories
    ignored) -- and both encodings are kept. Columns that appear in neither
    list are passed through unchanged.

    Args:
        numerical_features: Column names to impute and scale.
        categorical_features: Column names to frequency- and one-hot-encode;
            an empty list skips the categorical branch entirely.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    # Numerical branch: median imputation followed by robust scaling.
    numeric_steps = make_pipeline(
        SimpleImputer(strategy='median'),
        RobustScaler(),
    )
    branches = [(numeric_steps, numerical_features)]

    # Categorical branch, built only when there is something to encode.
    # NOTE: the original author flagged frequency and one-hot encoding as less
    # suited to regression tasks; revisit these encoders for your data.
    if categorical_features:
        encoders = make_column_transformer(
            (CountEncoder(normalize=True), categorical_features),
            (OneHotEncoder(handle_unknown='ignore'), categorical_features),
            remainder="drop",  # the inner transformer only emits encoded columns
        )
        branches.append((encoders, categorical_features))

    # passthrough keeps any column that is in neither feature list.
    return make_column_transformer(*branches, remainder="passthrough")

# Initialize the column transformer
column_transformer = create_column_transformer(numerical_features, categorical_features)

# Step 8: Transform the Training and Test Data
# The transformer is fitted on the training features only and then reused,
# unchanged, for the test features.
input_features_train_array = column_transformer.fit_transform(X_train)
print("Encoded and Scaled Train dataset:")
# Printed explicitly: a bare `.head()` expression shows nothing when this
# file runs as a script or when it is not the last expression of a cell.
print(pd.DataFrame(input_features_train_array).head())

input_features_test_array = column_transformer.transform(X_test)
print("Encoded and Scaled Test dataset:")
print(pd.DataFrame(input_features_test_array).head())

# Step 9: Define and Evaluate the Model (regression, scored with R2)
# RandomForestRegressor is the single model used; random_state fixes the seed.
model = RandomForestRegressor(random_state=42)

# Evaluate with 5-fold cross-validation.
# The preprocessing is cross-validated together with the model on the raw
# features, so the imputer, scaler and encoders are re-fitted on each training
# fold only. Scoring the pre-transformed array instead would leak statistics
# from the validation rows into every fold. cross_val_score clones the
# pipeline, so `column_transformer` and `model` themselves are left as they are.
cv_pipeline = make_pipeline(column_transformer, model)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='r2')
mean_cv_score = cv_scores.mean()
print(f'Random Forest Cross-Validation R2 Score: {mean_cv_score:.4f}')

# Step 10: Train the Model on All Training Data and Make Predictions
model.fit(input_features_train_array, y_train)

test_predictions = model.predict(input_features_test_array)

# The submission holds one 'label' column with the predictions in test-row
# order ('label' is assumed to be the expected target column name).
submission = pd.DataFrame({'label': test_predictions})
submission.to_csv('submissions.csv', index=False)
print("Submission file has been saved successfully.")


# Step 11: Feature Importance

importances = model.feature_importances_

# The model was trained on the transformed matrix, so its importances line up
# with the transformer's output columns, not with X_train.columns: encoding
# changes the number of columns, and the transformer emits the numerical
# branch first. Ask the fitted transformer for its output names.
try:
    features = list(column_transformer.get_feature_names_out())
except (AttributeError, TypeError, ValueError):
    # Raised when the installed scikit-learn or one of the encoders cannot
    # report output feature names.
    features = []

# Fall back to positional names rather than mislabelling or failing on a
# length mismatch.
if len(features) != len(importances):
    features = [f'feature_{position}' for position in range(len(importances))]

feature_importances = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importances = feature_importances.sort_values('Importance', ascending=False).head(10)  # Keep the top 10
print("\nFeature Importances:")
print(feature_importances)

# Plotting: horizontal bar chart of the top features.
plt.figure(figsize=(10, 6))
plt.barh(feature_importances['Feature'], feature_importances['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importances')
plt.show()