# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

# Data cleaning and preprocessing
def preprocess_data(data):
    """Clean a DataFrame and return a transformed copy.

    Steps, in order:
      1. Impute missing values (column mean for numeric columns, most
         frequent value otherwise); columns with no observed value are
         dropped because nothing can be imputed for them.
      2. Drop duplicate rows.
      3. Clip numeric values to the 1.5 * IQR fences.
      4. Apply ``log1p`` to numeric columns whose skewness exceeds 1 in
         absolute value.
      5. One-hot encode object columns (first level dropped).
      6. Standardize numeric columns to zero mean and unit variance.

    Every column of ``data`` is transformed; the function has no notion of
    a target column.

    Args:
        data: input DataFrame. It is not modified.

    Returns:
        A new DataFrame. Surviving rows keep their original index labels.
    """
    # Work on a copy so the caller's frame is never modified in place.
    data = data.copy()

    # Handle missing values
    if data.isnull().any().any():
        print("Missing values found. Handling missing values...")
        data = data.dropna(axis=1, how="all")
        for col in data.columns:
            if not data[col].isnull().any():
                continue
            # A mean only exists for numeric data; other columns fall back to
            # their most frequent value instead of raising.
            if pd.api.types.is_numeric_dtype(data[col]):
                fill_value = data[col].mean()
            else:
                fill_value = data[col].mode().iloc[0]
            data[col] = data[col].fillna(fill_value)

    # Remove duplicates
    if data.duplicated().any():
        print("Duplicates found. Removing duplicates...")
        data = data.drop_duplicates()

    # Check for outliers using IQR
    for col in data.select_dtypes(include=[np.number]).columns:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        if iqr == 0:
            # Both fences collapse onto one value; clipping would turn the
            # whole column (e.g. a sparse 0/1 flag) into a constant.
            continue
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        if ((data[col] < lower_bound) | (data[col] > upper_bound)).any():
            print(f"Outliers found in column {col}. Handling outliers...")
            data[col] = data[col].clip(lower=lower_bound, upper=upper_bound)

    # Remove skewness using log transformation
    for col in data.select_dtypes(include=[np.number]).columns:
        if abs(data[col].skew()) > 1:
            print(f"Skewness found in column {col}. Applying log transformation...")
            # log1p is undefined for values <= -1, so shift columns that
            # contain negatives to start at zero before transforming.
            offset = min(data[col].min(), 0)
            data[col] = np.log1p(data[col] - offset)

    # Encode categorical data
    categorical_cols = data.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print("Categorical columns found. Encoding categorical data...")
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Scale numerical data
    numerical_cols = data.select_dtypes(include=[np.number]).columns
    if len(numerical_cols) > 0:
        numeric_block = data[numerical_cols]
        # Population standard deviation (ddof=0); constant columns are only
        # centered, which avoids a division by zero.
        scale = numeric_block.std(ddof=0).replace(0, 1)
        data[numerical_cols] = (numeric_block - numeric_block.mean()) / scale

    return data

# Split data into training, validation, and test sets
def split_data(data, target_col):
    """Split ``data`` into train/validation/test features and targets.

    20% of the rows are held out as the test set; 25% of the remainder
    becomes the validation set. Both splits use ``random_state=42``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    features, target = data.drop(target_col, axis=1), data[target_col]

    # First carve off the test set ...
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )
    # ... then split what is left into training and validation parts.
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=42
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

# Train and evaluate regression models
def evaluate_regression_models(X_train, X_val, y_train, y_val):
    """Fit each candidate regressor and report its validation MSE.

    Returns:
        dict mapping model name to the mean squared error on ``(X_val, y_val)``.
    """
    candidates = [
        ("Linear Regression", LinearRegression()),
        ("Decision Tree", DecisionTreeRegressor()),
        ("Random Forest", RandomForestRegressor()),
        ("Gradient Boosting", GradientBoostingRegressor()),
        ("SVR", SVR()),
        ("KNN", KNeighborsRegressor()),
    ]

    scores = {}
    for label, estimator in candidates:
        print(f"Training {label}...")
        estimator.fit(X_train, y_train)
        error = mean_squared_error(y_val, estimator.predict(X_val))
        scores[label] = error
        print(f"{label} - MSE: {error}")
    return scores

# Train and evaluate classification models
def evaluate_classification_models(X_train, X_val, y_train, y_val):
    """Fit each candidate classifier and score it on the validation set.

    Precision, recall and F1 use the binary definition (positive label 1)
    whenever that is valid for the observed labels; otherwise they are
    averaged over classes weighted by support.

    Returns:
        dict mapping model name to a dict with the keys ``"Accuracy"``,
        ``"Precision"``, ``"Recall"`` and ``"F1-Score"``.
    """
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "SVC": SVC(),
        "KNN": KNeighborsClassifier()
    }

    # The sklearn default average='binary' raises for multiclass targets and
    # for two-class targets that do not contain the positive label 1. Only
    # those cases switch to weighted averaging, so results are unchanged
    # wherever the binary metrics were already valid.
    labels = set(
        np.unique(np.concatenate([np.asarray(y_train), np.asarray(y_val)])).tolist()
    )
    binary_is_valid = len(labels) < 2 or (len(labels) == 2 and 1 in labels)
    average = "binary" if binary_is_valid else "weighted"

    results = {}
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred, average=average)
        recall = recall_score(y_val, y_pred, average=average)
        f1 = f1_score(y_val, y_pred, average=average)
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1
        }
        print(f"{name} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-Score: {f1}")
    return results

# Check for overfitting or underfitting
def check_overfitting(model, X_train, X_test, y_train, y_test):
    """Print ``model.score`` on both sets and a verdict on the gap.

    A training score more than 0.1 above the test score is reported as
    possible overfitting; one more than 0.1 below it as possible
    underfitting. Nothing is returned.
    """
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print(f"Training Accuracy: {train_score}")
    print(f"Testing Accuracy: {test_score}")

    if train_score > test_score + 0.1:
        verdict = "Warning: Model may be overfitting."
    elif train_score < test_score - 0.1:
        verdict = "Warning: Model may be underfitting."
    else:
        verdict = "Model is generalizing well."
    print(verdict)

# Deploy the best model
def deploy_model(model, file_path):
    """Serialize ``model`` to ``file_path`` with joblib and print the location."""
    joblib.dump(value=model, filename=file_path)
    confirmation = f"Model deployed to {file_path}"
    print(confirmation)

# Main function
def main(file_path, target_col):
    """Run the pipeline end to end: load, preprocess, select, refit, save.

    The target column is separated from the features before preprocessing,
    so labels are never clipped, log-transformed, scaled or one-hot encoded.
    A numeric target with more than 10 distinct values is treated as
    regression; anything else as classification. The model with the best
    validation score is refit on the training and validation rows together,
    compared against the held-out test set, and written to
    ``best_model.pkl``.

    Args:
        file_path: path of the CSV dataset.
        target_col: name of the column to predict.

    Raises:
        KeyError: if ``target_col`` is not a column of the dataset.
    """
    # Load data
    data = load_data(file_path)
    if target_col not in data.columns:
        raise KeyError(f"Target column {target_col!r} not found in dataset.")

    # Rows without a label cannot be used for supervised training. Resetting
    # the index keeps labels aligned with the rows that come back from
    # preprocessing.
    data = data.dropna(subset=[target_col]).reset_index(drop=True)
    target = data[target_col]

    # Preprocess features only, then re-attach the untouched labels for the
    # rows that survived (preprocessing may drop duplicate rows).
    features = preprocess_data(data.drop(columns=[target_col]))
    data = pd.concat([features, target.loc[features.index]], axis=1)

    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(data, target_col)

    # Determine if the problem is regression or classification (decided once,
    # on the raw labels).
    is_regression = pd.api.types.is_numeric_dtype(target) and target.nunique() > 10

    if is_regression:
        print("Regression problem detected.")
        results = evaluate_regression_models(X_train, X_val, y_train, y_val)
        best_model_name = min(results, key=results.get)
        print(f"Best model: {best_model_name} with MSE: {results[best_model_name]}")
        factories = {
            "Linear Regression": LinearRegression,
            "Decision Tree": DecisionTreeRegressor,
            "Random Forest": RandomForestRegressor,
            "Gradient Boosting": GradientBoostingRegressor,
            "SVR": SVR,
        }
        fallback = KNeighborsRegressor
    else:
        print("Classification problem detected.")
        results = evaluate_classification_models(X_train, X_val, y_train, y_val)
        best_model_name = max(results, key=lambda x: results[x]['Accuracy'])
        print(f"Best model: {best_model_name} with Accuracy: {results[best_model_name]['Accuracy']}")
        factories = {
            "Logistic Regression": LogisticRegression,
            "Decision Tree": DecisionTreeClassifier,
            "Random Forest": RandomForestClassifier,
            "Gradient Boosting": GradientBoostingClassifier,
            "SVC": SVC,
        }
        fallback = KNeighborsClassifier

    # Train the best model on the full training set: model selection is
    # done, so the validation rows are folded back into the training data.
    X_fit = pd.concat([X_train, X_val])
    y_fit = pd.concat([y_train, y_val])
    # Any name not in the table falls back to KNN.
    best_model = factories.get(best_model_name, fallback)()
    best_model.fit(X_fit, y_fit)

    # Check for overfitting or underfitting
    check_overfitting(best_model, X_fit, X_test, y_fit, y_test)

    # Deploy the best model
    deploy_model(best_model, "best_model.pkl")

# Run the script
if __name__ == "__main__":
    # Both values are read interactively from standard input.
    file_path = input("please upload the dataset:")
    target_col = input("target_column:")
    main(file_path=file_path, target_col=target_col)