In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier  # Replace with your desired classifier

def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def separate_features_target(df, target_column):
    """Split a DataFrame into its feature columns and its target column.

    Args:
        df: Frame holding both features and target; it is not modified.
        target_column: Label of the column to predict.

    Returns:
        A ``(features, target)`` tuple: ``df`` without ``target_column``, and
        ``df[target_column]``.
    """
    features = df.drop(columns=target_column)
    target = df[target_column]
    return features, target

def identify_features(X):
    """Partition the columns of ``X`` into numerical and categorical groups by dtype.

    Every numeric dtype (not just ``int64``/``float64``) counts as numerical, and both
    ``object`` and ``category`` columns count as categorical. Columns matching neither
    group (e.g. booleans) are returned in neither list.

    Args:
        X: Feature DataFrame.

    Returns:
        A ``(numerical_features, categorical_features)`` tuple of column Index objects.
    """
    # 'number' covers all integer/float widths, so int32/float32 columns are no
    # longer left out and silently dropped by the downstream ColumnTransformer.
    numerical_features = X.select_dtypes(include='number').columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    return numerical_features, categorical_features

def create_preprocessor(numerical_features=None, categorical_features=None):
    """Build the ColumnTransformer that imputes, scales and one-hot encodes features.

    Args:
        numerical_features: Columns sent through median imputation and standard scaling.
        categorical_features: Columns sent through most-frequent imputation and
            one-hot encoding (unknown categories are ignored at transform time).
            When either argument is omitted, the module-level variable of the same
            name is used, so existing zero-argument calls keep working.

    Returns:
        An unfitted ColumnTransformer with a 'num' and a 'cat' branch.

    Raises:
        NameError: If a column list is neither passed in nor defined at module level.
    """
    # Prefer explicit arguments; the module-level fallback only exists for
    # backward compatibility with callers that relied on global state.
    module_scope = globals()
    if numerical_features is None:
        if 'numerical_features' not in module_scope:
            raise NameError("name 'numerical_features' is not defined")
        numerical_features = module_scope['numerical_features']
    if categorical_features is None:
        if 'categorical_features' not in module_scope:
            raise NameError("name 'categorical_features' is not defined")
        categorical_features = module_scope['categorical_features']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

def create_classifier(random_state=None):
    """Create the estimator used as the final pipeline step.

    Args:
        random_state: Optional seed forwarded to RandomForestClassifier so that
            repeated runs give the same model. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        An unfitted RandomForestClassifier.
    """
    # Replace with your desired classifier
    return RandomForestClassifier(random_state=random_state)

def create_pipeline(preprocessor, classifier):
    """Chain a preprocessor and a classifier into a single Pipeline.

    Args:
        preprocessor: Transformer registered under the step name 'preprocessor'.
        classifier: Estimator registered under the step name 'classifier'.

    Returns:
        A Pipeline whose two steps run in that order.
    """
    stages = [('preprocessor', preprocessor), ('classifier', classifier)]
    return Pipeline(steps=stages)

def train_pipeline(pipeline, X_train, y_train):
    """Fit ``pipeline`` on the training split.

    Args:
        pipeline: Object exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        None. The result of ``fit`` is discarded.
    """
    pipeline.fit(X_train, y_train)
    return None

def evaluate_pipeline(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data, print the score and return it.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        The value of ``pipeline.score(X_test, y_test)`` (mean accuracy for
        scikit-learn classifiers).
    """
    accuracy = pipeline.score(X_test, y_test)
    print(f'Model Accuracy: {accuracy}')
    # Return the score as well as printing it, so callers that assign the
    # result receive a number instead of None.
    return accuracy

# End-to-end template: load a CSV, build the preprocessing + classifier pipeline,
# train on an 80/20 split and print the held-out score.

# Load CSV data into a DataFrame
# NOTE(review): 'your_data.csv' looks like a placeholder path - point it at a real file.
file_path = 'your_data.csv'
df = load_data(file_path)

# Separate features and target variable
# NOTE(review): 'target_variable' is presumably a placeholder column name - confirm.
X, y = separate_features_target(df, 'target_variable')

# Identify numerical and categorical features
# These two module-level names are what create_preprocessor() reads when it is
# called without arguments, so they must be assigned before the next step.
numerical_features, categorical_features = identify_features(X)

# Create preprocessor and classifier
preprocessor = create_preprocessor()
classifier = create_classifier()

# Create the pipeline
pipeline = create_pipeline(preprocessor, classifier)

# Split the dataset into training and testing sets
# (20% held out; random_state=42 fixes the shuffle so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline on the training data
train_pipeline(pipeline, X_train, y_train)

# Evaluate the model on the testing data
evaluate_pipeline(pipeline, X_test, y_test)


In [210]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import joblib
import numpy as np
from ucimlrepo import fetch_ucirepo

  

def load_data(file_path):
    """Read a CSV source into a pandas DataFrame.

    Args:
        file_path: Path or file-like object handed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(file_path)
    return frame

def fetch_and_merge_dataset(dataset_id):
    """Fetch a UCI repository dataset and merge features and target into one frame.

    Args:
        dataset_id: Identifier passed to ``fetch_ucirepo(id=...)``.

    Returns:
        A DataFrame with the feature columns followed by the target. A single
        target column (Series, one-column DataFrame or 1-D array-like) is named
        'target'; a multi-column target frame keeps its original column names.

    Raises:
        ValueError: If the fetched dataset exposes no targets.
    """
    # Fetch dataset
    dataset = fetch_ucirepo(id=dataset_id)

    # Extract features and targets
    X = dataset.data.features
    y = dataset.data.targets

    if y is None:
        raise ValueError(f"Dataset {dataset_id} has no target columns to merge")

    # Wrapping a DataFrame in pd.DataFrame({'target': y}) raises ValueError, so
    # reduce a one-column frame to a Series before renaming it.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]
    if isinstance(y, pd.Series):
        y = y.rename('target')
    elif not isinstance(y, pd.DataFrame):
        # Plain 1-D array-likes keep the original construction path.
        y = pd.DataFrame({'target': y})

    # Merge X and y into a single DataFrame
    df = pd.concat([X, y], axis=1)

    return df

def separate_features_target(df, target_column):
    """Split a DataFrame into its feature columns and its target column.

    Args:
        df: Frame holding both features and target; it is not modified.
        target_column: Label of the column to predict.

    Returns:
        A ``(features, target)`` tuple: ``df`` without ``target_column``, and
        ``df[target_column]``.
    """
    features = df.drop(columns=target_column)
    target = df[target_column]
    return features, target

def identify_features(X):
    """Partition the columns of ``X`` into numerical and categorical groups by dtype.

    Every numeric dtype (not just ``int64``/``float64``) counts as numerical, and both
    ``object`` and ``category`` columns count as categorical. Columns matching neither
    group (e.g. booleans) are returned in neither list.

    Args:
        X: Feature DataFrame.

    Returns:
        A ``(numerical_features, categorical_features)`` tuple of column Index objects.
    """
    # 'number' covers all integer/float widths, so int32/float32 columns are no
    # longer left out and silently dropped by the downstream ColumnTransformer.
    numerical_features = X.select_dtypes(include='number').columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    return numerical_features, categorical_features

def handle_duplicates(df):
    """Return ``df`` with duplicated rows removed.

    Uses ``DataFrame.drop_duplicates`` with its default settings; the input
    frame is left untouched and the surviving rows keep their index labels.
    """
    return df.drop_duplicates()

def create_preprocessor(numerical_features=None, categorical_features=None):
    """Build the ColumnTransformer that imputes, scales and one-hot encodes features.

    Args:
        numerical_features: Columns sent through median imputation and standard scaling.
        categorical_features: Columns sent through most-frequent imputation and
            one-hot encoding (unknown categories are ignored at transform time).
            When either argument is omitted, the module-level variable of the same
            name is used, so existing zero-argument calls keep working.

    Returns:
        An unfitted ColumnTransformer with a 'num' and a 'cat' branch.

    Raises:
        NameError: If a column list is neither passed in nor defined at module level.
    """
    # Prefer explicit arguments; the module-level fallback only exists for
    # backward compatibility with callers that relied on global state.
    module_scope = globals()
    if numerical_features is None:
        if 'numerical_features' not in module_scope:
            raise NameError("name 'numerical_features' is not defined")
        numerical_features = module_scope['numerical_features']
    if categorical_features is None:
        if 'categorical_features' not in module_scope:
            raise NameError("name 'categorical_features' is not defined")
        categorical_features = module_scope['categorical_features']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

def create_classifier(random_state=None):
    """Create the estimator used as the final pipeline step.

    Args:
        random_state: Optional seed forwarded to RandomForestClassifier so that
            repeated runs give the same model. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        An unfitted RandomForestClassifier.
    """
    return RandomForestClassifier(random_state=random_state)

def create_pipeline(preprocessor, classifier):
    """Chain a preprocessor and a classifier into a single Pipeline.

    Args:
        preprocessor: Transformer registered under the step name 'preprocessor'.
        classifier: Estimator registered under the step name 'classifier'.

    Returns:
        A Pipeline whose two steps run in that order.
    """
    stages = [('preprocessor', preprocessor), ('classifier', classifier)]
    return Pipeline(steps=stages)

def train_pipeline(pipeline, X_train, y_train):
    """Fit ``pipeline`` on the training split.

    Args:
        pipeline: Object exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        None. The result of ``fit`` is discarded.
    """
    pipeline.fit(X_train, y_train)
    return None

def evaluate_pipeline(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data, print the score and return it.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        The value of ``pipeline.score(X_test, y_test)`` (mean accuracy for
        scikit-learn classifiers).
    """
    accuracy = pipeline.score(X_test, y_test)
    print(f'Model Accuracy: {accuracy}')
    # Return the score as well as printing it, so callers that assign the
    # result receive a number instead of None.
    return accuracy

def save_pipeline(pipeline, file_path):
    """Serialize ``pipeline`` to ``file_path`` with joblib.

    Args:
        pipeline: Object to persist (typically the fitted Pipeline).
        file_path: Destination passed to ``joblib.dump``.

    Returns:
        None. The value returned by ``joblib.dump`` is discarded.
    """
    joblib.dump(value=pipeline, filename=file_path)
    return None



def load_pipeline(file_path):
    """Load an object previously written with joblib from ``file_path``.

    joblib files are pickle-based, so only load files from a trusted source:
    unpickling can execute arbitrary code.

    Args:
        file_path: Location passed to ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs.
    """
    restored = joblib.load(file_path)
    return restored

def predict_with_pipeline(pipeline, new_data):
    """Run ``pipeline.predict`` on ``new_data`` and return the predictions.

    Args:
        pipeline: Fitted object exposing ``predict(X)``.
        new_data: Samples to predict, in the layout the pipeline was trained on.

    Returns:
        The value returned by ``pipeline.predict(new_data)``.
    """
    predictions = pipeline.predict(new_data)
    return predictions


In [184]:
from ucimlrepo import fetch_ucirepo 
# Train and persist the pipeline on UCI dataset 890 (bound to the name `aids`).
# fetch dataset 
aids = fetch_ucirepo(id=890) 
# data (as pandas dataframes) 
X_ = aids.data.features 
y_ = aids.data.targets 
# Merge X and y into a single DataFrame
df = pd.concat([X_, y_], axis=1)
# Drop duplicated rows before splitting
df = handle_duplicates(df)
# Separate features and target variable ('cid' is the target column)
X, y = separate_features_target(df, 'cid')
# Identify numerical and categorical features
# (module-level names; create_preprocessor() reads them when called without arguments)
numerical_features, categorical_features = identify_features(X)
# Create preprocessor and classifier
preprocessor = create_preprocessor()
classifier = create_classifier()
# Create the pipeline
pipeline = create_pipeline(preprocessor, classifier)
# Split the dataset into training and testing sets
# (20% held out; random_state=42 fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the pipeline on the training data
train_pipeline(pipeline, X_train, y_train)
# Evaluate the model on the testing data
evaluate_pipeline(pipeline, X_test, y_test)
# Save the trained pipeline to a file
save_pipeline(pipeline, 'trained_pipeline.joblib')

Model Accuracy: 0.8785046728971962


In [189]:
def _build_preprocessor(numerical_features, categorical_features):
    """Build the impute/scale/one-hot ColumnTransformer for the given column lists."""
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])


def train_and_evaluate_model(data_source, target_column, save_model_filename):
    """Fetch a UCI dataset, train a random-forest pipeline, score it and save it.

    Args:
        data_source: Dataset id passed to ``fetch_ucirepo(id=...)``.
        target_column: Name of the column to predict.
        save_model_filename: Path the fitted pipeline is written to.

    Returns:
        The score of the fitted pipeline on the 20% held-out split.
    """
    # Fetch dataset
    dataset = fetch_ucirepo(id=data_source)

    X_ = dataset.data.features
    y_ = dataset.data.targets
    # Merge X and y into a single DataFrame
    df = pd.concat([X_, y_], axis=1)
    df = handle_duplicates(df)

    # Separate features and target variable
    X, y = separate_features_target(df, target_column)

    # Identify numerical and categorical features
    numerical_features, categorical_features = identify_features(X)

    # Build the preprocessor from THIS dataset's columns. A zero-argument
    # create_preprocessor() call would take its column lists from module-level
    # variables, which may be missing or belong to a different dataset.
    preprocessor = _build_preprocessor(numerical_features, categorical_features)
    classifier = RandomForestClassifier()
    # Create the pipeline
    pipeline = create_pipeline(preprocessor, classifier)
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Train the pipeline on the training data
    train_pipeline(pipeline, X_train, y_train)
    # Evaluate the model on the testing data
    accuracy = evaluate_pipeline(pipeline, X_test, y_test)
    if accuracy is None:
        # evaluate_pipeline may only print the score; compute it directly so
        # the caller always receives a number.
        accuracy = pipeline.score(X_test, y_test)
    # Save the trained pipeline to a file
    save_pipeline(pipeline, save_model_filename)

    return accuracy

Model Accuracy: 0.8878504672897196


In [220]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib
import numpy as np
from ucimlrepo import fetch_ucirepo
import matplotlib.pyplot as plt

def separate_features_target(df, target_column):
    """Split a DataFrame into its feature columns and its target column.

    Args:
        df: Frame holding both features and target; it is not modified.
        target_column: Label of the column to predict.

    Returns:
        A ``(features, target)`` tuple: ``df`` without ``target_column``, and
        ``df[target_column]``.
    """
    features = df.drop(columns=target_column)
    target = df[target_column]
    return features, target

def identify_features(X):
    """Partition the columns of ``X`` into numerical and categorical groups by dtype.

    Every numeric dtype (not just ``int64``/``float64``) counts as numerical, and both
    ``object`` and ``category`` columns count as categorical. Columns matching neither
    group (e.g. booleans) are returned in neither list.

    Args:
        X: Feature DataFrame.

    Returns:
        A ``(numerical_features, categorical_features)`` tuple of column Index objects.
    """
    # 'number' covers all integer/float widths, so int32/float32 columns are no
    # longer left out and silently dropped by the downstream ColumnTransformer.
    numerical_features = X.select_dtypes(include='number').columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    return numerical_features, categorical_features

def handle_duplicates(df):
    """Return ``df`` with duplicated rows removed.

    Uses ``DataFrame.drop_duplicates`` with its default settings; the input
    frame is left untouched and the surviving rows keep their index labels.
    """
    return df.drop_duplicates()

def create_preprocessor(numerical_features=None, categorical_features=None):
    """Build the ColumnTransformer that imputes, scales and one-hot encodes features.

    Args:
        numerical_features: Columns sent through median imputation and standard scaling.
        categorical_features: Columns sent through most-frequent imputation and
            one-hot encoding (unknown categories are ignored at transform time).
            When either argument is omitted, the module-level variable of the same
            name is used, so existing zero-argument calls keep working.

    Returns:
        An unfitted ColumnTransformer with a 'num' and a 'cat' branch.

    Raises:
        NameError: If a column list is neither passed in nor defined at module level.
    """
    # Prefer explicit arguments; the module-level fallback only exists for
    # backward compatibility with callers that relied on global state.
    module_scope = globals()
    if numerical_features is None:
        if 'numerical_features' not in module_scope:
            raise NameError("name 'numerical_features' is not defined")
        numerical_features = module_scope['numerical_features']
    if categorical_features is None:
        if 'categorical_features' not in module_scope:
            raise NameError("name 'categorical_features' is not defined")
        categorical_features = module_scope['categorical_features']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor

def create_classifier(random_state=None):
    """Create the estimator used as the final pipeline step.

    Args:
        random_state: Optional seed forwarded to RandomForestClassifier so that
            repeated runs give the same model. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        An unfitted RandomForestClassifier.
    """
    return RandomForestClassifier(random_state=random_state)

def create_pipeline(preprocessor, classifier):
    """Chain a preprocessor and a classifier into a single Pipeline.

    Args:
        preprocessor: Transformer registered under the step name 'preprocessor'.
        classifier: Estimator registered under the step name 'classifier'.

    Returns:
        A Pipeline whose two steps run in that order.
    """
    stages = [('preprocessor', preprocessor), ('classifier', classifier)]
    return Pipeline(steps=stages)

def train_pipeline(pipeline, X_train, y_train):
    """Fit ``pipeline`` on the training split.

    Args:
        pipeline: Object exposing ``fit(X, y)``; it is fitted in place.
        X_train: Training features.
        y_train: Training labels.

    Returns:
        None. The result of ``fit`` is discarded.
    """
    pipeline.fit(X_train, y_train)
    return None

def evaluate_pipeline(pipeline, X_test, y_test):
    """Score a fitted pipeline on held-out data, print the score and return it.

    Args:
        pipeline: Fitted object exposing ``score(X, y)``.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        The value of ``pipeline.score(X_test, y_test)`` (mean accuracy for
        scikit-learn classifiers).
    """
    accuracy = pipeline.score(X_test, y_test)
    print(f'Model Accuracy: {accuracy}')
    # Return the score as well as printing it, so callers that assign the
    # result receive a number instead of None.
    return accuracy

def save_pipeline(pipeline, file_path):
    """Serialize ``pipeline`` to ``file_path`` with joblib.

    Args:
        pipeline: Object to persist (typically the fitted Pipeline).
        file_path: Destination passed to ``joblib.dump``.

    Returns:
        None. The value returned by ``joblib.dump`` is discarded.
    """
    joblib.dump(value=pipeline, filename=file_path)
    return None

def load_pipeline(file_path):
    """Load an object previously written with joblib from ``file_path``.

    joblib files are pickle-based, so only load files from a trusted source:
    unpickling can execute arbitrary code.

    Args:
        file_path: Location passed to ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs.
    """
    restored = joblib.load(file_path)
    return restored

def predict_with_pipeline(pipeline, new_data):
    """Run ``pipeline.predict`` on ``new_data`` and return the predictions.

    Args:
        pipeline: Fitted object exposing ``predict(X)``.
        new_data: Samples to predict, in the layout the pipeline was trained on.

    Returns:
        The value returned by ``pipeline.predict(new_data)``.
    """
    predictions = pipeline.predict(new_data)
    return predictions

In [221]:
def load(id, output_filename):
    """Download a UCI repository dataset and write it to a CSV file.

    Args:
        id: Dataset identifier passed to ``fetch_ucirepo(id=...)``.
        output_filename: Destination CSV path.

    Returns:
        None. The merged frame is only written to disk.
    """
    fetched = fetch_ucirepo(id=id)
    # Features first, target column(s) appended on the right.
    merged = pd.concat([fetched.data.features, fetched.data.targets], axis=1)
    # index=False keeps the row index out of the file.
    merged.to_csv(output_filename, index=False)


def _build_preprocessor(numerical_features, categorical_features):
    """Build the impute/scale/one-hot ColumnTransformer for the given column lists."""
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    return ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])


def train_and_evaluate_model(file_path, target_column, save_model_filename, classifier=None):
    """Train a pipeline on a CSV file, score it on a held-out split and save it.

    Args:
        file_path: CSV file containing the features and the target column.
        target_column: Name of the column to predict.
        save_model_filename: Path the fitted pipeline is written to.
        classifier: Estimator for the final step; a RandomForestClassifier is
            created when this is None (the default).

    Returns:
        The score of the fitted pipeline on the 20% held-out split.
    """
    df = pd.read_csv(file_path)
    df = handle_duplicates(df)
    # Separate features and target variable
    X, y = separate_features_target(df, target_column)
    # Identify numerical and categorical features
    numerical_features, categorical_features = identify_features(X)
    # Build the preprocessor from THIS file's columns. A zero-argument
    # create_preprocessor() call would take its column lists from module-level
    # variables, which may be missing or belong to a different dataset.
    preprocessor = _build_preprocessor(numerical_features, categorical_features)
    # Use the specified classifier or default to RandomForestClassifier
    if classifier is None:
        classifier = RandomForestClassifier()
    # Create the pipeline
    pipeline = create_pipeline(preprocessor, classifier)
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Train the pipeline on the training data
    train_pipeline(pipeline, X_train, y_train)
    # Evaluate the model on the testing data
    accuracy = evaluate_pipeline(pipeline, X_test, y_test)
    if accuracy is None:
        # evaluate_pipeline may only print the score; compute it directly so
        # the caller always receives a number.
        accuracy = pipeline.score(X_test, y_test)
    # Save the trained pipeline to a file
    save_pipeline(pipeline, save_model_filename)
    return accuracy


def predict_save_csv(model_filename, new_data_filename, output_filename):
    """Predict labels for a CSV of new samples and write them to another CSV.

    Args:
        model_filename: Path of the saved pipeline, loaded via ``load_pipeline``.
        new_data_filename: CSV file with the samples to predict.
        output_filename: Destination CSV; it receives a single
            'Predicted_Label' column and no index.

    Returns:
        The DataFrame of predictions that was written to ``output_filename``.
    """
    # Load the pre-trained pipeline
    loaded_pipeline = load_pipeline(model_filename)
    # Read the new data
    new_data = pd.read_csv(new_data_filename)
    # Predict with the loaded pipeline
    predictions = predict_with_pipeline(loaded_pipeline, new_data)
    predictions_df = pd.DataFrame(predictions, columns=['Predicted_Label'])
    # Save predictions to a CSV file
    predictions_df.to_csv(output_filename, index=False)
    print("prediction is done")
    # Hand the predictions back so callers that assign the result get data, not None.
    return predictions_df

In [222]:
# Download UCI dataset 890 and cache it locally as train_data.csv
# (features and target columns merged into one file by load()).
id_to_fetch = 890
output_csv_filename = 'train_data.csv'
load(id_to_fetch, output_csv_filename)

In [223]:
# Train on the cached CSV with 'cid' as the target and persist the fitted
# pipeline to trained_pipeline.joblib.
file_path = './train_data.csv'
target_column = 'cid'
save_model_filename = 'trained_pipeline.joblib'
# The call is the cell's last expression, so its return value is what the notebook displays.
train_and_evaluate_model(file_path, target_column, save_model_filename, classifier=RandomForestClassifier())

Model Accuracy: 0.8808411214953271


'{classifier} model accuracy {accuracy}'

In [209]:
# Score new samples with the saved pipeline and write the labels to predictions.csv.
# NOTE(review): 'test_data.csv' is not created anywhere in the visible cells - confirm it exists.
model_filename = 'trained_pipeline.joblib'
new_data_filename = 'test_data.csv'
output_filename = 'predictions.csv'
# `predictions` holds whatever predict_save_csv returns; the labels themselves
# are written to output_filename by that call.
predictions = predict_save_csv(model_filename, new_data_filename, output_filename)
print(f'Predictions on new data saved to {output_filename}')

prediction is done
Predictions on new data saved to predictions.csv
