In [43]:
# Import necessary modules
import data_preprocessor as dp
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 1. Load the dataset
# Read the raw CSV through a path relative to the working directory (the same
# ../Data folder that clean_data.csv is written to) instead of an absolute
# path that only exists on one machine.
messy_data = pd.read_csv("../Data/messy_data.csv")
# Keep the raw frame untouched; later steps operate on this copy.
clean_data = messy_data.copy()


# 2. Preprocess the data
# a. Impute missing values
def impute_missing_values(data, strategy='mean'):
    """Fill missing values in the dataset.

    :param data: pandas DataFrame to impute; it is not modified
    :param strategy: 'mean' or 'median' fill numeric columns with that
        column statistic (non-numeric columns are left as they are);
        'mode' fills every column with its most frequent value (the first
        one returned by ``DataFrame.mode`` when several tie)
    :return: new pandas DataFrame with the missing values filled
    :raises ValueError: if ``strategy`` is not 'mean', 'median' or 'mode'
    """
    if strategy == 'mean':
        return data.fillna(data.mean(numeric_only=True))
    if strategy == 'median':
        return data.fillna(data.median(numeric_only=True))
    if strategy == 'mode':
        modes = data.mode()
        # mode() has no rows when the frame is empty or entirely NaN; there
        # is nothing to fill with, and iloc[0] would raise IndexError.
        if modes.empty:
            return data.copy()
        return data.fillna(modes.iloc[0])
    # An unrecognised strategy used to return the data unchanged, which hid
    # typos such as 'avg'; fail loudly instead.
    raise ValueError(
        f"Unknown imputation strategy {strategy!r}; "
        "expected 'mean', 'median' or 'mode'."
    )

# b. Remove duplicates
def remove_duplicates(data):
    """Return the dataset without repeated rows.

    :param data: pandas DataFrame
    :return: new pandas DataFrame in which each distinct row appears once
        (pandas keeps the first occurrence and its original index label)
    """
    return data.drop_duplicates()

# c. Normalize numerical features
def normalize_data(data, method='minmax'):
    """Apply normalization to numerical features.

    :param data: pandas DataFrame; it is left unmodified
    :param method: 'minmax' (MinMaxScaler) or 'standard' (StandardScaler)
    :return: copy of ``data`` whose int64/float64 columns have been rescaled;
        all other columns are carried over unchanged
    :raises ValueError: if ``method`` is not 'minmax' or 'standard'
    """
    if method not in ('minmax', 'standard'):
        # An unknown method used to leave ``scaler`` unbound and fail further
        # down with an unrelated UnboundLocalError.
        raise ValueError(
            f"Unknown normalization method {method!r}; "
            "expected 'minmax' or 'standard'."
        )

    # Work on a copy so the caller's frame (often a slice of another frame)
    # is not rewritten in place.
    scaled = data.copy()
    numerical_features = scaled.select_dtypes(include=['int64', 'float64']).columns

    # Nothing to scale: the scalers reject input with no rows or no columns.
    if scaled.empty or len(numerical_features) == 0:
        return scaled

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    scaled[numerical_features] = scaler.fit_transform(scaled[numerical_features])
    return scaled

# d. Remove highly correlated features
def remove_redundant_features(data, threshold=0.9):
    """
    Drop numeric features that are strongly correlated with an earlier one.

    For every pair of numeric columns the absolute correlation is computed;
    the later column of a pair is dropped when that value exceeds
    ``threshold``. Non-numeric columns are never dropped.

    :param data: pandas DataFrame (left unmodified)
    :param threshold: float, absolute-correlation cut-off
    :return: pandas DataFrame without the redundant columns
    """
    frame = data.copy()

    # Correlations are only defined for the numeric columns.
    abs_corr = frame.select_dtypes(include=[np.number]).corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is never compared with itself.
    triangle_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_triangle = abs_corr.where(triangle_mask)

    redundant = []
    for name in upper_triangle.columns:
        if any(upper_triangle[name] > threshold):
            redundant.append(name)

    # Drop from the full frame so non-numeric columns are preserved.
    return frame.drop(columns=redundant, errors='ignore')

# 3. Save the cleaned dataset
# NOTE(review): none of the preprocessing functions defined above is called
# before this point, so clean_data is still the unmodified copy of messy_data
# when it is written out. index=False keeps the row index out of the CSV.
clean_data.to_csv('../Data/clean_data.csv', index=False)


# 4. Train and evaluate the model
# This calls simple_model from the imported data_preprocessor module (dp), not
# the simple_model that is defined directly below this line.
dp.simple_model(clean_data)
def simple_model(input_data, split_data=True, scale_data=False, print_report=False):
    """
    A simple logistic regression model for target classification.
    Parameters:
    input_data (pd.DataFrame): The input data containing features and the target variable 'target' (assume 'target' is the first column). The frame is not modified.
    split_data (bool): Whether to split the data into training and testing sets. If False, the model is fitted and evaluated on the full data. Default is True.
    scale_data (bool): Whether to scale the features using StandardScaler. Default is False.
    print_report (bool): Whether to print the classification report. Default is False.
    Returns:
    None
    The function performs the following steps:
    1. Removes rows with missing data.
    2. Splits the input data into features and target.
    3. Encodes categorical features using one-hot encoding.
    4. Splits the data into training and testing sets (if split_data is True).
    5. Scales the features using StandardScaler (if scale_data is True).
    6. Instantiates and fits a logistic regression model.
    7. Makes predictions on the test set.
    8. Evaluates the model using accuracy score and classification report.
    9. Prints the accuracy and classification report (if print_report is True).
    """
    # dropna() without inplace returns a new frame, so the caller's data is
    # left as it was.
    complete = input_data.dropna()

    # split the data into features and target (target is the first column)
    target = complete[complete.columns[0]]
    features = complete[complete.columns[1:]]

    # if a column is not numeric, encode it (one-hot); the encoded columns
    # are appended after the remaining ones, prefixed with the source name
    object_cols = list(features.select_dtypes(include=['object']).columns)
    if object_cols:
        features = pd.get_dummies(features, columns=object_cols)

    if split_data:
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, stratify=target, random_state=42
        )
    else:
        # No hold-out set requested: fit and evaluate on the same rows.
        X_train, X_test, y_train, y_test = features, features, target, target

    if scale_data:
        # Fit the scaler on the training rows only and reuse it for the test
        # rows, so both sets share one scale and no test statistics leak in.
        scaler = StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        )
        X_test = pd.DataFrame(
            scaler.transform(X_test), columns=X_test.columns, index=X_test.index
        )

    # instantiate and fit the model
    log_reg = LogisticRegression(random_state=42, max_iter=100, solver='liblinear', penalty='l2', C=1.0)
    log_reg.fit(X_train, y_train)

    # make predictions and evaluate the model
    y_pred = log_reg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    if print_report:
        print('Classification Report:')
        print(classification_report(y_test, y_pred))

    return None


# Module-level diagnostics on the loaded data.
# if there's any missing data, remove the rows (dropna drops rows by default)
clean_data.dropna(inplace=True)
target = clean_data[clean_data.columns[0]]
print("Target dtype:", target.dtype)
print("Unique values:", target.unique())

# A non-object target with many distinct values is probably continuous, which
# a classifier cannot use directly; warn so it can be discretized or encoded.
if target.dtype != 'object' and len(target.unique()) > 10:
    print("Target looks continuous; consider converting to categorical labels.")

Accuracy: 0.8
Target dtype: float64
Unique values: [0. 1.]


In [None]:
# Import necessary modules
import data_preprocessor as dp
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 1. Load the dataset
# Read the raw CSV through a path relative to the working directory (the same
# ../Data folder that clean_data.csv is written to) instead of an absolute
# path that only exists on one machine.
messy_data = pd.read_csv("../Data/messy_data.csv")
# Keep the raw frame untouched; later steps operate on this copy.
clean_data = messy_data.copy()


# 2. Preprocess the data
def impute_missing_values(data, strategy='mean'):
    """Fill missing values in the dataset.

    :param data: pandas DataFrame to impute; it is not modified
    :param strategy: 'mean' or 'median' fill numeric columns with that
        column statistic (non-numeric columns are left as they are);
        'mode' fills every column with its most frequent value (the first
        one returned by ``DataFrame.mode`` when several tie)
    :return: new pandas DataFrame with the missing values filled
    :raises ValueError: if ``strategy`` is not 'mean', 'median' or 'mode'
    """
    if strategy == 'mean':
        return data.fillna(data.mean(numeric_only=True))
    if strategy == 'median':
        return data.fillna(data.median(numeric_only=True))
    if strategy == 'mode':
        modes = data.mode()
        # mode() has no rows when the frame is empty or entirely NaN; there
        # is nothing to fill with, and iloc[0] would raise IndexError.
        if modes.empty:
            return data.copy()
        return data.fillna(modes.iloc[0])
    # An unrecognised strategy used to return the data unchanged, which hid
    # typos such as 'avg'; fail loudly instead.
    raise ValueError(
        f"Unknown imputation strategy {strategy!r}; "
        "expected 'mean', 'median' or 'mode'."
    )

def remove_duplicates(data):
    """Return the dataset without repeated rows.

    :param data: pandas DataFrame
    :return: new pandas DataFrame in which each distinct row appears once
        (pandas keeps the first occurrence and its original index label)
    """
    return data.drop_duplicates()

def normalize_data(data, method='minmax'):
    """Apply normalization to numerical features.

    :param data: pandas DataFrame; it is left unmodified
    :param method: 'minmax' (MinMaxScaler) or 'standard' (StandardScaler)
    :return: copy of ``data`` whose int64/float64 columns have been rescaled;
        all other columns are carried over unchanged
    :raises ValueError: if ``method`` is not 'minmax' or 'standard'
    """
    if method not in ('minmax', 'standard'):
        # An unknown method used to leave ``scaler`` unbound and fail further
        # down with an unrelated UnboundLocalError.
        raise ValueError(
            f"Unknown normalization method {method!r}; "
            "expected 'minmax' or 'standard'."
        )

    # Work on a copy so the caller's frame (often a slice of another frame)
    # is not rewritten in place.
    scaled = data.copy()
    numerical_features = scaled.select_dtypes(include=['int64', 'float64']).columns

    # Nothing to scale: the scalers reject input with no rows or no columns.
    if scaled.empty or len(numerical_features) == 0:
        return scaled

    scaler = MinMaxScaler() if method == 'minmax' else StandardScaler()
    scaled[numerical_features] = scaler.fit_transform(scaled[numerical_features])
    return scaled

def remove_redundant_features(data, threshold=0.9):
    """
    Drop numeric features that are strongly correlated with an earlier one.

    For every pair of numeric columns the absolute correlation is computed;
    the later column of a pair is dropped when that value exceeds
    ``threshold``. Non-numeric columns are never dropped.

    :param data: pandas DataFrame (left unmodified)
    :param threshold: float, absolute-correlation cut-off
    :return: pandas DataFrame without the redundant columns
    """
    frame = data.copy()

    # Correlations are only defined for the numeric columns.
    abs_corr = frame.select_dtypes(include=[np.number]).corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and
    # a column is never compared with itself.
    triangle_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_triangle = abs_corr.where(triangle_mask)

    redundant = []
    for name in upper_triangle.columns:
        if any(upper_triangle[name] > threshold):
            redundant.append(name)

    # Drop from the full frame so non-numeric columns are preserved.
    return frame.drop(columns=redundant, errors='ignore')

# 3. Save the cleaned dataset
# NOTE(review): none of the preprocessing functions defined above is called
# before this point, so clean_data is still the unmodified copy of messy_data
# when it is written out. index=False keeps the row index out of the CSV.
clean_data.to_csv('../Data/clean_data.csv', index=False)


# 4. Train and evaluate the model
# This calls simple_model from the imported data_preprocessor module (dp), not
# the simple_model that is defined directly below this line.
dp.simple_model(clean_data)
def simple_model(input_data, split_data=True, scale_data=False, print_report=False):
    """
    A simple logistic regression model for target classification.
    Parameters:
    input_data (pd.DataFrame): The input data containing features and the target variable 'target' (assume 'target' is the first column). The frame is not modified.
    split_data (bool): Whether to split the data into training and testing sets. If False, the model is fitted and evaluated on the full data. Default is True.
    scale_data (bool): Whether to scale the features using StandardScaler. Default is False.
    print_report (bool): Whether to print the classification report. Default is False.
    Returns:
    None
    The function performs the following steps:
    1. Removes rows with missing data.
    2. Splits the input data into features and target.
    3. Encodes categorical features using one-hot encoding.
    4. Splits the data into training and testing sets (if split_data is True).
    5. Scales the features using StandardScaler (if scale_data is True).
    6. Instantiates and fits a logistic regression model.
    7. Makes predictions on the test set.
    8. Evaluates the model using accuracy score and classification report.
    9. Prints the accuracy and classification report (if print_report is True).
    """
    # dropna() without inplace returns a new frame, so the caller's data is
    # left as it was.
    complete = input_data.dropna()

    # split the data into features and target (target is the first column)
    target = complete[complete.columns[0]]
    features = complete[complete.columns[1:]]

    # if a column is not numeric, encode it (one-hot); the encoded columns
    # are appended after the remaining ones, prefixed with the source name
    object_cols = list(features.select_dtypes(include=['object']).columns)
    if object_cols:
        features = pd.get_dummies(features, columns=object_cols)

    if split_data:
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, stratify=target, random_state=42
        )
    else:
        # No hold-out set requested: fit and evaluate on the same rows.
        X_train, X_test, y_train, y_test = features, features, target, target

    if scale_data:
        # Fit the scaler on the training rows only and reuse it for the test
        # rows, so both sets share one scale and no test statistics leak in.
        scaler = StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
        )
        X_test = pd.DataFrame(
            scaler.transform(X_test), columns=X_test.columns, index=X_test.index
        )

    # instantiate and fit the model
    log_reg = LogisticRegression(random_state=42, max_iter=100, solver='liblinear', penalty='l2', C=1.0)
    log_reg.fit(X_train, y_train)

    # make predictions and evaluate the model
    y_pred = log_reg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy}')

    if print_report:
        print('Classification Report:')
        print(classification_report(y_test, y_pred))

    return None


# Module-level diagnostics on the loaded data.
# if there's any missing data, remove the rows (dropna drops rows by default)
clean_data.dropna(inplace=True)
target = clean_data[clean_data.columns[0]]
print("Target dtype:", target.dtype)
print("Unique values:", target.unique())

# A non-object target with many distinct values is probably continuous, which
# a classifier cannot use directly; warn so it can be discretized or encoded.
if target.dtype != 'object' and len(target.unique()) > 10:
    print("Target looks continuous; consider converting to categorical labels.")

Accuracy: 0.8
Target dtype: float64
Unique values: [0. 1.]
