In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
import category_encoders as ce

# Function to load dataset dynamically
def load_dataset(dataset_name):
    dataset_paths = {
        "Bank_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\Bank_churn.csv",
        "BankChurners": r"C:\Users\Administrator\Desktop\charun\churn_data\BankChurners.csv",
        "churn_bigml_80": r"C:\Users\Administrator\Desktop\charun\churn_data\churn-bigml-80.csv",
        "Customertravel": r"C:\Users\Administrator\Desktop\charun\churn_data\Customertravel.csv",
        "E_Commerce": r"C:\Users\Administrator\Desktop\charun\churn_data\E Commerce Dataset.csv",
        "ecom_user_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\ecom-user-churn-data.csv",
        "internet_service_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\internet_service_churn.csv",
        "orange_telecom": r"C:\Users\Administrator\Desktop\charun\churn_data\orange_telecom.csv",
        "subscription_service_train": r"C:\Users\Administrator\Desktop\charun\churn_data\subscription_service_train.csv",
        "Telco_Customer_Churn": r"C:\Users\Administrator\Desktop\charun\churn_data\Telco-Customer-Churn.csv",
        "telecom_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\telecom_churn.csv"
    }
    return pd.read_csv(dataset_paths[dataset_name])

def handle_missing_values(df):
    print("Handling missing values...")
    # Handle numeric missing values
    numeric_cols = df.select_dtypes(include=['number']).columns
    if not numeric_cols.empty:
        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # Handle non-numeric missing values
    non_numeric_cols = df.select_dtypes(exclude=['number']).columns
    if not non_numeric_cols.empty:
        mode_values = df[non_numeric_cols].mode()
        if not mode_values.empty:
            df[non_numeric_cols] = df[non_numeric_cols].fillna(mode_values.iloc[0])
    
    # Drop columns with more than 50% missing values
    missing_percentage = df.isna().mean()
    cols_to_drop = missing_percentage[missing_percentage > 0.5].index
    df = df.drop(columns=cols_to_drop)
    
    return df

def encode_categorical_features(df):
    print("Encoding categorical features...")
    df = pd.get_dummies(df, drop_first=True)
    return df

def scale_numeric_features(df):
    print("Scaling numeric features...")
    numeric_cols = df.select_dtypes(include=['number']).columns
    if not numeric_cols.empty:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    return df

def bin_numeric_features(df):
    print("Binning numeric features...")
    # Define binning function
    def bin_column(column, bins, labels):
        if column is not None and not column.empty:
            return pd.cut(column, bins=bins, labels=labels)
        return column

    # Ensure 'daymins' is numeric and handle NaNs
    if 'daymins' in df.columns:
        df['daymins'] = pd.to_numeric(df['daymins'], errors='coerce')
        df = df.dropna(subset=['daymins'])
        if not df['daymins'].empty:
            df['daymins_group'] = bin_column(df['daymins'], bins=[0, 100, 200, 300, 400], labels=['0-100', '100-200', '200-300', '300+'])

    # Ensure 'monthlycharge' is numeric and handle NaNs
    if 'monthlycharge' in df.columns:
        df['monthlycharge'] = pd.to_numeric(df['monthlycharge'], errors='coerce')
        df = df.dropna(subset=['monthlycharge'])
        if not df['monthlycharge'].empty:
            df['monthlycharge_group'] = bin_column(df['monthlycharge'], bins=[0, 30, 70, 120], labels=['low', 'medium', 'high'])

    return df

def create_feature_interactions(df):
    print("Creating feature interactions...")
    interaction_cols = ['daymins', 'monthlycharge']  # Example columns

    # Print the columns available in the DataFrame for debugging
    print("Columns available for interaction features:")
    print(df.columns.tolist())

    # Ensure all interaction columns are numeric
    for col in interaction_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            print(f"Warning: Column '{col}' not found in DataFrame.")
    
    # Drop rows with NaNs in the interaction columns
    df = df.dropna(subset=interaction_cols)

    # Print the data types and missing values for debugging
    print("Data types before interaction features:")
    print(df.dtypes)
    print("Missing values before interaction features:")
    print(df[interaction_cols].isna().sum())

    # Create interaction features if columns are available
    if all(col in df.columns for col in interaction_cols):
        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        interaction_features = poly.fit_transform(df[interaction_cols])
        interaction_df = pd.DataFrame(interaction_features, columns=poly.get_feature_names_out(interaction_cols))
        df = pd.concat([df, interaction_df], axis=1)
    
    return df

def feature_engineering_pipeline(df, dataset_type):
    df = handle_missing_values(df)
    df = encode_categorical_features(df)
    df = scale_numeric_features(df)
    df = create_feature_interactions(df)
    df = bin_numeric_features(df)
    return df

def main(dataset_name, dataset_type):
    print(f"Loading dataset: {dataset_name}")
    df = load_dataset(dataset_name)

    print(f"Performing EDA on {dataset_name}...")
    # eda(df)  # Uncomment if you have an EDA function

    print("Applying feature engineering...")
    df = feature_engineering_pipeline(df, dataset_type)

    print("Feature engineering complete. Here's a preview of the processed data:")
    print(df.head())

    return df

# List of datasets to process
datasets = [
    "Bank_churn", "BankChurners", "churn_bigml_80", "Customertravel", 
    "E_Commerce", "ecom_user_churn", "internet_service_churn", 
    "orange_telecom", "subscription_service_train", 
    "Telco_Customer_Churn", "telecom_churn"
]

# Process each dataset
for dataset_name in datasets:
    print(f"Processing {dataset_name}...")
    processed_df = main(dataset_name, dataset_type="generic")  # Adjust dataset_type if needed
    print(f"Processing complete for {dataset_name}\n")


Processing Bank_churn...
Loading dataset: Bank_churn
Performing EDA on Bank_churn...
Applying feature engineering...
Handling missing values...
Encoding categorical features...
Scaling numeric features...
Creating feature interactions...
Columns available for interaction features:
['rownumber', 'customerid', 'creditscore', 'age', 'tenure', 'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary', 'churn', 'surname_Abbie', 'surname_Abbott', 'surname_Abdullah', 'surname_Abdulov', 'surname_Abel', 'surname_Abernathy', 'surname_Abramov', 'surname_Abramova', 'surname_Abramovich', 'surname_Abramowitz', 'surname_Abrego', 'surname_Abron', 'surname_Achebe', 'surname_Adams', 'surname_Adamson', 'surname_Afamefula', 'surname_Afamefuna', 'surname_Afanasyev', 'surname_Afanasyeva', 'surname_Agafonova', 'surname_Aguirre', 'surname_Ah Mouy', 'surname_Ahern', 'surname_Ahmed', 'surname_Aiken', 'surname_Aikenhead', 'surname_Ainsworth', 'surname_Aitken', 'surname_Ajuluchukwu', 'surname_A

KeyError: ['daymins', 'monthlycharge']

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

def load_and_preprocess_data(dataset_name, dataset_path):
    # Load the dataset
    data = pd.read_csv(dataset_path)
    
    # Display the first few rows and columns of the dataset
    print(f"Processing dataset: {dataset_name}")
    print(data.head())
    print("Columns in the dataset:")
    print(data.columns.tolist())

    # Define target and feature columns based on dataset
    if dataset_name == "Bank_churn":
        target_col = 'churn'
        feature_cols = data.columns.difference([target_col, 'rownumber', 'customerid', 'surname'])
        numeric_features = ['creditscore', 'age', 'tenure', 'balance', 'numofproducts', 'estimatedsalary']
        categorical_features = ['geography', 'gender']
        
    elif dataset_name == "BankChurners":
        target_col = 'churn'
        feature_cols = data.columns.difference([target_col, 'clientnum'])
        numeric_features = [
            'customer_age', 'dependent_count', 'months_on_book', 'total_relationship_count',
            'months_inactive_12_mon', 'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
            'avg_open_to_buy', 'total_amt_chng_q4_q1', 'total_trans_amt', 'total_trans_ct',
            'total_ct_chng_q4_q1', 'avg_utilization_ratio'
        ]
        categorical_features = [
            'gender', 'education_level', 'marital_status', 'income_category', 'card_category'
        ]

    elif dataset_name == "churn_bigml_80":
        target_col = 'churn'
        feature_cols = data.columns.difference([target_col])
        numeric_features = [
            'account length', 'total day minutes', 'total day calls', 'total day charge',
            'total eve minutes', 'total eve calls', 'total eve charge', 'total night minutes',
            'total night calls', 'total night charge', 'total intl minutes', 'total intl calls',
            'total intl charge', 'customer service calls'
        ]
        categorical_features = [
            'state', 'international plan', 'voice mail plan'
        ]

    elif dataset_name == "Customertravel":
        target_col = 'churn'
        feature_cols = data.columns.difference([target_col])
        numeric_features = []  # No numeric features in this dataset
        categorical_features = [
            'age', 'frequentflyer', 'annualincomeclass', 'servicesopted',
            'accountsyncedtosocialmedia', 'bookedhotelornot'
        ]

    elif dataset_name == "E_Commerce":
        target_col = 'churn'
        feature_cols = data.columns.difference([target_col, 'customerid'])
        numeric_features = [
            'tenure', 'hourspendonapp', 'numberofdeviceregistered', 'satisfactionscore',
            'numberofaddress', 'orderamounthikefromlastyear', 'ordercount', 'daysincelastorder',
            'cashbackamount'
        ]
        categorical_features = [
            'preferredlogindevice', 'citytier', 'warehousetohome', 'preferredpaymentmode',
            'gender', 'preferedordercat', 'maritalstatus', 'complain', 'couponused'
        ]

    else:
        raise ValueError("Unknown dataset name")

    # Check for missing columns in the dataset
    missing_numeric_features = [col for col in numeric_features if col not in data.columns]
    missing_categorical_features = [col for col in categorical_features if col not in data.columns]
    
    if missing_numeric_features:
        print(f"Warning: Missing numeric features: {missing_numeric_features}")
    if missing_categorical_features:
        print(f"Warning: Missing categorical features: {missing_categorical_features}")

    # Adjust feature columns based on missing columns
    numeric_features = [col for col in numeric_features if col in data.columns]
    categorical_features = [col for col in categorical_features if col in data.columns]

    # Split features and target
    X = data[feature_cols]
    y = data[target_col]

    # Define preprocessing pipelines
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
        ('scaler', StandardScaler())  # Standardize features
    ]) if numeric_features else None

    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with most frequent value
        ('onehot', OneHotEncoder(drop='first'))  # One hot encode categorical features
    ])

    # Combine preprocessing steps
    transformers = []
    if numeric_pipeline:
        transformers.append(('num', numeric_pipeline, numeric_features))
    if categorical_pipeline:
        transformers.append(('cat', categorical_pipeline, categorical_features))

    preprocessor = ColumnTransformer(
        transformers=transformers
    )

    # Preprocessing and feature engineering
    X_processed = preprocessor.fit_transform(X)

    # Feature interactions
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_interactions = poly.fit_transform(X_processed)

    # Split the dataset into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_interactions, y, test_size=0.2, random_state=42)

    # Display the shapes of the resulting datasets
    print(f"Dataset: {dataset_name}")
    print("Training data shape:", X_train.shape)
    print("Testing data shape:", X_test.shape)
    print("Training target shape:", y_train.shape)
    print("Testing target shape:", y_test.shape)
    
    return X_train, X_test, y_train, y_test

# Paths to the datasets
dataset_paths = {
    "Bank_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\Bank_churn.csv",
    "BankChurners": r"C:\Users\Administrator\Desktop\charun\churn_data\BankChurners.csv",
    "churn_bigml_80": r"C:\Users\Administrator\Desktop\charun\churn_data\churn-bigml-80.csv",
    "Customertravel": r"C:\Users\Administrator\Desktop\charun\churn_data\Customertravel.csv",
    "E_Commerce": r"C:\Users\Administrator\Desktop\charun\churn_data\E Commerce Dataset.csv"
}

# Process each dataset
for dataset_name, dataset_path in dataset_paths.items():
    X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset_name, dataset_path)
    print(f"Processing complete for {dataset_name}\n")


Processing dataset: Bank_churn
   rownumber  customerid   surname  creditscore geography  gender  age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   tenure    balance  numofproducts  hascrcard  isactivemember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   estimatedsalary  churn  
0        101348.88      1  
1        112542.58      0  
2        113931.57      1  
3         93826

In [38]:
import pandas as pd
from sklearn.impute import KNNImputer

# Load the dataset
df_telco = pd.read_csv('churn_data/Telco-Customer-Churn.csv')

# Separate numeric and categorical columns
numeric_cols = df_telco.select_dtypes(include=['number']).columns
categorical_cols = df_telco.select_dtypes(include=['object']).columns

# Extract numeric and categorical data
df_numeric = df_telco[numeric_cols]
df_categorical = df_telco[categorical_cols]

# Impute missing values in categorical data with mode
df_categorical.fillna(df_categorical.mode().iloc[0], inplace=True)

# Initialize and apply KNNImputer to numeric data
imputer = KNNImputer(n_neighbors=5)
df_numeric_imputed = imputer.fit_transform(df_numeric)

# Convert the imputed numeric data back to a DataFrame
df_numeric_imputed = pd.DataFrame(df_numeric_imputed, columns=numeric_cols)

# Combine imputed numeric data with imputed categorical data
df_telco_imputed = pd.concat([df_numeric_imputed, df_categorical.reset_index(drop=True)], axis=1)

# Convert object columns to appropriate data types
df_telco_imputed['totalcharges'] = pd.to_numeric(df_telco_imputed['totalcharges'], errors='coerce')
df_telco_imputed['totalcharges'].fillna(df_telco_imputed['totalcharges'].median(), inplace=True)

# Encode categorical columns
for col in df_telco_imputed.select_dtypes(include=['object']).columns:
    df_telco_imputed[col] = df_telco_imputed[col].astype('category').cat.codes

# Check the result
print(df_telco_imputed.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   seniorcitizen     7043 non-null   float64
 1   tenure            7043 non-null   float64
 2   monthlycharges    7043 non-null   float64
 3   churn             7043 non-null   float64
 4   customerid        7043 non-null   int16  
 5   gender            7043 non-null   int8   
 6   partner           7043 non-null   int8   
 7   dependents        7043 non-null   int8   
 8   phoneservice      7043 non-null   int8   
 9   multiplelines     7043 non-null   int8   
 10  internetservice   7043 non-null   int8   
 11  onlinesecurity    7043 non-null   int8   
 12  onlinebackup      7043 non-null   int8   
 13  deviceprotection  7043 non-null   int8   
 14  techsupport       7043 non-null   int8   
 15  streamingtv       7043 non-null   int8   
 16  streamingmovies   7043 non-null   int8   


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical.fillna(df_categorical.mode().iloc[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_telco_imputed['totalcharges'].fillna(df_telco_imputed['totalcharges'].median(), inplace=True)


In [35]:
# Show the first few rows
print(df_telco.head())

# Get information about the data
print(df_telco.info())

# Summary statistics
print(df_telco.describe(include='all'))


   customerid  gender  seniorcitizen partner dependents  tenure phoneservice  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      multiplelines internetservice onlinesecurity  ... deviceprotection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  techsupport streamingtv streamingmovies        contract pape

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

def train_random_forest(X_train, y_train):
    # Initialize the RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    return model

def evaluate_model(model, X_test, y_test):
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print("Model Evaluation:")
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(report)
    
    return accuracy, report

if __name__ == "__main__":
    from data_utils import load_and_preprocess_data
    
    # Define dataset paths
    dataset_paths = {
        "Bank_churn": r"C:\Users\Administrator\Desktop\charun\churn_data\Bank_churn.csv",
        "BankChurners": r"C:\Users\Administrator\Desktop\charun\churn_data\BankChurners.csv",
        "churn_bigml_80": r"C:\Users\Administrator\Desktop\charun\churn_data\churn-bigml-80.csv",
        "Customertravel": r"C:\Users\Administrator\Desktop\charun\churn_data\Customertravel.csv",
        "E_Commerce": r"C:\Users\Administrator\Desktop\charun\churn_data\E Commerce Dataset.csv"
    }
    
    # Process and train models for each dataset
    for dataset_name, dataset_path in dataset_paths.items():
        X_train, X_test, y_train, y_test = load_and_preprocess_data(dataset_name, dataset_path)
        
        # Train the model
        model = train_random_forest(X_train, y_train)
        
        # Evaluate the model
        evaluate_model(model, X_test, y_test)


ModuleNotFoundError: No module named 'data_utils'