In [40]:
import pandas as pd
import os
import chardet

def detect_encoding(file_path):
    """Guess the character encoding of a file.

    Only the first 100000 bytes of the file are sampled and handed to
    ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.

    Returns:
        Whatever chardet stored under the ``'encoding'`` key of its result.
    """
    with open(file_path, 'rb') as raw_file:
        sample = raw_file.read(100000)
    detection = chardet.detect(sample)
    return detection['encoding']

def _read_csv_with_fallback(input_file):
    """Load ``input_file`` into a DataFrame, retrying with fixed fallback encodings.

    The encoding reported by ``detect_encoding`` is tried first. If that read
    fails for any reason, 'ISO-8859-1', 'cp1252' and 'latin1' are tried in turn.

    Raises:
        ValueError: If every attempt fails; the message carries the first error,
            which is also attached as ``__cause__``.
    """
    try:
        encoding = detect_encoding(input_file)
        df = pd.read_csv(input_file, encoding=encoding, low_memory=False)
    except Exception as first_error:
        for encoding in ('ISO-8859-1', 'cp1252', 'latin1'):
            try:
                df = pd.read_csv(input_file, encoding=encoding, low_memory=False)
            except Exception:
                # Only ordinary errors are skipped; KeyboardInterrupt / SystemExit
                # must still propagate so the user can abort a long read.
                continue
            print(f"Successfully read with {encoding} encoding")
            print("Available columns:")
            print(df.columns.tolist())
            return df
        raise ValueError(f"Could not read file. Error: {str(first_error)}") from first_error

    print("Available columns in input file:")
    print(df.columns.tolist())
    return df


def _add_derived_columns(df):
    """Add Area, Zinc, thickness, speed and productivity columns to ``df`` in place."""
    footprint = df['Op Width'] * df['Total Length']

    # 1. Area and Zinc
    df['Area'] = footprint / 1000
    df['Zinc'] = df['Total Zn/AlZn Coating'] * df['Area']

    # 2. Process duration rounded to whole minutes
    df['Process Duration(in min)'] = df['Process Duration(in min)'].round(0)

    # 3. CRFH thickness
    # NOTE(review): 7.850 is presumably the steel density used by the plant - confirm.
    df['CRFH thickness'] = (
        (df['Prop Ip Wt'] * 1000) / (7.850 * footprint / 1000)
    ).round(3)

    # 4. GP thickness; products missing from the table map to NaN.
    # NOTE(review): values look like per-product coating densities - confirm units.
    coating_density = df['Actual Product'].map({
        'GI': 7140,
        'GL': 3750,
        'PPGL': 3750,
        'ZM': 6850
    })
    df['GP thickness'] = (
        (df['Total Zn/AlZn Coating'] / coating_density) + df['CRFH thickness']
    ).round(4)

    # Rounding above can turn a short duration into 0. Dividing by it would give
    # +/-inf, so zero durations are masked to NaN for the two rate columns only;
    # the stored duration column keeps its rounded value.
    duration = df['Process Duration(in min)']
    nonzero_duration = duration.where(duration != 0)

    # 5. Speed
    df['speed'] = (df['Total Length'] / nonzero_duration).round(3)

    # 6. Productivity
    df['productivity'] = ((df['O/P Wt'] / nonzero_duration) * 60).round(3)


def process_csv_data(input_file, output_directory):
    """Read a production CSV, derive extra columns and write ``output.csv``.

    Steps: read the file (with encoding fallback), strip whitespace from the
    column names, verify the required columns exist, add the derived columns,
    keep the known output columns that are present, and save the result as
    ``output.csv`` inside ``output_directory`` (created if needed).

    Args:
        input_file: Path of the CSV file to read.
        output_directory: Directory that receives ``output.csv``.

    Returns:
        The DataFrame that was written to disk.

    Raises:
        ValueError: If the file cannot be read with any encoding, or if a
            required column is missing.
    """
    df = _read_csv_with_fallback(input_file)

    # Standardize column names (remove leading/trailing spaces)
    df.columns = df.columns.str.strip()

    required_columns = [
        'Process Duration(in min)',
        'Prop Ip Wt',
        'Ip Width',
        'Total Length',
        'Total Zn/AlZn Coating',
        'Actual Product',
        'Op Width',
        'Op Thk',
        'O/P Wt',
    ]
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        available_cols = [col for col in df.columns if not col.startswith('Unnamed')]
        raise ValueError(f"Missing required columns: {missing_columns}\n"
                       f"Available columns: {available_cols}")

    _add_derived_columns(df)

    # Select output columns; optional ones (e.g. 'Actual Tdc', 'Segment') are
    # kept only when the input actually has them.
    output_columns = [
        'Actual Product', 'Actual Tdc', 'Segment', 'Prop Ip Wt',
        'O/P Wt', 'Total Length', 'Area', 'Zinc',
        'Process Duration(in min)', 'CRFH thickness', 'GP thickness',
        'Total Zn/AlZn Coating', 'Op Width','Op Thk', 'speed', 'productivity'
    ]
    final_columns = [col for col in output_columns if col in df.columns]
    final_df = df[final_columns]

    # Create output directory if it doesn't exist, then save
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, 'output.csv')
    final_df.to_csv(output_file, index=False, encoding='utf-8')

    print(f"\nProcessing complete. Output saved to: {output_file}")
    print("\nFirst 5 rows of processed data:")
    print(final_df.head())

    return final_df

# Usage example: run the preprocessing on one local CSV and report any failure
# as a printed message instead of a traceback.
input_path = 'C:/Users/Dell/Desktop/All_proj/Dj/jsw project1/25datacsv.csv'
output_dir = r'C:\Users\Dell\Desktop\new dp\DP\front'
try:
    processed_data = process_csv_data(input_path, output_dir)
except Exception as err:
    print(f"Error processing file: {str(err)}")

Successfully read with ISO-8859-1 encoding
Available columns:
['Row', 'Actual Product', 'Segment', 'DP FLAG', 'MotherBatchNo', 'Ip Width', 'Ip Thick', 'Mother Ip Wt', 'Order Tdc', 'Op Batch No', 'Actual Tdc', 'Op Thk', 'Op Width', 'Prop Ip Wt', 'O/P Wt', 'Total Length', 'Target coating weight', 'ZN/AlZn Coating Top', 'ZN/AlZn Coating Bot', 'Total Zn/AlZn Coating', 'Spangle Type', 'Tlv Usage', 'Tlv Elongation', 'SPM Usage', 'SPM Elongation', 'Entry Baby Wt', 'Entry End Cut', 'Exit Baby Wt', 'Exit End Cut', 'Trim Loss', 'Total Scrap', 'Surface Finish', 'Passivation_Type', 'Passivation Flag', 'Logo', 'Liner Marking', 'Ip Idm', 'Ip Odm', 'Cr grade', 'Zn theo weight', 'Sleeve', 'L2 Remarks', 'Next Unit', 'Status', 'Material Yield(%) with Zinc', 'Material Yield(%) without Zinc', 'Start Date', 'Start Time', 'End Date', 'End Time', 'Shift', 'Process Duration(in min)', 'Pdo Time', 'Age(Days)', 'PlanThickness', 'PlanWidth', 'Target Thick', 'Target Width', 'Anneal Code', 'No Of Samples', 'Oil Usa

In [62]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib
import os

# Read back the CSV written by the preprocessing cell (output.csv)
output_path = r'C:\Users\Dell\Desktop\new dp\DP\front\output.csv'
df = pd.read_csv(filepath_or_buffer=output_path)

# Additional preprocessing
def preprocess_data(df):
    """Encode, filter and scale the training data.

    The caller's DataFrame is left untouched: all work happens on a copy, so
    the function can be called repeatedly on the same frame (e.g. when the
    training cell is re-run) and always sees the original string categories.

    Args:
        df: DataFrame holding the feature columns 'Actual Product',
            'Actual Tdc', 'Op Width', 'Zinc', 'Op Thk' and the target columns
            'Process Duration(in min)' and 'productivity'.

    Returns:
        Tuple ``(X_scaled, y_time, y_prod, scaler, le_product, le_tdc)``:
        the output of ``StandardScaler.fit_transform`` on the feature columns
        (the label-encoded categoricals are scaled too), the two target
        Series, and the fitted scaler and label encoders.

    Raises:
        ValueError: If any feature or target column is missing.
    """
    features = ['Actual Product', 'Actual Tdc', 'Op Width', 'Zinc', 'Op Thk']
    targets = ['Process Duration(in min)', 'productivity']

    # Validate before touching any column so a missing one is reported with
    # this message rather than as a KeyError from the encoding step.
    missing_cols = [col for col in features + targets if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Work on a copy: the encoded integer columns must not leak back into the
    # caller's frame.
    df = df.copy()

    # Encode categorical variables with separate encoders
    le_product = LabelEncoder()
    le_tdc = LabelEncoder()
    df['Actual Product'] = le_product.fit_transform(df['Actual Product'])
    df['Actual Tdc'] = le_tdc.fit_transform(df['Actual Tdc'])

    # Drop rows whose targets are NaN or +/-inf
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.dropna(subset=targets)

    X = df[features]
    y_time = df['Process Duration(in min)']
    y_prod = df['productivity']

    # Scale the feature matrix
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    return X_scaled, y_time, y_prod, scaler, le_product, le_tdc


# Train the models
def train_models(X, y_time, y_prod):
    """Fit one random-forest regressor per target and print each test MAE.

    Args:
        X: Feature matrix.
        y_time: Target values for the time model (reported in minutes).
        y_prod: Target values for the productivity model (reported as TPH).

    Returns:
        Tuple ``(time_model, prod_model)`` of fitted RandomForestRegressor
        instances.
    """
    # Split everything in ONE call so features and both targets are guaranteed
    # to share the same rows, instead of relying on two separate calls with an
    # identical seed staying in sync.
    (X_train, X_test,
     y_time_train, y_time_test,
     y_prod_train, y_prod_test) = train_test_split(
        X, y_time, y_prod, test_size=0.2, random_state=42)

    # Time prediction model
    time_model = RandomForestRegressor(n_estimators=100, random_state=42)
    time_model.fit(X_train, y_time_train)
    time_pred = time_model.predict(X_test)
    print(f"Time MAE: {mean_absolute_error(y_time_test, time_pred):.2f} minutes")

    # Productivity prediction model
    prod_model = RandomForestRegressor(n_estimators=100, random_state=42)
    prod_model.fit(X_train, y_prod_train)
    prod_pred = prod_model.predict(X_test)
    print(f"Productivity MAE: {mean_absolute_error(y_prod_test, prod_pred):.2f} TPH")

    return time_model, prod_model

# Save models and preprocessing objects
def save_artifacts(time_model, prod_model, scaler, le_product, le_tdc, artifacts_dir=None):
    """Persist the trained models and preprocessing objects with joblib.

    Args:
        time_model: Object stored as ``time_model.joblib``.
        prod_model: Object stored as ``prod_model.joblib``.
        scaler: Object stored as ``scaler.joblib``.
        le_product: Object stored as ``le_product.joblib``.
        le_tdc: Object stored as ``le_tdc.joblib``.
        artifacts_dir: Target directory (created if missing). When omitted,
            the notebook's original hard-coded models directory is used.
    """
    if artifacts_dir is None:
        artifacts_dir = r'C:\Users\Dell\Desktop\new dp\DP\front\models'
    os.makedirs(artifacts_dir, exist_ok=True)

    artifacts = {
        'time_model.joblib': time_model,
        'prod_model.joblib': prod_model,
        'scaler.joblib': scaler,
        'le_product.joblib': le_product,
        'le_tdc.joblib': le_tdc,
    }
    for file_name, artifact in artifacts.items():
        joblib.dump(artifact, os.path.join(artifacts_dir, file_name))


# Main training function
def train_pipeline(data=None):
    """Run preprocessing, training and artifact saving end to end.

    Args:
        data: DataFrame to train on. When omitted, the module-level ``df``
            loaded earlier in the notebook is used, which matches the
            original zero-argument behaviour.
    """
    if data is None:
        # Fall back to the notebook-level frame; looked up at call time.
        data = df

    print("Loading and preprocessing data...")
    X, y_time, y_prod, scaler, le_product, le_tdc = preprocess_data(data)

    print("\nTraining models...")
    time_model, prod_model = train_models(X, y_time, y_prod)

    print("\nSaving artifacts...")
    save_artifacts(time_model, prod_model, scaler, le_product, le_tdc)
    print("Training complete! Models saved to models directory")

# Prediction function
def predict(Actual_Product, Order_Tdc, Op_Thk, Op_Width, Zinc, artifacts_dir=None):
    """Predict process time and productivity for a single input row.

    Args:
        Actual_Product: Product label, encoded with the saved ``le_product``.
        Order_Tdc: TDC label. It is placed in the 'Actual Tdc' column and
            encoded with the saved ``le_tdc`` encoder.
        Op_Thk: Value for the 'Op Thk' feature.
        Op_Width: Value for the 'Op Width' feature.
        Zinc: Value for the 'Zinc' feature.
        artifacts_dir: Directory holding the ``*.joblib`` artifacts. When
            omitted, the notebook's original hard-coded models directory is
            used.

    Returns:
        Dict with keys 'Time Required (minutes)' and 'Productivity (TPH)',
        each rounded to 2 decimals.

    Raises:
        FileNotFoundError: If any artifact file is missing.
    """
    if artifacts_dir is None:
        artifacts_dir = r'C:\Users\Dell\Desktop\new dp\DP\front\models'

    # NOTE: joblib.load unpickles arbitrary objects - only point this at
    # artifact files you produced yourself.
    try:
        time_model = joblib.load(os.path.join(artifacts_dir, 'time_model.joblib'))
        prod_model = joblib.load(os.path.join(artifacts_dir, 'prod_model.joblib'))
        scaler = joblib.load(os.path.join(artifacts_dir, 'scaler.joblib'))
        le_product = joblib.load(os.path.join(artifacts_dir, 'le_product.joblib'))
        le_tdc = joblib.load(os.path.join(artifacts_dir, 'le_tdc.joblib'))
    except FileNotFoundError as exc:
        # Keep the original error (with the missing path) as the cause.
        raise FileNotFoundError("Models not found. Please train the models first.") from exc

    # Column order here matches the feature order used when preprocessing
    # the training data.
    input_data = pd.DataFrame({
        'Actual Product': [Actual_Product],
        'Actual Tdc': [Order_Tdc],
        'Op Width': [Op_Width],
        'Zinc': [Zinc],
        'Op Thk': [Op_Thk]
    })

    # Encode both categorical fields
    input_data['Actual Product'] = le_product.transform(input_data['Actual Product'])
    input_data['Actual Tdc'] = le_tdc.transform(input_data['Actual Tdc'])

    input_scaled = scaler.transform(input_data)

    time_required = time_model.predict(input_scaled)[0]
    productivity = prod_model.predict(input_scaled)[0]

    return {
        'Time Required (minutes)': round(time_required, 2),
        'Productivity (TPH)': round(productivity, 2)
    }

# Train and persist the models, then demonstrate one prediction
train_pipeline()

print("\nExample Prediction:")
prediction = predict(
    Actual_Product='GI',
    Order_Tdc='ZAP130',
    Op_Thk=2.0,
    Op_Width=1250,
    Zinc=120,
)
print(prediction)

Loading and preprocessing data...

Training models...
Time MAE: 7.78 minutes
Productivity MAE: 10.88 TPH

Saving artifacts...
Training complete! Models saved to models directory

Example Prediction:
{'Time Required (minutes)': 10.02, 'Productivity (TPH)': 43.67}
