In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# === Load dataset ===
df = pd.read_csv('final_complaints.csv')
df = df[['complaint_text', 'type', 'area', 'department']].dropna()

# === Clean text to improve model's understanding ===
def clean_text(text):
    import re
    text = re.sub(r"[^a-zA-Z\s]", "", str(text))  # Remove special chars
    text = text.lower()
    return text.strip()

df['complaint_text'] = df['complaint_text'].apply(clean_text)

# === Encode target ===
label_encoder = LabelEncoder()
df['department_encoded'] = label_encoder.fit_transform(df['department'])

# === Define features and target ===
X = df[['complaint_text', 'type', 'area']]
y = df['department_encoded']

# === Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# === Strongly emphasize 'type' by replicating feature ===
X_train['type_emphasis'] = X_train['type']
X_test['type_emphasis'] = X_test['type']

# === Preprocessing ===
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=7000, ngram_range=(1, 3), stop_words='english'), 'complaint_text'),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), ['type', 'type_emphasis', 'area'])
    ]
)

# === XGBoost Model ===
xgb_model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='hist'
)

# === Full Pipeline ===
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

# === Train the pipeline ===
pipeline.fit(X_train, y_train)

# === Evaluation ===
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# === Save model and encoder ===
joblib.dump(pipeline, 'department_classifier.pkl')
joblib.dump(label_encoder, 'department_label_encoder.pkl')

print("✔️ Department classifier trained, emphasized on 'type', and saved as .pkl")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


                                              precision    recall  f1-score   support

       Animal Control and Welfare Department       1.00      1.00      1.00       106
         Electricity Distribution Department       1.00      1.00      1.00        99
         Environmental Protection Department       1.00      1.00      1.00       101
       Municipal Waste Management Department       1.00      1.00      1.00       105
                     Public Works Department       1.00      1.00      1.00       107
            Sewerage and Drainage Department       1.00      1.00      1.00        91
       Traffic and Transportation Department       1.00      1.00      1.00        96
   Urban Forestry and Landscaping Department       1.00      1.00      1.00        97
Urban Lighting and Infrastructure Department       1.00      1.00      1.00        97
      Water Supply and Sanitation Department       1.00      1.00      1.00       101

                                    accuracy        

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# === Load Data ===
df = pd.read_csv('final_complaints.csv')

# === Filter & Clean ===
df = df[['complaint_text', 'type', 'area', 'predicted_priority']].dropna()

# Encode target
label_encoder = LabelEncoder()
df['priority_encoded'] = label_encoder.fit_transform(df['predicted_priority'])

# Features & Target
X = df[['complaint_text', 'type', 'area']]
y = df['priority_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# === Preprocessing ===
text_feature = 'complaint_text'
cat_features = ['type', 'area']

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), text_feature),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

# === Model ===
model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    n_estimators=250,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.9,
    random_state=42,
    tree_method='hist'
)

# === Full Pipeline ===
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# === Training ===
pipeline.fit(X_train, y_train)

# === Evaluation ===
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"🔍 Accuracy: {acc*100:.2f}%\n")
print("🔍 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# === Save Model ===
joblib.dump(pipeline, 'newpriority_predictor.pkl')
joblib.dump(label_encoder, 'newpriority_label_encoder.pkl')


🔍 Accuracy: 100.00%

🔍 Classification Report:

              precision    recall  f1-score   support

        High       1.00      1.00      1.00       495
         Low       1.00      1.00      1.00       207
      Medium       1.00      1.00      1.00       298

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



['newpriority_label_encoder.pkl']

In [3]:
import pandas as pd

# Load your CSV file
df = pd.read_csv('final_complaints.csv')

# Mapping for department
department_mapping = {
    "Garbage Issue": "Municipal Waste Management Department",
    "Electricity Issue": "Electricity Distribution Department",
    "Illegal Parking": "Traffic and Transportation Department",
    "Tree Falling": "Urban Forestry and Landscaping Department",
    "Road Damage": "Public Works Department",
    "Animal Nuisance": "Animal Control and Welfare Department",
    "Streetlight Issue": "Urban Lighting and Infrastructure Department",
    "Noise Pollution": "Environmental Protection Department",
    "Water Issue": "Water Supply and Sanitation Department",
    "Sewage Problem": "Sewerage and Drainage Department"
}

# Mapping for predicted priority
priority_mapping = {
    "Garbage Issue": "Medium",
    "Electricity Issue": "High",
    "Illegal Parking": "Medium",
    "Tree Falling": "High",
    "Road Damage": "High",
    "Animal Nuisance": "Low",
    "Streetlight Issue": "Medium",
    "Noise Pollution": "Low",
    "Water Issue": "High",
    "Sewage Problem": "High"
}

# Apply mappings to update the columns
df['department'] = df['type'].map(department_mapping)
df['predicted_priority'] = df['type'].map(priority_mapping)

# Save the updated CSV
df.to_csv('final_complaints.csv', index=False)


PermissionError: [Errno 13] Permission denied: 'final_complaints.csv'

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# Load the dataset
df = pd.read_csv('final_complaints.csv')

# Ensure the date columns are in datetime format
df['filing_date'] = pd.to_datetime(df['filing_date'], errors='coerce')
df['resolved_date'] = pd.to_datetime(df['resolved_date'], errors='coerce')

# Handle missing values (if any)
df.dropna(subset=['filing_date', 'resolved_date'], inplace=True)

# Create the target variable: Number of days to resolve the complaint
df['resolved_in_days'] = (df['resolved_date'] - df['filing_date']).dt.days

# === Feature Engineering ===
# Extract features from the filing date (optional)
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_weekday'] = df['filing_date'].dt.weekday

# Encode categorical features
label_encoder = LabelEncoder()
df['type_encoded'] = label_encoder.fit_transform(df['type'])
df['department_encoded'] = label_encoder.fit_transform(df['department'])
df['predicted_priority_encoded'] = label_encoder.fit_transform(df['predicted_priority'])
df['area_encoded'] = label_encoder.fit_transform(df['area'])

# === Features & Target ===
X = df[['area_encoded', 'filing_day', 'filing_month', 'filing_weekday', 'type_encoded', 'department_encoded', 'predicted_priority_encoded']]
y = df['resolved_in_days']  # Target: Number of days to resolve the complaint

# === Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Model: Random Forest Regressor ===
model = Pipeline([
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# === Evaluation ===
y_pred = model.predict(X_test)

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)

# Calculate R-squared
r2 = r2_score(y_test, y_pred)

# Displaying the results
print(f"🔍 Mean Absolute Error (MAE): {mae:.2f}")
print(f"🔍 R² Score: {r2:.4f}")

# === Save the Model ===
joblib.dump(model, 'eta_predictor.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


KeyError: 'resolved_date'

In [12]:
print(df.columns)



Index(['complaint_id', 'complaint_text', 'area', 'type', 'filing_date',
       'department', 'predicted_priority', 'status', 'area_eta', 'type_eta',
       'resolved_days', 'resolution_date', 'predicted_priority_eta', 'rating',
       'feedback_text', 'resolution_status', 'actual_cost', 'resolved_on',
       'predicted_cost', 'actual_cost_anomaly', 'variance_flag',
       'type_anomaly', 'area_anomaly', 'resolved_days_new',
       'resolution_date_new', 'cost_difference', 'cost_diff_percent'],
      dtype='object')


In [20]:
import pandas as pd

# Load the dataset
complaints_df = pd.read_csv('final_complaints.csv')

# Convert 'filing_date' and 'resolution_date_new' to datetime if not already in datetime format
complaints_df['filing_date'] = pd.to_datetime(complaints_df['filing_date'], errors='coerce')
complaints_df['resolution_date_new'] = pd.to_datetime(complaints_df['resolution_date_new'], errors='coerce')

# Calculate the number of days between 'filing_date' and 'resolution_date_new'
complaints_df['days_to_resolution'] = (complaints_df['resolution_date_new'] - complaints_df['filing_date']).dt.days

# Now calculate the correlation between 'resolved_days_new' and 'days_to_resolution'
correlation = complaints_df['resolved_days_new'].corr(complaints_df['days_to_resolution'])

print(f"The correlation between 'resolved_days_new' and 'days_to_resolution' is: {correlation:.2f}")


The correlation between 'resolved_days_new' and 'days_to_resolution' is: 1.00


In [16]:
import pandas as pd

# Assuming your DataFrame is loaded as complaints_df

# Convert 'filing_date' to datetime if it's not already in datetime format
complaints_df['filing_date'] = pd.to_datetime(complaints_df['filing_date'], errors='coerce')

# Calculate the 'resolution_date_new' by adding 'resolved_days_new' to 'filing_date'
complaints_df['resolution_date_new'] = complaints_df['filing_date'] + pd.to_timedelta(complaints_df['resolved_days_new'], unit='D')

# Save the updated dataframe to a new CSV file (or overwrite the existing one)
complaints_df.to_csv('final_complaints.csv', index=False)

# Print the first few rows to check the result
print(complaints_df[['filing_date', 'resolved_days_new', 'resolution_date_new']].head())


  filing_date  resolved_days_new resolution_date_new
0  2024-01-13                 16          2024-01-29
1  2024-02-28                  0          2024-02-28
2  2024-02-03                 14          2024-02-17
3  2024-04-09                  0          2024-04-09
4  2024-01-20                  0          2024-01-20


In [17]:
import pandas as pd

# Assuming your DataFrame is loaded as complaints_df

# Convert 'filing_date' to datetime if it's not already in datetime format
complaints_df['filing_date'] = pd.to_datetime(complaints_df['filing_date'], errors='coerce')

# Apply the condition: if 'resolved_days_new' is 0, set 'resolution_date_new' as NaT (empty)
complaints_df['resolution_date_new'] = complaints_df.apply(
    lambda row: row['filing_date'] + pd.to_timedelta(row['resolved_days_new'], unit='D') 
    if row['resolved_days_new'] > 0 else pd.NaT, axis=1
)

# Save the updated dataframe to a new CSV file (or overwrite the existing one)
complaints_df.to_csv('final_complaints.csv', index=False)

# Print the first few rows to check the result
print(complaints_df[['filing_date', 'resolved_days_new', 'resolution_date_new']].head())


  filing_date  resolved_days_new resolution_date_new
0  2024-01-13                 16          2024-01-29
1  2024-02-28                  0                 NaT
2  2024-02-03                 14          2024-02-17
3  2024-04-09                  0                 NaT
4  2024-01-20                  0                 NaT


In [33]:
import pandas as pd
import numpy as np  # For RMSE calculation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# Step 1: Load Data
df = pd.read_csv('final_complaints.csv')

# Step 2: Feature Engineering
# Ensure the date columns are in datetime format
df['filing_date'] = pd.to_datetime(df['filing_date'], errors='coerce')

# If 'resolved_date' column is not available, create it based on your business logic
default_resolution_days = 7
df['resolved_date'] = df['filing_date'] + pd.to_timedelta(default_resolution_days, unit='D')

# Create 'resolved_days' as the number of days between 'filing_date' and 'resolved_date'
df['resolved_days'] = (df['resolved_date'] - df['filing_date']).dt.days

# Extract date features
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_weekday'] = df['filing_date'].dt.weekday

# Encode categorical features using pre-trained LabelEncoders
le_area = joblib.load('labelencoder_area.pkl')
df['area_encoded'] = le_area.transform(df['area'])

le_type = joblib.load('labelencoder_type.pkl')
df['type_encoded'] = le_type.transform(df['type'])

le_department = joblib.load('labelencoder_department.pkl')
df['department_encoded'] = le_department.transform(df['department'])

le_priority = joblib.load('labelencoder_priority.pkl')
df['predicted_priority_encoded'] = le_priority.transform(df['predicted_priority'])

# Step 3: Define Features and Targets
X = df[['area_encoded', 'filing_day', 'filing_month', 'filing_weekday', 
        'type_encoded', 'department_encoded', 'predicted_priority_encoded']]

y_resolved_date = df['resolved_date']
y_resolved_days = df['resolved_days']

# Step 4: Train-Test Split
X_train, X_test, y_resolved_date_train, y_resolved_date_test = train_test_split(X, y_resolved_date, test_size=0.2, random_state=42)
X_train2, X_test2, y_resolved_days_train, y_resolved_days_test = train_test_split(X, y_resolved_days, test_size=0.2, random_state=42)

# Step 5: Train Models
model_resolved_date = LinearRegression()
model_resolved_date.fit(X_train, y_resolved_date_train)

model_resolved_days = LinearRegression()
model_resolved_days.fit(X_train2, y_resolved_days_train)

# Step 6: Model Evaluation
# Evaluate Resolved Date Model
y_resolved_date_pred = model_resolved_date.predict(X_test)
mae_resolved_date = mean_absolute_error(y_resolved_date_test, y_resolved_date_pred)
mse_resolved_date = mean_squared_error(y_resolved_date_test, y_resolved_date_pred)
rmse_resolved_date = np.sqrt(mse_resolved_date)

# Evaluate Resolved Days Model
y_resolved_days_pred = model_resolved_days.predict(X_test2)
mae_resolved_days = mean_absolute_error(y_resolved_days_test, y_resolved_days_pred)
mse_resolved_days = mean_squared_error(y_resolved_days_test, y_resolved_days_pred)
rmse_resolved_days = np.sqrt(mse_resolved_days)

# Step 7: Print Evaluation Metrics
print("Resolved Date Model Evaluation:")
print(f"MAE: {mae_resolved_date}")
print(f"MSE: {mse_resolved_date}")
print(f"RMSE: {rmse_resolved_date}")

print("\nResolved Days Model Evaluation:")
print(f"MAE: {mae_resolved_days}")
print(f"MSE: {mse_resolved_days}")
print(f"RMSE: {rmse_resolved_days}")

# Step 8: Save Models
joblib.dump(model_resolved_date, 'resolved_date_predictor.pkl')
joblib.dump(model_resolved_days, 'resolved_days_predictor.pkl')

print("Models saved successfully!")


Resolved Date Model Evaluation:
MAE: 36817386979869.19
MSE: 1.5992461924486187e+27
RMSE: 39990576295530.16

Resolved Days Model Evaluation:
MAE: 0.0
MSE: 0.0
RMSE: 0.0
Models saved successfully!


In [4]:
from sklearn.preprocessing import LabelEncoder
import os

# Recreate or load label encoder for 'area'
if os.path.exists('labelencoder_area.pkl'):
    le_area = joblib.load('labelencoder_area.pkl')
else:
    le_area = LabelEncoder()
    df['area_encoded'] = le_area.fit_transform(df['area'])
    joblib.dump(le_area, 'labelencoder_area.pkl')

# Recreate or load label encoder for 'type'
if os.path.exists('labelencoder_type.pkl'):
    le_type = joblib.load('labelencoder_type.pkl')
else:
    le_type = LabelEncoder()
    df['type_encoded'] = le_type.fit_transform(df['type'])
    joblib.dump(le_type, 'labelencoder_type.pkl')

# Recreate or load label encoder for 'department'
if os.path.exists('labelencoder_department.pkl'):
    le_department = joblib.load('labelencoder_department.pkl')
else:
    le_department = LabelEncoder()
    df['department_encoded'] = le_department.fit_transform(df['department'])
    joblib.dump(le_department, 'labelencoder_department.pkl')

# Recreate or load label encoder for 'predicted_priority'
if os.path.exists('labelencoder_priority.pkl'):
    le_priority = joblib.load('labelencoder_priority.pkl')
else:
    le_priority = LabelEncoder()
    df['predicted_priority_encoded'] = le_priority.fit_transform(df['predicted_priority'])
    joblib.dump(le_priority, 'labelencoder_priority.pkl')


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib

# Step 1: Load Data
df = pd.read_csv('final_complaints.csv')

# Step 2: Feature Engineering
# Ensure the date columns are in datetime format
df['filing_date'] = pd.to_datetime(df['filing_date'], errors='coerce')

# If 'resolved_date' column is not available, create it based on your business logic
# Assuming default resolution time is 7 days for complaints
default_resolution_days = 7  # Customize this duration based on business rules
df['resolved_date'] = df['filing_date'] + pd.to_timedelta(default_resolution_days, unit='D')

# Create 'resolved_days' as the number of days between 'filing_date' and 'resolved_date'
df['resolved_days'] = (df['resolved_date'] - df['filing_date']).dt.days

# Extract additional features from the filing_date
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_weekday'] = df['filing_date'].dt.weekday

# Encode categorical features using pre-trained LabelEncoders
le_area = joblib.load('labelencoder_area.pkl')  # Load the existing label encoder for 'area'
df['area_encoded'] = le_area.transform(df['area'])

le_type = joblib.load('labelencoder_type.pkl')  # Load the existing label encoder for 'type'
df['type_encoded'] = le_type.transform(df['type'])

le_department = joblib.load('labelencoder_department.pkl')  # Load the existing label encoder for 'department'
df['department_encoded'] = le_department.transform(df['department'])

le_priority = joblib.load('labelencoder_priority.pkl')  # Load the existing label encoder for 'predicted_priority'
df['predicted_priority_encoded'] = le_priority.transform(df['predicted_priority'])

# Step 3: Define Features and Targets
# Define Features (independent variables)
X = df[['area_encoded', 'filing_day', 'filing_month', 'filing_weekday', 
        'type_encoded', 'department_encoded', 'predicted_priority_encoded']]

# Define Targets (dependent variables)
y_resolved_date = df['resolved_date']  # Target: Exact resolved date
y_resolved_days = df['resolved_days']  # Target: Number of days taken to resolve

# Step 4: Train-Test Split for both models
X_train, X_test, y_resolved_date_train, y_resolved_date_test = train_test_split(X, y_resolved_date, test_size=0.2, random_state=42)
X_train2, X_test2, y_resolved_days_train, y_resolved_days_test = train_test_split(X, y_resolved_days, test_size=0.2, random_state=42)

# Step 5: Train Models
# Model 1: Random Forest for resolving date
model_resolved_date = RandomForestRegressor(n_estimators=100, random_state=42)
model_resolved_date.fit(X_train, y_resolved_date_train)

# Model 2: Random Forest for resolving days
model_resolved_days = RandomForestRegressor(n_estimators=100, random_state=42)
model_resolved_days.fit(X_train2, y_resolved_days_train)

# Step 6: Model Evaluation
# Evaluate the resolved date model
y_resolved_date_pred = model_resolved_date.predict(X_test)
mae_resolved_date = mean_absolute_error(y_resolved_date_test, y_resolved_date_pred)
mse_resolved_date = mean_squared_error(y_resolved_date_test, y_resolved_date_pred)
rmse_resolved_date = mse_resolved_date ** 0.5  # Manually calculating RMSE

# Evaluate the resolved days model
y_resolved_days_pred = model_resolved_days.predict(X_test2)
mae_resolved_days = mean_absolute_error(y_resolved_days_test, y_resolved_days_pred)
mse_resolved_days = mean_squared_error(y_resolved_days_test, y_resolved_days_pred)
rmse_resolved_days = mse_resolved_days ** 0.5  # Manually calculating RMSE

print(f"Resolved Date Model Evaluation:")
print(f"MAE: {mae_resolved_date}")
print(f"MSE: {mse_resolved_date}")
print(f"RMSE: {rmse_resolved_date}")

print(f"\nResolved Days Model Evaluation:")
print(f"MAE: {mae_resolved_days}")
print(f"MSE: {mse_resolved_days}")
print(f"RMSE: {rmse_resolved_days}")

# Step 7: Cross-validation to avoid overfitting
cv_scores_resolved_date = cross_val_score(model_resolved_date, X, y_resolved_date, cv=5, scoring='neg_mean_absolute_error')
cv_scores_resolved_days = cross_val_score(model_resolved_days, X, y_resolved_days, cv=5, scoring='neg_mean_absolute_error')

print(f"\nCross-validation Results (Resolved Date):")
print(f"CV MAE: {-cv_scores_resolved_date.mean()}")

print(f"\nCross-validation Results (Resolved Days):")
print(f"CV MAE: {-cv_scores_resolved_days.mean()}")

# Step 8: Save Models
joblib.dump(model_resolved_date, 'resolved_date_predictor.pkl')
joblib.dump(model_resolved_days, 'resolved_days_predictor.pkl')

print("Models saved successfully!")


Resolved Date Model Evaluation:
MAE: 0.0
MSE: 0.0
RMSE: 0.0

Resolved Days Model Evaluation:
MAE: 0.0
MSE: 0.0
RMSE: 0.0

Cross-validation Results (Resolved Date):
CV MAE: -0.0

Cross-validation Results (Resolved Days):
CV MAE: -0.0
Models saved successfully!


In [37]:
print("True Resolved Date Test Values:\n", y_resolved_date_test.head())
print("Predicted Resolved Date Values:\n", y_resolved_date_pred[:5])


True Resolved Date Test Values:
 1501   2024-01-14
2586   2024-02-03
2653   2024-02-29
1055   2024-03-30
705    2024-03-19
Name: resolved_date, dtype: datetime64[ns]
Predicted Resolved Date Values:
 [1.7051904e+18 1.7069184e+18 1.7091648e+18 1.7117568e+18 1.7108064e+18]


In [38]:
print("Target Variable (y_resolved_date):", y_resolved_date.head())


Target Variable (y_resolved_date): 0   2024-01-20
1   2024-03-06
2   2024-02-10
3   2024-04-16
4   2024-01-27
Name: resolved_date, dtype: datetime64[ns]


In [39]:
print("Check for NaN in target variables:", df[['resolved_date', 'resolved_days']].isna().sum())


Check for NaN in target variables: resolved_date    0
resolved_days    0
dtype: int64


In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

# Sample data loading (replace with actual data)
# df = pd.read_csv('your_dataset.csv')

# Convert date columns to datetime if they are not already
df['filing_date'] = pd.to_datetime(df['filing_date'])
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])

# Create a new column for the difference in days between filing_date and resolution_date_new
df['days_between_filing_resolution'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# Encode categorical variables (type, area, department)
label_encoder_type = LabelEncoder()
df['type_encoded'] = label_encoder_type.fit_transform(df['type'])

label_encoder_area = LabelEncoder()
df['area_encoded'] = label_encoder_area.fit_transform(df['area'])

label_encoder_department = LabelEncoder()
df['department_encoded'] = label_encoder_department.fit_transform(df['department'])

# Now, let's check the correlation of categorical features (encoded) with the target variable 'resolution_date_new'
# We will calculate Pearson correlation for numeric columns and Chi-square for categorical ones

# Pearson correlation for numeric columns
numeric_columns = ['days_between_filing_resolution']  # As filing_date and resolution_date_new are dates, we use the days difference
for column in numeric_columns:
    correlation = df[column].corr(df['days_between_filing_resolution'])
    print(f"Pearson Correlation between {column} and resolution_date_new: {correlation}")

# Chi-square test for categorical variables vs target ('resolution_date_new')
# For categorical features, we will perform a chi-square test on the contingency table between categorical variables and the target variable

categorical_columns = ['type_encoded', 'area_encoded', 'department_encoded']

for column in categorical_columns:
    # Create a contingency table between the categorical feature and the target variable
    contingency_table = pd.crosstab(df[column], df['days_between_filing_resolution'])
    
    # Perform chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    print(f"Chi-Square Test for {column} vs resolution_date_new:")
    print(f"Chi2 Statistic: {chi2}, P-value: {p}\n")



Pearson Correlation between days_between_filing_resolution and resolution_date_new: 0.9999999999999998
Chi-Square Test for type_encoded vs resolution_date_new:
Chi2 Statistic: 210.26215123767219, P-value: 0.9739849059565284

Chi-Square Test for area_encoded vs resolution_date_new:
Chi2 Statistic: 818.0152022738928, P-value: 0.434302144212549

Chi-Square Test for department_encoded vs resolution_date_new:
Chi2 Statistic: 210.26215123767216, P-value: 0.9739849059565284



In [41]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

# Assuming df is your dataframe and it already contains the necessary columns
# df = pd.read_csv('your_data.csv')  # Load your dataset

# Convert the date columns to datetime if not already
df['filing_date'] = pd.to_datetime(df['filing_date'])
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])

# Calculate the difference in days between 'filing_date' and 'resolution_date_new'
df['days_between_filing_resolution'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# Encode categorical variables (type, area, department) using LabelEncoder
label_encoder_type = LabelEncoder()
df['type_encoded'] = label_encoder_type.fit_transform(df['type'])

label_encoder_area = LabelEncoder()
df['area_encoded'] = label_encoder_area.fit_transform(df['area'])

label_encoder_department = LabelEncoder()
df['department_encoded'] = label_encoder_department.fit_transform(df['department'])

# Now, we will check the correlation for categorical and numerical variables with 'resolution_date_new'

# 1. Pearson correlation for numerical features (filing_date and resolution_date_new)
# Since 'filing_date' and 'resolution_date_new' are dates, we will use the difference in days

# Pearson correlation between days difference and the target variable (resolution_date_new)
correlation_filing_resolution = df['days_between_filing_resolution'].corr(df['days_between_filing_resolution'])
print(f"Pearson Correlation between filing_date and resolution_date_new (Days Difference): {correlation_filing_resolution}")

# 2. Chi-Square test for categorical features (type, area, department)
categorical_columns = ['type_encoded', 'area_encoded', 'department_encoded']

for column in categorical_columns:
    # Create a contingency table between the categorical feature and the target variable (days difference)
    contingency_table = pd.crosstab(df[column], df['days_between_filing_resolution'])
    
    # Perform chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    
    print(f"Chi-Square Test for {column} vs resolution_date_new (Days Difference):")
    print(f"Chi2 Statistic: {chi2}, P-value: {p}\n")



Pearson Correlation between filing_date and resolution_date_new (Days Difference): 0.9999999999999998
Chi-Square Test for type_encoded vs resolution_date_new (Days Difference):
Chi2 Statistic: 210.26215123767219, P-value: 0.9739849059565284

Chi-Square Test for area_encoded vs resolution_date_new (Days Difference):
Chi2 Statistic: 818.0152022738928, P-value: 0.434302144212549

Chi-Square Test for department_encoded vs resolution_date_new (Days Difference):
Chi2 Statistic: 210.26215123767216, P-value: 0.9739849059565284



In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib

# 1. Load the data (Assume df is already loaded with required columns)
# df = pd.read_csv('your_data.csv') # Un-comment and modify this to load your data

# 2. Convert 'filing_date' and 'resolution_date_new' to datetime if they are not
df['filing_date'] = pd.to_datetime(df['filing_date'])
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])

# 3. Calculate the target variable (days between filing_date and resolution_date_new)
df['resolved_days_new'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# 4. Prepare features (filing_date, resolved_days_new) and target (resolution_date_new)
df['filing_date_numeric'] = df['filing_date'].astype(np.int64) // 10**9  # Convert filing_date to seconds
X = df[['filing_date_numeric', 'resolved_days_new']]  # Features
y = df['resolution_date_new']  # Target

# 5. Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train a RandomForestRegressor model (or any other model you prefer)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 7. Predictions on test data
y_pred = model.predict(X_test)

# 8. Model evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Model Evaluation:")
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}")

# 9. Save the trained model to a file using joblib
joblib.dump(model, 'ETA_predictor.pkl')

print("Model saved successfully as 'ETA_predictor.pkl'.")



Model Evaluation:
MAE: 570659506206853.1
MSE: 4.601564749587255e+31
RMSE: 6783483433743503.0
MAPE: 6.819904699052829e-05
Model saved successfully as 'ETA_predictor.pkl'.


In [43]:
from sklearn.preprocessing import StandardScaler

# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Fit and transform your features

# Use scaled features for training and evaluation
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [45]:
import pandas as pd

# Assuming df is your dataframe
df['filing_date'] = pd.to_datetime(df['filing_date'], errors='coerce')


In [46]:
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_year'] = df['filing_date'].dt.year
df['filing_day_of_week'] = df['filing_date'].dt.weekday
df['filing_quarter'] = df['filing_date'].dt.quarter


In [47]:
# Fill missing dates with a default date (e.g., the first day of the year)
df['filing_date'].fillna(pd.to_datetime('2024-01-01'), inplace=True)


In [48]:
X = df[['filing_day', 'filing_month', 'filing_year', 'filing_day_of_week', 'filing_quarter', 'resolved_days_new']]
y = df['resolution_date_new']  # Target variable


In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [50]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)


In [51]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Example DataFrame (replace with your data)
df = pd.DataFrame({
    'filing_date': ['2024-01-01', '2024-02-15', '2024-03-10'],
    'resolved_days_new': [30, 60, 45],
    'resolution_date_new': ['2024-02-01', '2024-04-15', '2024-04-25']
})

# Convert filing_date to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Extract day, month, year, weekday, and quarter from filing_date
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_year'] = df['filing_date'].dt.year
df['filing_day_of_week'] = df['filing_date'].dt.weekday
df['filing_quarter'] = df['filing_date'].dt.quarter

# Target variable (convert resolution_date_new to days)
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])
df['days_to_resolution'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# Prepare features (X) and target (y)
X = df[['filing_day', 'filing_month', 'filing_year', 'filing_day_of_week', 'filing_quarter', 'resolved_days_new']]
y = df['days_to_resolution']  # Target is the number of days to resolution

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# Save the model
import joblib
joblib.dump(model, 'ETA_predictor.pkl')

# Check the model's performance
predictions = model.predict(X_scaled)
print(predictions)


[37.72 53.28 47.57]


In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import joblib
import warnings

warnings.filterwarnings('ignore')

# -------------------- Load & Prepare Data ---------------------
# df = pd.read_csv("your_data.csv")  # Uncomment if using CSV

df['filing_date'] = pd.to_datetime(df['filing_date'])
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])
df['resolved_days_new'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# -------------------- Feature Engineering ---------------------
df['filing_date_numeric'] = df['filing_date'].astype(np.int64) // 10**9
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_weekday'] = df['filing_date'].dt.weekday

# Features and Target
X = df[['filing_date_numeric', 'filing_day', 'filing_month', 'filing_weekday']]
y = df['resolved_days_new']

# -------------------- Train-Test Split ---------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -------------------- Smart CV Split ---------------------
n_samples = len(X_train)
cv_splits = min(5, n_samples if n_samples > 1 else 2)
cv = KFold(n_splits=cv_splits)

# -------------------- Hyperparameter Tuning ---------------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# -------------------- Evaluation ---------------------
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("\n🔍 Model Evaluation (Sharply Tuned):")
print(f"MAE  : {mae:.2f}")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"MAPE : {mape:.4f}")

# -------------------- Save Model ---------------------
joblib.dump(best_model, 'ETA_days_predictor_optimized.pkl')
print("\n✅ Optimized Model saved as 'ETA_days_predictor_optimized.pkl'")



🔍 Model Evaluation (Sharply Tuned):
MAE  : 23.68
MSE  : 560.74
RMSE : 23.68
MAPE : 0.7639

✅ Optimized Model saved as 'ETA_days_predictor_optimized.pkl'


In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
warnings.filterwarnings('ignore')

# -------------------- Load Data ---------------------
df = pd.read_csv('final_complaints_updated.csv')  # Uncomment if loading fresh

# Convert date columns
df['filing_date'] = pd.to_datetime(df['filing_date'])
df['resolution_date_new'] = pd.to_datetime(df['resolution_date_new'])

# Calculate target
df['resolved_days_new'] = (df['resolution_date_new'] - df['filing_date']).dt.days

# -------------------- Feature Engineering ---------------------

# Date-related features
df['filing_date_numeric'] = df['filing_date'].astype(np.int64) // 10**9
df['filing_day'] = df['filing_date'].dt.day
df['filing_month'] = df['filing_date'].dt.month
df['filing_weekday'] = df['filing_date'].dt.weekday

# Encode categorical variables
label_encoders = {}
for col in ['area', 'type', 'department', 'predicted_priority']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder for future use (if needed in deployment)

# -------------------- Define Features and Target ---------------------
features = [
    'filing_date_numeric', 'filing_day', 'filing_month', 'filing_weekday',
    'area', 'type', 'department', 'predicted_priority'
]
X = df[features]
y = df['resolved_days_new']

# -------------------- Train-Test Split ---------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------- Hyperparameter Tuning ---------------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# -------------------- Evaluate ---------------------
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mape = mean_absolute_percentage_error(y_test, y_pred)

print("\n🔍 Model Evaluation (With More Features):")
print(f"MAE  : {mae:.2f} days")
print(f"MSE  : {mse:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"MAPE : {mape:.4f}")

# -------------------- Save Model ---------------------
joblib.dump(best_model, 'ETA_days_predictor_extended.pkl')
print("\n✅ Improved Model saved as 'ETA_days_predictor_extended.pkl'.")


ValueError: 
All the 120 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1387, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1397, in _check_y
    y = check_array(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1107, in check_array
    _assert_all_finite(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input y contains NaN.


In [55]:
print(df.columns.tolist())


['filing_date', 'resolved_days_new', 'resolution_date_new', 'filing_day', 'filing_month', 'filing_year', 'filing_day_of_week', 'filing_quarter', 'days_to_resolution', 'filing_date_numeric', 'filing_weekday']


In [57]:
# Drop rows where 'resolved_days_new' is NaN
df = df.dropna(subset=['resolved_days_new'])


In [58]:
X = df.drop(columns=['resolved_days_new', 'resolution_date_new'])  # Drop target & unused date column
y = df['resolved_days_new']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [59]:
grid_search.fit(X_train, y_train)


ValueError: 
All the 120 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 973, in check_array
    array = array.astype(new_dtype)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 6643, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py", line 430, in astype
    return self.apply(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\blocks.py", line 758, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 133, in _astype_nansafe
    return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'C4074'

--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 360, in fit
    X, y = validate_data(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 973, in check_array
    array = array.astype(new_dtype)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\generic.py", line 6643, in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py", line 430, in astype
    return self.apply(
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\internals\blocks.py", line 758, in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 237, in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 182, in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
  File "C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\dtypes\astype.py", line 133, in _astype_nansafe
    return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'C0337'


In [60]:
# Check for non-numeric columns in the dataset
non_numeric_cols = X.select_dtypes(include=['object']).columns
print(non_numeric_cols)


Index(['complaint_id', 'complaint_text', 'status', 'area_eta', 'type_eta',
       'resolution_date', 'predicted_priority_eta', 'feedback_text',
       'resolution_status', 'resolved_on', 'variance_flag', 'type_anomaly',
       'area_anomaly'],
      dtype='object')


In [61]:
from sklearn.preprocessing import LabelEncoder

# Example for encoding a categorical column
label_encoder = LabelEncoder()
X['categorical_column'] = label_encoder.fit_transform(X['categorical_column'])


KeyError: 'categorical_column'