In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# === Load Data ===
df = pd.read_csv('final_complaints.csv')

# === Filter & Clean ===
df = df[['complaint_text', 'type', 'area', 'predicted_priority']].dropna()

# Encode target
label_encoder = LabelEncoder()
df['priority_encoded'] = label_encoder.fit_transform(df['predicted_priority'])

# Features & Target
X = df[['complaint_text', 'type', 'area']]
y = df['priority_encoded']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42)

# === Preprocessing ===
text_feature = 'complaint_text'
cat_features = ['type', 'area']

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), text_feature),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

# === Model ===
model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    n_estimators=250,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.9,
    random_state=42,
    tree_method='hist'
)

# === Full Pipeline ===
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# === Training ===
pipeline.fit(X_train, y_train)

# === Evaluation ===
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"🔍 Accuracy: {acc*100:.2f}%\n")
print("🔍 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# === Save Model ===
joblib.dump(pipeline, 'newpriority_predictor.pkl')
joblib.dump(label_encoder, 'newpriority_label_encoder.pkl')


🔍 Accuracy: 100.00%

🔍 Classification Report:

              precision    recall  f1-score   support

        High       1.00      1.00      1.00       495
         Low       1.00      1.00      1.00       207
      Medium       1.00      1.00      1.00       298

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



['newpriority_label_encoder.pkl']

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib

# ---------------- Load and preprocess training data ----------------
df = pd.read_csv('final_complaints.csv') 
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Save original filing date for optional later use
df['original_filing_date'] = df['filing_date']

# Extract datetime features
df['filing_year'] = df['filing_date'].dt.year
df['filing_month'] = df['filing_date'].dt.month
df['filing_day'] = df['filing_date'].dt.day

# Drop original filing_date
df.drop('filing_date', axis=1, inplace=True)

# One-hot encode categorical features
df = pd.get_dummies(df, drop_first=True)

# Split features and target
X = df.drop(['resolved_days_new', 'original_filing_date'], axis=1)
y = df['resolved_days_new']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standard scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")

# Save model
joblib.dump(model, 'resolution_time_predictorr.pkl')
joblib.dump(scaler, 'scalerr.pkl')  # Save the scaler too for consistent preprocessing

# ---------------- Predict on New Data ----------------
new_data = pd.read_csv('new_data.csv') 
new_data['filing_date'] = pd.to_datetime(new_data['filing_date'])

# Keep original date for output
original_filing_dates = new_data['filing_date'].copy()

# Extract date parts
new_data['filing_year'] = new_data['filing_date'].dt.year
new_data['filing_month'] = new_data['filing_date'].dt.month
new_data['filing_day'] = new_data['filing_date'].dt.day

# Drop original date
new_data.drop('filing_date', axis=1, inplace=True)

# One-hot encode new data
new_data = pd.get_dummies(new_data, drop_first=True)

# Align new data with training data
new_data = new_data.reindex(columns=X.columns, fill_value=0)

# Load saved scaler (if needed in separate script)
# scaler = joblib.load('scaler.pkl')

# Scale new data
new_data_scaled = scaler.transform(new_data)

# Predict
predicted_times = model.predict(new_data_scaled)

# Show predictions with expected resolution date
print("\n--- Predicted Resolution Dates ---")
for i in range(min(10, len(predicted_times))):
    filing_date = original_filing_dates.iloc[i]
    expected_resolution = filing_date + pd.to_timedelta(predicted_times[i], unit='D')
    print(f"Row {i+1}:")
    print(f"  Predicted Days: {predicted_times[i]:.2f}")
    print(f"  Filing Date: {filing_date.strftime('%d %B %Y')}")
    print(f"  Expected Completion Date: {expected_resolution.strftime('%d %B %Y')}\n")


Mean Absolute Error: 1.51


FileNotFoundError: [Errno 2] No such file or directory: 'new_data.csv'

In [7]:
import sklearn
import numpy as np
import scipy
import joblib
import pandas as pd
import matplotlib
import seaborn as sns
import streamlit as st

import xgboost as xgb
import lightgbm as lgb



# Print the version of each library
print(f"scikit-learn version: {sklearn.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {scipy.__version__}")
print(f"joblib version: {joblib.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
print(f"seaborn version: {sns.__version__}")
print(f"streamlit version: {st.__version__}")

print(f"xgboost version: {xgb.__version__}")
print(f"lightgbm version: {lgb.__version__}")



scikit-learn version: 1.6.1
numpy version: 1.26.4
scipy version: 1.15.2
joblib version: 1.4.2
pandas version: 2.2.2
matplotlib version: 3.10.1
seaborn version: 0.13.2
streamlit version: 1.44.1
xgboost version: 3.0.0
lightgbm version: 4.6.0


In [8]:
import streamlit as st

# Print the version of Streamlit
st.write("Streamlit version:", st.__version__)


2025-04-15 18:10:19.815 
  command:

    streamlit run C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
