In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from datetime import timedelta

# Example data loading (replace with your actual data)
data = pd.read_csv('final_complaints.csv')

# Convert categorical features to numerical using one-hot encoding
encoder = OneHotEncoder(sparse_output=False)  # Corrected parameter name
encoded_features = encoder.fit_transform(data[['area', 'type', 'department']])

# Convert the encoded features back to a DataFrame for easier concatenation
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['area', 'type', 'department']))

# Combine the encoded features with the original numerical features
X = pd.concat([data[['filing_date', 'predicted_priority']], encoded_df], axis=1)

# Convert `filing_date` into numerical features (e.g., day of the year, month, year)
X['filing_date'] = pd.to_datetime(data['filing_date'])
X['day_of_year'] = X['filing_date'].dt.dayofyear
X['month'] = X['filing_date'].dt.month
X['year'] = X['filing_date'].dt.year

# Drop the original 'filing_date' column as it’s no longer needed
X = X.drop('filing_date', axis=1)

# Target variable: resolved_days_new (resolution time)
y = data['resolved_days_new']

# Ensure column names are all strings
X.columns = X.columns.astype(str)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Now, predict for new data
new_data = pd.DataFrame({
    'filing_date': ['2025-04-01'],
    'predicted_priority': [1],  # example priority: 1 for high
    'area': ['Hadapsar'],
    'type': ['Technical'],
    'department': ['IT']
})

# Convert `filing_date` to features
new_data['filing_date'] = pd.to_datetime(new_data['filing_date'])
new_data['day_of_year'] = new_data['filing_date'].dt.dayofyear
new_data['month'] = new_data['filing_date'].dt.month
new_data['year'] = new_data['filing_date'].dt.year

# Drop the original 'filing_date' column as it’s no longer needed
new_data = new_data.drop('filing_date', axis=1)

# One-hot encode categorical features
encoded_new_data = encoder.transform(new_data[['area', 'type', 'department']])

# Combine the new data into the same format as the training data
X_new = pd.concat([new_data[['predicted_priority']], pd.DataFrame(encoded_new_data)], axis=1)

# Predict resolution time
predicted_resolution_time = model.predict(X_new)[0]
print(f"Predicted Resolution Time: {predicted_resolution_time} days")

# Calculate expected completion date
resolution_date = new_data['filing_date'].iloc[0] + timedelta(days=predicted_resolution_time)
print(f"Expected Completion Date: {resolution_date.strftime('%d %B %Y')}")


ValueError: could not convert string to float: 'Medium'

In [8]:
from sklearn.preprocessing import LabelEncoder

# Apply LabelEncoder to the categorical columns
label_encoder = LabelEncoder()

# Assuming 'Category' is the column with categorical data
data['predicted_priority'] = label_encoder.fit_transform(data['predicted_priority'])

# Now you can proceed with your model training
X = data.drop('resolved_days_new', axis=1)  # Assuming 'target' is the column you're predicting
y = data['resolved_days_new']

# Proceed with splitting data and training the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)


ImportError: cannot import name 'UnsetMetadataPassedError' from 'sklearn.exceptions' (C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\exceptions.py)

In [4]:
print(X_train.dtypes)  # This will show the data types of all columns


NameError: name 'X_train' is not defined

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib

# Load the dataset (Assuming df is your DataFrame)
# df = pd.read_csv('your_data.csv')  # Uncomment to load your dataset if you have a file

# Sample DataFrame for illustration (replace with your actual data)
data = {
    'area': ['Area1', 'Area2', 'Area1', 'Area3'],
    'type': ['Type1', 'Type2', 'Type1', 'Type3'],
    'filing_date': ['2022-01-15', '2023-03-20', '2021-06-25', '2024-07-11'],
    'department': ['HR', 'Finance', 'Engineering', 'Marketing'],
    'predicted_priority': [1, 2, 3, 1],
    'resolved_days_new': [12, 15, 8, 14]
}

df = pd.DataFrame(data)

# Convert 'filing_date' to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Extract features from the 'filing_date' (e.g., day, month, year)
df['filing_year'] = df['filing_date'].dt.year
df['filing_month'] = df['filing_date'].dt.month
df['filing_day'] = df['filing_date'].dt.day

# Drop the original 'filing_date' column as it's no longer needed
df.drop('filing_date', axis=1, inplace=True)

# One-hot encoding for categorical columns
df = pd.get_dummies(df, drop_first=True)

# Features (X) and target (y)
X = df.drop('resolved_days_new', axis=1)
y = df['resolved_days_new']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Standardize features (not necessary for RandomForest but might help with other models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Save the model as a .pkl file
joblib.dump(model, 'resolution_time_predictor.pkl')

# Example of predicting on new data:
new_data = {
    'area_Area2': 1,  # Assuming new data has one hot encoding already applied
    'type_Type3': 0,
    'department_Engineering': 1,
    'predicted_priority': 2,
    'filing_year': 2023,
    'filing_month': 4,
    'filing_day': 20
}

# Convert the new data into the same format (as if it was one-hot encoded)
new_data_df = pd.DataFrame([new_data])

# Standardize the new data if you used scaling (standardize new data similarly)
new_data_scaled = scaler.transform(new_data_df)

# Make prediction

predicted_resolution_time = model.predict(new_data_scaled)

# Example output
print(f"Predicted Resolution Time: {predicted_resolution_time[0]} days")

# Calculate expected completion date (add predicted resolution time to filing date)
filing_date = pd.to_datetime("2023-04-20")  # Example filing date
expected_completion_date = filing_date + pd.to_timedelta(predicted_resolution_time[0], unit='D')
print(f"Expected Completion Date: {expected_completion_date.strftime('%d %B %Y')}")


ImportError: cannot import name '_fit_context' from 'sklearn.base' (C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py)

In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib


df = pd.read_csv('final_complaints.csv') 
# Convert 'filing_date' to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Extract features from the 'filing_date' (e.g., day, month, year)
df['filing_year'] = df['filing_date'].dt.year
df['filing_month'] = df['filing_date'].dt.month
df['filing_day'] = df['filing_date'].dt.day

# Drop the original 'filing_date' column as it's no longer needed
df.drop('filing_date', axis=1, inplace=True)


# One-hot encoding for categorical columns
df = pd.get_dummies(df, drop_first=True)

# Features (X) and target (y)
X = df.drop('resolved_days_new', axis=1)
y = df['resolved_days_new']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Standardize features (not necessary for RandomForest but might help with other models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Save the model as a .pkl file
joblib.dump(model, 'resolution_time_predictor.pkl')

# To make predictions for new data from a CSV file:
# Assuming new data for prediction is stored in another CSV file, load that file
new_data_df = pd.read_csv('new_data.csv')  # Replace 'new_data.csv' with your new data CSV file path

# Convert 'filing_date' to datetime for new data
new_data_df['filing_date'] = pd.to_datetime(new_data_df['filing_date'])

# Extract features from the 'filing_date' (e.g., day, month, year) for new data
new_data_df['filing_year'] = new_data_df['filing_date'].dt.year
new_data_df['filing_month'] = new_data_df['filing_date'].dt.month
new_data_df['filing_day'] = new_data_df['filing_date'].dt.day

# Drop the original 'filing_date' column from new data
new_data_df.drop('filing_date', axis=1, inplace=True)

# One-hot encoding for categorical columns in new data
new_data_df = pd.get_dummies(new_data_df, drop_first=True)

# Make sure the columns in the new data match the columns in the training data
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Standardize the new data using the same scaler
new_data_scaled = scaler.transform(new_data_df)

# Make predictions on the new data
predicted_resolution_times = model.predict(new_data_scaled)

# Display a subset of the predictions (e.g., the first 10 rows or a random sample)
num_display = 10  # Adjust this number as needed

# Display the first 'num_display' predictions
for i in range(num_display):
    filing_date = pd.to_datetime(new_data_df['filing_date'].iloc[i])  # Get the corresponding filing date
    expected_completion_date = filing_date + pd.to_timedelta(predicted_resolution_times[i], unit='D')
    
    print(f"Predicted Resolution Time for row {i+1}: {predicted_resolution_times[i]:.2f} days")
    print(f"Expected Completion Date for row {i+1}: {expected_completion_date.strftime('%d %B %Y')}\n")

# Optional: For a random sample, use this code instead of the loop above
# random_indices = np.random.choice(new_data_scaled.shape[0], size=num_display, replace=False)
# for i in random_indices:
#     filing_date = pd.to_datetime(new_data_df['filing_date'].iloc[i])
#     expected_completion_date = filing_date + pd.to_timedelta(predicted_resolution_times[i], unit='D')
#     print(f"Predicted Resolution Time for row {i+1}: {predicted_resolution_times[i]:.2f} days")
#     print(f"Expected Completion Date for row {i+1}: {expected_completion_date.strftime('%d %B %Y')}\n")


ImportError: cannot import name '_fit_context' from 'sklearn.base' (C:\Users\SAMYAK KHANDERAO\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import joblib

# Load the dataset from CSV
df = pd.read_csv('your_data.csv')  # Replace 'your_data.csv' with the path to your CSV file

# Convert 'filing_date' to datetime
df['filing_date'] = pd.to_datetime(df['filing_date'])

# Extract features from the 'filing_date' (e.g., day, month, year)
df['filing_year'] = df['filing_date'].dt.year
df['filing_month'] = df['filing_date'].dt.month
df['filing_day'] = df['filing_date'].dt.day

# Drop the original 'filing_date' column as it's no longer needed
df.drop('filing_date', axis=1, inplace=True)

# One-hot encoding for categorical columns
df = pd.get_dummies(df, drop_first=True)

# Features (X) and target (y)
X = df.drop('resolved_days_new', axis=1)
y = df['resolved_days_new']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Standardize features (not necessary for RandomForest but might help with other models)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Save the model as a .pkl file
joblib.dump(model, 'resolution_time_predictor.pkl')

# To make predictions for new data from a CSV file:
# Assuming new data for prediction is stored in another CSV file, load that file
new_data_df = pd.read_csv('new_data.csv')  # Replace 'new_data.csv' with your new data CSV file path

# Convert 'filing_date' to datetime for new data
new_data_df['filing_date'] = pd.to_datetime(new_data_df['filing_date'])

# Extract features from the 'filing_date' (e.g., day, month, year) for new data
new_data_df['filing_year'] = new_data_df['filing_date'].dt.year
new_data_df['filing_month'] = new_data_df['filing_date'].dt.month
new_data_df['filing_day'] = new_data_df['filing_date'].dt.day

# Drop the original 'filing_date' column from new data
new_data_df.drop('filing_date', axis=1, inplace=True)

# One-hot encoding for categorical columns in new data
new_data_df = pd.get_dummies(new_data_df, drop_first=True)

# Make sure the columns in the new data match the columns in the training data
new_data_df = new_data_df.reindex(columns=X.columns, fill_value=0)

# Standardize the new data using the same scaler
new_data_scaled = scaler.transform(new_data_df)

# Make predictions on the new data
predicted_resolution_times = model.predict(new_data_scaled)

# Display a subset of the predictions (e.g., the first 10 rows or a random sample)
num_display = 10  # Adjust this number as needed

# Debugging: Check if the predictions are made correctly
if len(predicted_resolution_times) == len(new_data_scaled):
    print(f"Predictions are successfully made for {len(predicted_resolution_times)} rows.")
else:
    print(f"Error: Predictions count does not match the number of rows in new data.")

# Display the first 'num_display' predictions
for i in range(num_display):
    filing_date = pd.to_datetime(new_data_df['filing_date'].iloc[i])  # Get the corresponding filing date
    expected_completion_date = filing_date + pd.to_timedelta(predicted_resolution_times[i], unit='D')
    
    print(f"Predicted Resolution Time for row {i+1}: {predicted_resolution_times[i]:.2f} days")
    print(f"Expected Completion Date for row {i+1}: {expected_completion_date.strftime('%d %B %Y')}\n")

# Optional: For a random sample, use this code instead of the loop above
# random_indices = np.random.choice(new_data_scaled.shape[0], size=num_display, replace=False)
# for i in random_indices:
#     filing_date = pd.to_datetime(new_data_df['filing_date'].iloc[i])
#     expected_completion_date = filing_date + pd.to_timedelta(predicted_resolution_times[i], unit='D')
#     print(f"Predicted Resolution Time for row {i+1}: {predicted_resolution_times[i]:.2f} days")
#     print(f"Expected Completion Date for row {i+1}: {expected_completion_date.strftime('%d %B %Y')}\n")


In [1]:
import sklearn
print(sklearn.__version__)
 

1.2.2


In [7]:
python -c "import sklearn; print(sklearn.__version__)"


SyntaxError: invalid syntax (1478574462.py, line 1)

In [1]:
import numpy
import pandas

print("NumPy version:", numpy.__version__)
print("Pandas version:", pandas.__version__)


NumPy version: 1.26.4
Pandas version: 2.2.2


In [2]:
import pandas as pd
import numpy as np
import sklearn
import joblib

print("Versions of Libraries Used:")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Joblib version: {joblib.__version__}")


Versions of Libraries Used:
Pandas version: 2.2.2
Numpy version: 1.26.4
Scikit-learn version: 1.6.1
Joblib version: 1.4.2
