In [319]:
pip install pymupdf



In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import google.generativeai as genai
import json
import fitz 

In [3]:
genai.configure(api_key="AIzaSyBYQ5SgfI8mtSnJftWmZduop8TTuDpdzWQ")

In [4]:
df = pd.read_csv("project_plans.csv")
df_prioritized = df.copy()  # Create a copy for prioritization

In [5]:
category_weightage = {
    "Healthcare": 10,
    "Infrastructure": 9,
    "Education": 8,
    "Water & Sanitation": 8,
    "Energy": 7,
    "Transport": 6,
    "Environment": 5,
    "Social Welfare": 4,
    "Tourism": 3,
    "IT & Digital Services": 3
}

In [6]:
df_prioritized["Category_Weight"] = df_prioritized["Category"].map(category_weightage).fillna(0)

In [7]:
scaler = MinMaxScaler()
df_prioritized["Duration_Score"] = 1 - scaler.fit_transform(df_prioritized[["Duration"]].copy())  # Shorter duration gets higher weight
df_prioritized["Cost_Score"] = 1 - scaler.fit_transform(df_prioritized[["Estimated_Cost (INR)"]].copy())

In [8]:
label_encoder = LabelEncoder()
df_prioritized["Category_Encoded"] = label_encoder.fit_transform(df_prioritized["Category"])

In [9]:
df_prioritized["Priority_Score"] = df_prioritized["Category_Weight"] * 0.5 + df_prioritized["Duration_Score"] * 0.25 + df_prioritized["Cost_Score"] * 0.25

In [10]:
features = df_prioritized[["Category_Encoded", "Estimated_Cost (INR)", "Duration"]]
target = df_prioritized["Priority_Score"]

In [11]:
features = df_prioritized[["Category_Encoded", "Estimated_Cost (INR)", "Duration"]]
target = df_prioritized["Priority_Score"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)



In [12]:
df_prioritized["Priority_Score"] = df_prioritized["Category_Weight"] * 0.4 + df_prioritized["Duration_Score"] * 0.3 + df_prioritized["Cost_Score"] * 0.3

In [13]:
features = df_prioritized[["Category_Encoded", "Estimated_Cost (INR)", "Duration"]]
target = df_prioritized["Priority_Score"]

In [14]:
df_prioritized.to_csv("prioritized_project_plans.csv", index=False)

In [15]:
model_trained = False  # Initialize the flag
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train.fillna(X_train.mean()), y_train.fillna(y_train.mean()))
model_trained = True  # Set the flag to True after training

In [16]:
from sklearn.metrics import mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100  

# Make predictions
y_pred = model.predict(X_test)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculate MAPE
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

Mean Absolute Error (MAE): 0.6573
Mean Absolute Percentage Error (MAPE): 23.34%


In [17]:

def predict_priority(category, estimated_cost, duration):
            if not model_trained:
              raise ValueError("Model is not trained yet.")
            
            category_encoded = label_encoder.transform([category])[0] if category in label_encoder.classes_ else 0
            
            input_data = pd.DataFrame([[category_encoded, estimated_cost, duration]], 
                          columns=["Category_Encoded", "Estimated_Cost (INR)", "Duration"])
            
            predicted_priority = model.predict(input_data)[0]
            return predicted_priority

In [18]:
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)  # Using fitz instead of pymupdf
        text = "\n".join([page.get_text("text") for page in doc])
        return text
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

In [19]:
import re

def extract_project_details(report_text):
    model = genai.GenerativeModel("gemini-1.5-flash")
    prompt = f"""Extract the following details from the given project report:
    - Project_Name
    - Category
    - Estimated_Cost (INR)
    - Start_Year
    - End_Year
    - Duration (Years)
    
    Report: {report_text}
    
    Provide the response in **valid** JSON format, with no extra explanations."""
    
    response = model.generate_content(prompt)
    print("Raw Gemini API response:", response.text)  # Debugging

    try:
        # Extract JSON from response using regex
        json_text = re.search(r"\{.*\}", response.text, re.DOTALL)
        if json_text:
            return json.loads(json_text.group())  # Parse valid JSON
        else:
            print("No JSON detected in response.")
            return {}
    except json.JSONDecodeError:
        print("Error parsing response from Gemini API.")
        return {}


In [20]:
def add_project_from_pdf(pdf_path):
    global df, df_prioritized  # Ensure we modify the global dataframes
    report_text = extract_text_from_pdf(pdf_path)
    if not report_text:
        print("No text extracted from PDF.")
        return

    project_details = extract_project_details(report_text)
    
    if not project_details:
        print("No valid project details extracted.")
        return

    category = project_details.get("Category", "Unknown")
    estimated_cost = project_details.get("Estimated_Cost (INR)", 0)
    duration = project_details.get("Duration (Years)", 0)

    predicted_priority = predict_priority(category, estimated_cost, duration)

    new_row = {
        "Project_Name": project_details.get("Project_Name", "Unknown"),
        "Category": category,
        "Estimated_Cost (INR)": estimated_cost,
        "Start_Year": project_details.get("Start_Year", 0),
        "End_Year": project_details.get("End_Year", 0),
        "Duration": duration
    }

    # Add new row to project_plans.csv
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)  # Fix for Pandas 2.0+
    df.to_csv("project_plans.csv", index=False)  # Save updated project plans CSV

    # Add new row to prioritized_project_plans.csv with priority score
    new_row_prioritized = new_row.copy()
    new_row_prioritized["Priority_Score"] = predicted_priority
    df_prioritized = pd.concat([df_prioritized, pd.DataFrame([new_row_prioritized])], ignore_index=True)  # Fix for Pandas 2.0+
    df_prioritized = df_prioritized.sort_values(by="Priority_Score", ascending=False)
    df_prioritized.to_csv("prioritized_project_plans.csv", index=False)  # Save updated prioritized CSV

    # Print the score and rank of the new project
    new_project_rank = df_prioritized.reset_index().index[df_prioritized["Project_Name"] == new_row["Project_Name"]][0] + 1
    new_project_score = df_prioritized[df_prioritized["Project_Name"] == new_row["Project_Name"]]["Priority_Score"].values[0]
    print(f"New project added successfully! Its score is: {new_project_score} and its rank in the priority list is: {new_project_rank}")

In [21]:
df_prioritized = df_prioritized.sort_values(by="Priority_Score", ascending=False)
df_prioritized.to_csv("prioritized_project_plans.csv", index=False)

In [22]:
print(df.head(10))

  Project_ID                       Project_Name            Category  \
0       P001           Rural Ambulance Services          Healthcare   
1       P002  Drinking Water Purification Units  Water & Sanitation   
2       P003           Village Smart Classrooms           Education   
3       P004     Rural Health Awareness Program          Healthcare   
4       P005         Water Supply Line Upgrades      Infrastructure   
5       P006    Road Safety Improvement Project      Infrastructure   
6       P007          Public Library Renovation           Education   
7       P008      Community Toilet Construction  Water & Sanitation   
8       P009             Bus Stop Modernization           Transport   
9       P010              Solar Street Lighting              Energy   

   Estimated_Cost (INR)  Start_Year  End_Year  Duration  
0            2000000000        2024      2026         2  
1            1800000000        2024      2028         4  
2            2200000000        2025      202

In [24]:
add_project_from_pdf(r"pdf\\tourism.pdf") 

Raw Gemini API response: ```json
{
  "Project_Name": "Heritage Tourism Promotion",
  "Category": "Tourism",
  "Estimated_Cost (INR)": 1000000,
  "Start_Year": 2025,
  "End_Year": 2026,
  "Duration (Years)": 1
}
```

New project added successfully! Its score is: 3.013823766526927 and its rank in the priority list is: 23
