In [1]:
# load top features from pickel file
import pickle

top_features = pickle.load(open('top_features.pkl', 'rb'))

In [13]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# load cleaned data
data = pd.read_csv('cleaned_data.csv')

# bootstrap sample with z-score normalization


def bootstrap_features(data, target_col='SCORE_AR', n_samples=1000):
    # Separate features and target
    features = data.drop(columns=[target_col])
    target = data[target_col]
    
    # Bootstrap features
    feature_indices = resample(
        np.arange(len(features)), 
        n_samples=n_samples,
        replace=True,
        random_state=42
    )
    
    # Create bootstrapped dataset
    bootstrapped_features = features.iloc[feature_indices]
    bootstrapped_target = target.iloc[feature_indices]
    
    # Combine features and target
    bootstrapped_data = pd.concat([bootstrapped_features, bootstrapped_target], axis=1)
    
    return bootstrapped_data

# Apply bootstrapping
bootstrapped_data = bootstrap_features(data, target_col='SCORE_AR', n_samples=1000)

# Save bootstrapped data
bootstrapped_data.to_csv('bootstrapped_data.csv', index=False)

# calculate the mean of the top features
top_feature_means = bootstrapped_data[top_features].mean()

# display the top features and their mean values in dataframe
top_feature_means_df = pd.DataFrame(top_feature_means, columns=['Mean'])
top_feature_means_df

Unnamed: 0,Mean
PROJECT_AREAS_ENERGY_SUSTAINABILITY,28.946
KEYNOTE_SPEAKER_INVITED,219.453
PARTNERSHIP_COUNTRY_N_A,79.954
PARTNERSHIP_NATIONAL,48.894
PROJECT_AREAS_BIOMEDICAL_AND_HEALTHCARE_ENGINEERING,32.69
PROJECT_AREAS_ENERGY_SECURITY,35.708
PROJECT_AREAS_NATURAL_PRODUCTS_BIOREFINERY_AND_BIOTECHNOLOGY,44.638
PROJECT_AREAS_SMART_LIVING_AND_SUSTAINABLE_CITIES,71.014
PROJECT_AREAS_SMART_MANUFACTURING_AND_MATERIALS,62.194
PROJECT_AREAS_SUSTAINABLE_AND_RESILIENT_URBANISATION,73.512


In [11]:
# Training a model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Continue with model training using bootstrapped data
X = bootstrapped_data[top_features]
y = bootstrapped_data['SCORE_AR']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and calculate mean squared error
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.4108619517696752e-27


In [12]:
# Increase each feature in the dataset by 20%
X_increased = X * 1.2

# Predict the SCORE_AR for the modified dataset
y_pred_increased = model.predict(X_increased)

# Display the predictions
print(y_pred_increased)

[46.31  47.767 44.954 47.767 47.767 43.443 44.954 44.954 44.954 47.767
 46.31  44.954 47.767 43.443 46.31  43.443 46.31  47.767 39.4   46.31
 43.443 47.767 46.31  39.4   39.4   44.954 44.954 43.443 46.31  46.31
 44.954 46.31  46.31  39.4   44.954 47.767 44.954 47.767 39.4   43.443
 46.31  39.4   46.31  43.443 43.443 39.4   43.443 47.767 43.443 46.31
 46.31  46.31  46.31  47.767 44.954 39.4   46.31  43.443 46.31  43.443
 43.443 46.31  47.767 43.443 43.443 46.31  43.443 43.443 46.31  46.31
 39.4   47.767 47.767 43.443 47.767 43.443 39.4   46.31  46.31  46.31
 47.767 39.4   47.767 47.767 39.4   39.4   39.4   39.4   46.31  44.954
 44.954 39.4   44.954 44.954 39.4   44.954 47.767 43.443 43.443 39.4
 46.31  39.4   46.31  43.443 39.4   47.767 44.954 46.31  44.954 44.954
 39.4   44.954 47.767 44.954 39.4   47.767 43.443 44.954 39.4   43.443
 43.443 46.31  47.767 44.954 39.4   46.31  47.767 46.31  47.767 47.767
 44.954 47.767 46.31  47.767 44.954 44.954 46.31  43.443 43.443 47.767
 39.4   47.76

In [24]:
from scipy.optimize import minimize
import numpy as np
import pandas as pd

def predict_inverse(model, desired_score, top_features, feature_bounds=None, max_iter=2000):
    """
    Predict feature values for desired score using optimization
    """
    initial_guess = bootstrapped_data[top_features].mean().values
    
    if feature_bounds is None:
        feature_bounds = []
        for feature in top_features:
            min_val = bootstrapped_data[feature].min()
            max_val = bootstrapped_data[feature].max() * 100
            feature_bounds.append((min_val, max_val))
    
    def objective(x):
        x_df = pd.DataFrame([x], columns=top_features)
        pred = model.predict(x_df)[0]
        return abs(pred - desired_score)  # Changed to absolute difference
    
    best_result = None
    best_score = float('inf')
    
    # Try multiple optimization methods
    methods = ['L-BFGS-B', 'SLSQP', 'Nelder-Mead']
    
    # Try more starting points
    multipliers = [0.5, 1.0, 1.5, 2.0, 5.0, 10.0]
    starting_points = [initial_guess * m for m in multipliers]
    
    for method in methods:
        for start in starting_points:
            try:
                result = minimize(
                    objective,
                    start,
                    bounds=feature_bounds if method != 'Nelder-Mead' else None,
                    method=method,
                    options={
                        'maxiter': max_iter,
                        'ftol': 1e-8,
                        'gtol': 1e-8
                    }
                )
                
                if result.fun < best_score:
                    best_score = result.fun
                    best_result = result
                    
                # Early stop if we're close enough
                if best_score < 0.1:
                    break
                    
            except Exception as e:
                print(f"Method {method} failed: {str(e)}")
                continue
    
    optimized_values = pd.DataFrame([best_result.x], columns=top_features)
    predicted = model.predict(optimized_values)[0]
    
    print(f"Best optimization method achieved error: {best_score:.4f}")
    print(f"Optimization status: {best_result.message}")
    
    return optimized_values, predicted

# Example usage
desired_score = 50
feature_values, predicted_score = predict_inverse(model, desired_score, top_features)

print(f"Desired Score: {desired_score}")
print(f"Achieved Score: {predicted_score:.2f}")
print(f"Optimization Success: {abs(desired_score - predicted_score) < 1}")
display(feature_values)

  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(
  result = minimize(


Best optimization method achieved error: 3.0810
Optimization status: CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
Desired Score: 50
Achieved Score: 46.92
Optimization Success: False


Feature,PROJECT_AREAS_ENERGY_SUSTAINABILITY,KEYNOTE_SPEAKER_INVITED,PARTNERSHIP_COUNTRY_N_A,PARTNERSHIP_NATIONAL,PROJECT_AREAS_BIOMEDICAL_AND_HEALTHCARE_ENGINEERING,PROJECT_AREAS_ENERGY_SECURITY,PROJECT_AREAS_NATURAL_PRODUCTS_BIOREFINERY_AND_BIOTECHNOLOGY,PROJECT_AREAS_SMART_LIVING_AND_SUSTAINABLE_CITIES,PROJECT_AREAS_SMART_MANUFACTURING_AND_MATERIALS,PROJECT_AREAS_SUSTAINABLE_AND_RESILIENT_URBANISATION,SCHOLARSHIP_PHD,SCHOLARSHIP_MASTER,CONFERENCE_NATIONAL_INCLUDE_UNIVERSITY_LEVEL,PAPER_INCENTIVE_NO_OF_PAPER_STAFF,SCORE_OVERALL_RANK
0,43.419,329.1795,119.931,73.341,49.035,53.562,66.957,106.521,93.291,110.268,109.38,28.77,66.996,1008.42,284.673
