In [None]:
!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Read the engineering rankings CSV (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="EngineeringRanking_Final.csv")

In [None]:
# Handling missing values by filling with 0
data.fillna(0, inplace=True)

In [None]:
# Remove rows with missing target variable (Rank_21)
data.dropna(subset=['Rank_21'], inplace=True)

In [None]:
# Feature selection
# Rank_21 is the prediction target, so it must not also be an input: with the
# target among the features the model can simply copy it, and every metric
# computed from its predictions is inflated by that leakage.
features = ['Score_21', 'TLR_21', 'RPC_21', 'GO_21', 'OI_21', 'Perception_21']
X = data[features]
y = data['Rank_21']  # Target variable for prediction

In [None]:
# Hold out 20% of the rows as a test set; the remaining 80% are used for training.
# Only these two subsets are produced here -- no separate validation split is made.
# random_state is fixed so the same split is obtained on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a gradient-boosted regressor, constructed with no arguments, on the training split
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# Score every row of `data` with the trained model and order the rows by the prediction.
# NOTE(review): the inputs are the columns listed in `features` and the rows include
# the ones used for training, so "2023" / "validation" are labels only here --
# confirm that this is the intended evaluation set.
data.dropna(subset=['Rank_21'], inplace=True)  # Remove rows with missing target values
X_val, y_val_true = data[features], data['Rank_21']
y_val_pred = model.predict(X_val)
data['Predicted_Rank_2023'] = y_val_pred
sorted_data_2023 = data.sort_values('Predicted_Rank_2023')

In [None]:
# Show the institutes ordered by predicted rank, next to their Rank_21 value
Predicted_Rank_23 = sorted_data_2023.loc[
    :, ['Institute Name', 'City', 'State', 'Rank_21', 'Predicted_Rank_2023']
]
print("\nPredicted Rank for 2023:", Predicted_Rank_23, sep="\n")

In [None]:
# Accuracy: percentage of predictions that fall within `threshold` ranks of the true value.
threshold = 1  # tolerance in rank positions (i.e. within +/-1 rank)

# Number of rows whose absolute error is inside the tolerance
accurate_predictions = (np.abs(y_val_true - y_val_pred) <= threshold).sum()

# Number of rows that were scored
total_predictions = y_val_true.shape[0]

# Hit rate expressed as a percentage
accuracy = (accurate_predictions / total_predictions) * 100
print(f"Accuracy on Validation Set (2023 data): {accuracy:.2f}%")


In [None]:
# Score the model on the rows it was fitted on (MSE, MAE and R-squared)
y_train_pred = model.predict(X_train)
train_mse, train_mae, train_r2 = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
print(
    "Training Set Evaluation:",
    f"Mean Squared Error on Training Set: {train_mse}",
    f"Mean Absolute Error on Training Set: {train_mae}",
    f"R-squared on Training Set: {train_r2}\n",
    sep="\n",
)

In [None]:
# Score the model on X_val / y_val_true (MSE, MAE and R-squared)
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_true=y_val_true, y_pred=y_val_pred)
val_mae = mean_absolute_error(y_true=y_val_true, y_pred=y_val_pred)
val_r2 = r2_score(y_true=y_val_true, y_pred=y_val_pred)
print("\n".join([
    "\nValidation Set Evaluation:",
    f"Mean Squared Error on Validation Set: {val_mse}",
    f"Mean Absolute Error on Validation Set: {val_mae}",
    f"R-squared on Validation Set: {val_r2}",
]))

In [None]:
# Score the model on the held-out test split (MSE, MAE and R-squared)
y_test_pred = model.predict(X_test)
test_mse, test_mae, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)
print(
    "\nTesting Set Evaluation:",
    f"Mean Squared Error on Testing Set: {test_mse}",
    f"Mean Absolute Error on Testing Set: {test_mae}",
    f"R-squared on Testing Set: {test_r2}",
    sep="\n",
)

In [None]:
# Check for overfitting or underfitting by comparing training and validation R-squared.
# Two floating-point scores are practically never exactly equal, so a strict
# comparison flagged any difference, however small, as a problem and the
# "performing well" branch could effectively never be reached. Gaps that stay
# within the tolerance below are now treated as equivalent.
r2_tolerance = 0.05  # largest train/validation R-squared gap still counted as a match
r2_gap = train_r2 - val_r2
if r2_gap > r2_tolerance:
    print("\nModel is overfitting (training R-squared > validation R-squared)")
elif r2_gap < -r2_tolerance:
    print("\nModel is underfitting (training R-squared < validation R-squared)")
else:
    print("\nModel is performing well (training R-squared ~= validation R-squared)")

In [None]:
# Table placing the true target values next to the model's predictions
comparison_df = data[['Institute Name', 'City', 'State']].copy()
comparison_df['Actual Rank 2023'] = y_val_true
comparison_df['Predicted Rank 2023'] = y_val_pred.round().astype(int)  # nearest whole rank

# Show the table
print("Comparison of Actual and Predicted Ranks for 2023:", comparison_df, sep="\n")


# CLUSTERING ALGORITHM

In [None]:
import pandas as pd

In [None]:
# Reload the rankings with missing values replaced by 0, then preview the last rows
data = pd.read_csv("EngineeringRanking_Final.csv").fillna(0)
data.tail()

In [None]:
def display_city_list(state):
    """Print every distinct city that has an institute in the given state.

    Args:
        state: State name, compared for exact equality against the 'State'
            column of the module-level `data` frame.

    When no row matches, a "No cities found" message is printed instead of a
    header followed by an empty list, in line with how
    display_institutes_by_city reports an unknown city.
    """
    cities = data.loc[data['State'] == state, 'City'].unique()
    if len(cities) == 0:
        print("No cities found in", state)
        return
    print("\nCities in", state, "are:")
    for city in cities:
        print(city)

def display_state_list():
    """Print each distinct value of the 'State' column of `data`, one per line."""
    unique_states = data['State'].unique()
    print("States with institutes are:")
    for state_name in unique_states:
        print(state_name)
        
def display_institutes_by_city(city):
    """Print a table of the Rank_16..Rank_21 values for each institute in `city`.

    Args:
        city: City name, compared for exact equality against the 'City'
            column of the module-level `data` frame.

    Prints a "No institutes found" message and returns early when no row matches.
    """
    row_format = "{:<70} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}"
    rank_columns = ['Rank_16', 'Rank_17', 'Rank_18', 'Rank_19', 'Rank_20', 'Rank_21']
    separator = '-' * 150

    matches = data[data['City'] == city]
    if matches.empty:
        print("No institutes found in", city)
        return

    print("\nInstitutes in", city, "from 2016 to 2021:")
    print(separator)
    print(row_format.format("Institute Name", "2016", "2017", "2018", "2019", "2020", "2021"))
    print(separator)
    for _, record in matches.iterrows():
        # One line per institute: name first, then the six yearly ranks in column order
        print(row_format.format(record['Institute Name'], *(record[col] for col in rank_columns)))
    print(separator)


# Main program: interactive drill-down from state -> city -> institutes.
# NOTE: input() waits for the user to type a value, so this cell cannot run unattended.
display_state_list()
state = input("Enter state: ")  # compared for exact equality with the 'State' column
display_city_list(state)
city = input("Enter City: ")  # compared for exact equality with the 'City' column
display_institutes_by_city(city)

# Prediction of Rank_24

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the rankings CSV into a fresh DataFrame
data = pd.read_csv(filepath_or_buffer="EngineeringRanking_Final.csv")

In [None]:
# Substitute 0 for every missing (NaN) value
data = data.fillna(value=0)

In [None]:
# Reshape the data into a time series format
time_series_data = data.melt(id_vars=['Institute Id', 'Institute Name', 'City', 'State'],
                             value_vars=['Rank_23', 'Score_23', 'TLR_23', 'RPC_23', 'GO_23', 'OI_23', 'Perception_23'],
                             var_name='Metric', value_name='Value')

In [None]:
# Group the data by institute and create a time series for each institute
grouped = time_series_data.groupby(['Institute Id', 'Institute Name', 'City', 'State', 'Metric'])

In [None]:
# Train an ARIMA model for each institute's time series
forecasts = []
for group_name, group in grouped:
    institute_id, institute_name, city, state, metric = group_name
    if metric == 'Rank':
        model = ARIMA(group['Value'], order=(1, 1, 1))
        model_fit = model.fit()
        forecast = model_fit.forecast(steps=1)
        forecasts.append([institute_id, institute_name, city, state, metric, forecast[0]])

In [None]:
# Alternatively, train a Random Forest Regression model on the *_23 metric columns.
# NOTE(review): the forest is fitted on X and then asked to predict that same X,
# so `predictions` are in-sample estimates of Rank_23 -- confirm before treating
# them as ranks for a later year.
features = ['Score_23', 'TLR_23', 'RPC_23', 'GO_23', 'OI_23', 'Perception_23']
target = 'Rank_23'
X, y = data[features], data[target]
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
predictions = model.predict(X)

In [None]:
# Convert each prediction to the nearest whole number
predictions_rounded = list(map(round, predictions))

In [None]:
# Attach the rounded predictions to each institute's identifying columns
predictions_2024 = data[['Institute Id', 'Institute Name', 'City', 'State']].assign(
    **{'Predicted Rank 2024': predictions_rounded}
)

print(predictions_2024)


In [None]:
# Write the predictions table to a CSV file, leaving out the index column
predictions_2024.to_csv(path_or_buf="Predictions of Rank_24.csv", index=False)