In [1]:
!pip install xgboost



In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load the data
years = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
dataframes = []
for year in years:
    data = pd.read_csv(f"EngineeringRanking_{year}.csv")
    dataframes.append(data)
data = pd.concat(dataframes)

In [3]:
# Combine data with EngineeringRanking.csv
combined_data = pd.read_csv("EngineeringRanking.csv")
data = pd.concat([data, combined_data])

In [4]:
# Handling missing values by filling with 0
data.fillna(0, inplace=True)

In [5]:
# Convert non-numeric columns to numeric types
numeric_columns = ['Score', 'Rank', 'TLR', 'RPC', 'GO', 'OI', 'Perception']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [6]:
# Remove rows with missing target variable (Rank_21)
data.dropna(subset=['Rank_21'], inplace=True)

In [13]:
# Feature selection
features = ['Score_21', 'TLR_21', 'RPC_21', 'GO_21', 'OI_21', 'Perception_21']
X = data[features]
y = data['Rank_21']  # Target variable for prediction

In [14]:
# Splitting data into train, test, and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Train the model
model = XGBRegressor()
model.fit(X_train, y_train)

In [10]:
# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on Test Set: {mse}")

Mean Squared Error on Test Set: 975.6032179533722


In [12]:
# Print the column names of combined_data
print(combined_data.columns)

# Feature selection
features = ['Score', 'TLR', 'RPC', 'GO', 'OI', 'Perception']


Index(['Institute Id', 'Institute Name', 'City', 'State', 'Score_21',
       'Rank_21', 'TLR_21', 'RPC_21', 'GO_21', 'OI_21', 'Perception_21',
       'Score_20', 'Rank_20', 'TLR_20', 'RPC_20', 'GO_20', 'OI_20',
       'Perception_20', 'Score_19', 'Rank_19', 'TLR_19', 'RPC_19', 'GO_19',
       'OI_19', 'Perception_19', 'Score_18', 'Rank_18', 'TLR_18', 'RPC_18',
       'GO_18', 'OI_18', 'Perception_18', 'Score_17', 'Rank_17', 'TLR_17',
       'RPC_17', 'GO_17', 'OI_17', 'Perception_17', 'Score_16', 'Rank_16',
       'TLR_16', 'RPC_16', 'GO_16', 'OI_16', 'Perception_16'],
      dtype='object')


In [11]:
# Validation on 2023 data
combined_data.dropna(subset=['Rank_21'], inplace=True)  # Remove rows with missing target values
X_val = combined_data[features]
y_val_true = combined_data['Rank_21']
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val_true, y_val_pred)
print(f"Mean Squared Error on Validation Set (2023 data): {val_mse}")

KeyError: "None of [Index(['Score', 'TLR', 'RPC', 'GO', 'OI', 'Perception'], dtype='object')] are in the [columns]"

In [None]:
# Get predictions for 2023 data
combined_data['Predicted_Rank_2023'] = y_val_pred
sorted_data_2023 = combined_data.sort_values(by='Predicted_Rank_2023')

In [None]:
# Displaying the top 5 predicted colleges for 2023
top_5_colleges_2023 = sorted_data_2023[['Institute Name', 'City', 'State', 'Rank_21', 'Predicted_Rank_2023']].head(5)
print("\nTop 5 Predicted Colleges for 2023:")
print(top_5_colleges_2023)

In [None]:
#Accuracy:

# Calculate accuracy
threshold = 1  # Define the threshold for accurate predictions (e.g., within ±1 rank)

# Count the number of accurate predictions
accurate_predictions = np.sum(np.abs(y_val_true - y_val_pred) <= threshold)

# Total number of predictions
total_predictions = len(y_val_true)

# Calculate accuracy
accuracy = (accurate_predictions / total_predictions) * 100
print(f"Accuracy on Validation Set (2023 data): {accuracy:.2f}%")


In [16]:
# DataFrame to compare actual and predicted ranks
comparison_df = pd.DataFrame({
    'Institute Name': combined_data['Institute Name'],
    'City': combined_data['City'],
    'State': combined_data['State'],
    'Actual Rank 2023': y_val_true,
    'Predicted Rank 2023': y_val_pred.round().astype(int)  # Round predicted ranks to the nearest integer
})

# Display the comparison DataFrame
print("Comparison of Actual and Predicted Ranks for 2023:")
print(comparison_df)


Comparison of Actual and Predicted Ranks for 2023:
                                 Institute Name       City           State  \
0         Indian Institute of Technology Madras    Chennai      Tamil Nadu   
1          Indian Institute of Technology Delhi  New Delhi           Delhi   
2         Indian Institute of Technology Bombay     Mumbai     Maharashtra   
3         Indian Institute of Technology Kanpur     Kanpur   Uttar Pradesh   
4      Indian Institute of Technology Kharagpur  Kharagpur     West Bengal   
..                                          ...        ...             ...   
195       The National Institute of Engineering     Mysore       Karnataka   
196        K. J. Somaiya College of Engineering     Mumbai     Maharashtra   
197  Kakatiya Institute of Technology & Science   Warangal       Telangana   
198             Walchand College of Engineering     Sangli     Maharashtra   
199                 Sri Venkateswara University   Tirupati  Andhra Pradesh   

     Actual 