In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error, r2_score

In [10]:
'''
# Load the data
years = ['2016', '2017', '2018', '2019', '2020', '2021', '2022']
dataframes = []
for year in years:
    data = pd.read_csv(f"EngineeringRanking_{year}.csv")
    dataframes.append(data)
data = pd.concat(dataframes)
'''

'\n# Load the data\nyears = [\'2016\', \'2017\', \'2018\', \'2019\', \'2020\', \'2021\', \'2022\']\ndataframes = []\nfor year in years:\n    data = pd.read_csv(f"EngineeringRanking_{year}.csv")\n    dataframes.append(data)\ndata = pd.concat(dataframes)\n'

In [11]:
'''
# Combine data with EngineeringRanking.csv
combined_data = pd.read_csv("EngineeringRanking_Final.csv")
data = pd.concat([data, combined_data])
'''
data = pd.read_csv(f"EngineeringRanking_Final.csv")

In [12]:
# Handling missing values by filling with 0
data.fillna(0, inplace=True)

In [13]:
# Convert non-numeric columns to numeric types
numeric_columns = ['Score', 'Rank', 'TLR', 'RPC', 'GO', 'OI', 'Perception']
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')

KeyError: "None of [Index(['Score', 'Rank', 'TLR', 'RPC', 'GO', 'OI', 'Perception'], dtype='object')] are in the [columns]"

In [7]:
# Remove rows with missing target variable (Rank_21)
data.dropna(subset=['Rank_21'], inplace=True)

In [8]:
# Feature selection
features = ['Score_22', 'TLR_22', 'RPC_22', 'GO_22', 'OI_22', 'Perception_22']
X = data[features]
y = data['Rank_22']  # Target variable for prediction

KeyError: 'Rank_22'

In [None]:
# Splitting data into train, test, and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
### Feature Scaling
'''
## Min-Max Scaling:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X)

'''
## Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
###Checking Test Data for Missing Values

# Concatenate X_test and y_test into a single DataFrame
test_data = pd.concat([X_test, y_test], axis=1)

# Checking for missing values in the entire test data
missing_values_test = test_data.isnull().sum()
print("Missing values in test data:")
print(missing_values_test)

In [None]:
# Train the model on scaled data
model = XGBRegressor()
model.fit(X_train_scaled, y_train)

In [None]:
# Validation on 2023 data
combined_data.dropna(subset=['Rank_22'], inplace=True)  # Remove rows with missing target values
X_val = combined_data[features]
y_val_true = combined_data['Rank_22']

# Feature scaling on validation data
X_val_scaled = scaler.transform(X_val)

In [None]:
# Evaluate the model on the training set
y_train_pred = model.predict(X_train_scaled)
train_mse = mean_squared_error(y_train, y_train_pred)
print(f"Mean Squared Error on Training Set: {train_mse}")

In [None]:
# Evaluate the model on the validation set
y_val_pred = model.predict(X_val_scaled)
val_mse = mean_squared_error(y_val_true, y_val_pred)
print(f"Mean Squared Error on Validation Set (2023 data): {val_mse}")

In [None]:
# Evaluate the model on the test set
y_test_pred = model.predict(X_test_scaled)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Mean Squared Error on Test Set: {test_mse}")

In [None]:
# Check for overfitting or underfitting
if train_mse < val_mse:
    print("The model might be overfitting.")
elif train_mse > val_mse:
    print("The model might be underfitting.")
else:
    print("The model is likely properly fitted.")

In [None]:
# Get predictions for 2023 data
combined_data['Predicted_Rank_2023'] = y_val_pred
sorted_data_2023 = combined_data.sort_values(by='Predicted_Rank_2023')

In [None]:
#Accuracy:

# Calculate accuracy
threshold = 1  # Define the threshold for accurate predictions (e.g., within ±1 rank)

# Count the number of accurate predictions
accurate_predictions = np.sum(np.abs(y_val_true - y_val_pred) <= threshold)

# Total number of predictions
total_predictions = len(y_val_true)

# Calculate accuracy
accuracy = (accurate_predictions / total_predictions) * 100
print(f"Accuracy on Validation Set (2023 data): {accuracy:.2f}%")


In [None]:
# DataFrame to compare actual and predicted ranks
comparison_df = pd.DataFrame({
    'Institute Name': combined_data['Institute Name'],
    'City': combined_data['City'],
    'State': combined_data['State'],
    'Actual Rank 2023': y_val_true,
    'Predicted Rank 2023': y_val_pred.round().astype(int)  # Round predicted ranks to the nearest integer})

# # Display the comparison DataFrame
print("Comparison of Actual and Predicted Ranks for 2023:")
print(comparison_df)


# CLUSTERING ALGORITHM

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv("EngineeringRanking.csv")
data.fillna(0, inplace=True)
data.tail()

In [None]:
def display_city_list(state):
    cities = data[data['State'] == state]['City'].unique()
    print("\nCities in", state, "are:")
    for city in cities:
        print(city)

def display_state_list():
    states = data['State'].unique()
    print("States with institutes are:")
    for state in states:
        print(state)
        
def display_institutes_by_city(city):
    city_data = data[data['City'] == city]
    if len(city_data) == 0:
        print("No institutes found in", city)
        return
    print("\nInstitutes in", city, "from 2016 to 2021:")
    print('-' * 150)
    print("{:<70} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
        "Institute Name", "2016", "2017", "2018", "2019", "2020", "2021"))
    print('-' * 150)
    for index, row in city_data.iterrows():
        print("{:<70} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
            row['Institute Name'],
            row['Rank_16'], row['Rank_17'], row['Rank_18'], row['Rank_19'], row['Rank_20'], row['Rank_21']
        ))
    print('-' * 130)


# Main program
display_state_list()
state = input("Enter state: ")
display_city_list(state)
city = input("Enter City: ")
display_institutes_by_city(city)