<H1>DATA SCIENCE PROJECT</H1>

<h2>Dataset-MAX</h2>

<H2>Importing Modules</H2>

In [None]:
import warnings

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    mean_squared_error,
    precision_recall_fscore_support,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, OneClassSVM
from sklearn.tree import DecisionTreeClassifier
from wordcloud import WordCloud
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Download the Kaggle dataset; the value returned by kagglehub is reported
# below as the location of the dataset files.
DATASET_HANDLE = "breadnbu22er/or-cr-2016-to-2024"
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
# Read the master CSV file from the working directory.
data_master = pd.read_csv('DataSet.csv')

# A notebook cell only renders its LAST expression automatically, so a bare
# `data_master.describe()` in the middle of the cell would be computed and then
# silently discarded. Render the summary explicitly, then show the first rows.
display(data_master.describe())
data_master.head()

<h2>Data Preprocessing</h2>

<h3>Replacing Missing Values with Max Values</h3>

In [None]:
# Load the CSV whose missing ranks are filled with column maxima further down,
# and show summary statistics for it.
data_max = pd.read_csv("DataSet-max.csv")
data_max.describe()

In [None]:
# Count the null entries in each column before any imputation takes place.
missing_per_column = data_max.isnull().sum()
print("Missing values in dataset:\n", missing_per_column)

In [None]:
# Convert both rank columns to numeric; errors='coerce' turns any value that
# cannot be parsed as a number into NaN so it is picked up by the imputation.
data_max['Closing_Rank'] = pd.to_numeric(data_max['Closing_Rank'], errors='coerce')
data_max['Opening_Rank'] = pd.to_numeric(data_max['Opening_Rank'], errors='coerce')

# Column maxima (NaN is skipped by .max()) used as the fill values.
max_closing_rank = data_max['Closing_Rank'].max()
max_opening_rank = data_max['Opening_Rank'].max()

# Assign the filled column back instead of calling
# `data_max[col].fillna(..., inplace=True)`: that form is chained assignment on
# an intermediate Series, which raises a FutureWarning on pandas 2.x and leaves
# `data_max` untouched once copy-on-write is enabled.
data_max['Closing_Rank'] = data_max['Closing_Rank'].fillna(max_closing_rank)
data_max['Opening_Rank'] = data_max['Opening_Rank'].fillna(max_opening_rank)

# Confirm the imputation and show the resulting column types.
print("Missing values after Imputation:\n", data_max.isnull().sum())
data_max.dtypes

<h1>Data Visualization</h1>

In [None]:
# Histogram of the number of rows per institute, one pastel colour per institute.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(
    data=data_max,
    x='Institute',
    hue='Institute',
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.xticks(rotation=90)  # institute names are long; rotate them to avoid overlap
ax.set_xlabel('Institute Name')
ax.set_ylabel('Total seats')
ax.set_title('Seat Distribution among Institutes');  # trailing ';' hides the text repr

In [None]:
# Mean opening and closing rank per institute.
# The original cell read from `iit_df`, a name that is never defined in this
# notebook, so it failed with NameError on a fresh kernel. `data_max` carries
# the 'Institute', 'Opening_Rank' and 'Closing_Rank' columns and is used instead.
avg_ranks_per_institute = (
    data_max
    .groupby('Institute')[['Opening_Rank', 'Closing_Rank']]
    .mean()
    .reset_index()
)

# Plotting institute-wise average ranks: the closing-rank bars are drawn
# semi-transparently on top of the opening-rank bars on the same axes.
plt.figure(figsize=(12, 6))
sns.barplot(data=avg_ranks_per_institute, x='Institute', y='Opening_Rank', color='skyblue', label='Opening Rank')
sns.barplot(data=avg_ranks_per_institute, x='Institute', y='Closing_Rank', color='lightgreen', alpha=0.6, label='Closing Rank')
plt.xticks(rotation=90)
plt.title('Institute-Wise Average Opening and Closing Ranks')
plt.xlabel('Institute')
plt.ylabel('Average Rank')
plt.legend()
plt.show()

In [None]:
# Closing rank over the years, one line per gender category.
# The original cell plotted `iit_df`, which is never defined in this notebook
# (NameError on a fresh kernel); `data_max` is used instead.
# NOTE(review): this assumes data_max has a 'Year' column -- confirm against the CSV.
plt.figure(figsize=(12, 6))
sns.lineplot(x='Year', y='Closing_Rank', hue='Gender', data=data_max, marker='s')
plt.title('Female vs Male Rank Distribution Over the Years')
plt.xlabel('Year')
plt.ylabel('Closing Rank')
plt.legend(title='Gender')
plt.show()

<h2>Label Encoding</h2>

In [None]:

# Replace each categorical column with integer codes and keep the fitted
# encoder per column so codes can be translated back to labels later.
label_encoders = {}
for categorical_col in ('Institute', 'Quota', 'Gender'):
    encoder = LabelEncoder()
    encoder.fit(data_max[categorical_col])
    data_max[categorical_col] = encoder.transform(data_max[categorical_col])
    label_encoders[categorical_col] = encoder

In [None]:
# Single rank feature: the midpoint between the opening and closing rank.
data_max['Rank'] = data_max['Opening_Rank'].add(data_max['Closing_Rank']).div(2)

In [None]:
# Feature matrix and target vector for the institute classifiers.
feature_columns = ['Rank', 'Quota', 'Gender']
target_column = 'Institute'

X = data_max[feature_columns]
y = data_max[target_column]

<H2>Splitting the Dataset into Training and Testing Data</H2>

In [None]:
# Keep 30% of the rows aside for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

<H2>Defining Models</H2>

In [None]:
# One estimator per algorithm under comparison. random_state is pinned on the
# estimators that are constructed with one.
classifier = RandomForestClassifier(random_state=1)
decision_tree = DecisionTreeClassifier(random_state=1)
svm_classifier = SVC(kernel='linear')
knn_classifier = KNeighborsClassifier(n_neighbors=5)
nb_classifier = GaussianNB()
nn_classifier = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    max_iter=300,
    random_state=1,
)

<h2>Training Random Forest Classification Model</h2>

In [None]:
# Fit the random forest on the training split and score it on the held-out split.
classifier.fit(X_train, y_train)
y_pred_classifier = classifier.predict(X_test)
classifier_accuracy = accuracy_score(y_test, y_pred_classifier)

# accuracy_score returns a fraction in [0, 1]. It was previously printed as
# accuracy * 10, which is neither a fraction nor a percentage and could not be
# compared with the SVM / Naive Bayes / neural-network cells that print the raw
# value. Report the raw fraction here as well.
print(f"Classification Model Accuracy: {classifier_accuracy}")

<h2>Training Decision Tree Model</h2>

In [None]:
# Fit the decision tree on the training split and score it on the held-out split.
decision_tree.fit(X_train, y_train)
y_pred_decision_tree = decision_tree.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, y_pred_decision_tree)

# accuracy_score returns a fraction in [0, 1]. It was previously printed as
# accuracy * 10, a scale that matches neither a fraction nor a percentage and
# differs from the cells that print the raw value. Report the raw fraction.
print(f"Decision Tree Classifier Accuracy: {decision_tree_accuracy}")

<H2>Training SVM</H2>

In [None]:
sample_limit = 500

# Start from the full training split and replace it with a stratified
# subsample of `sample_limit` rows only when the split is larger than that.
X_train_sampled, y_train_sampled = X_train, y_train
if len(X_train) > sample_limit:
    X_train_sampled, _, y_train_sampled, _ = train_test_split(
        X_train,
        y_train,
        train_size=sample_limit,
        stratify=y_train,
        random_state=42,
    )

# Train on the (possibly reduced) data, evaluate on the complete test split.
svm_classifier.fit(X_train_sampled, y_train_sampled)
y_pred_svm = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Classifier Accuracy: {svm_accuracy}")

<h2>Training KNN Model</h2>

In [None]:
# Fit the k-nearest-neighbours model on the training split and score it on the
# held-out split.
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)

# accuracy_score returns a fraction in [0, 1]. It was previously printed as
# accuracy * 10, a scale that matches neither a fraction nor a percentage and
# differs from the cells that print the raw value. Report the raw fraction.
print(f"KNN Classifier Accuracy: {knn_accuracy}")

<h2>Naïve Bayes Classifier</h2>

In [None]:
# Gaussian Naive Bayes: fit() returns the estimator itself, so training and
# prediction on the test split are chained.
y_pred_nb = nb_classifier.fit(X_train, y_train).predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Classifier Accuracy: {nb_accuracy}")

<h2>Neural Network Based Classifier</h2>

In [None]:
# The network is trained on X_train_sampled / y_train_sampled (the training
# data prepared in the SVM cell) and evaluated on the complete test split.
y_pred_nn = nn_classifier.fit(X_train_sampled, y_train_sampled).predict(X_test)
nn_accuracy = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Classifier Accuracy: {nn_accuracy}")

<h2>One Class Classifier</h2>

In [None]:

# Encoded Institute label whose rows are treated as the "normal" class.
inlier_class = 3

# Keep only the training rows that belong to the chosen class.
inlier_mask = y_train == inlier_class
X_train_inliers = X_train[inlier_mask]

# Fit the One-Class SVM on those rows alone.
ocsvm_classifier = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
ocsvm_classifier.fit(X_train_inliers)

# Score every test row: predict() gives 1 for inliers and -1 for outliers.
ocsvm_predictions = ocsvm_classifier.predict(X_test)

# Map {1, -1} to {1, 0} so predictions line up with the binary ground truth.
ocsvm_binary_predictions = (ocsvm_predictions == 1).astype(int)
ocsvm_true_labels = (y_test == inlier_class).astype(int)

print("One-Class SVM Classification Report:")
print(classification_report(ocsvm_true_labels, ocsvm_binary_predictions))

<h2> A Function to Fetch the Results</h2>

In [None]:
def predict_institute(rank, quota, gender):
    """Predict an institute for a rank, quota and gender with the decision tree.

    Parameters
    ----------
    rank : number
        Value placed in the 'Rank' feature column.
    quota : str
        Quota label; converted to its code with ``label_encoders['Quota']``.
    gender : str
        Gender label; converted to its code with ``label_encoders['Gender']``.

    Returns
    -------
    The institute label obtained by passing the model's predicted code through
    ``label_encoders['Institute'].inverse_transform``.
    """
    # Translate the human-readable inputs into the codes the model was trained on.
    quota_code = label_encoders['Quota'].transform([quota])[0]
    gender_code = label_encoders['Gender'].transform([gender])[0]

    # Single-row frame with the same column names as the training features.
    feature_names = ['Rank', 'Quota', 'Gender']
    input_data = pd.DataFrame([[rank, quota_code, gender_code]], columns=feature_names)

    predicted_code = decision_tree.predict(input_data)[0]
    return label_encoders['Institute'].inverse_transform([predicted_code])[0]

In [None]:
# Example query: rank 60, 'AI' quota, 'Gender-Neutral' seat.
rank, quota, gender = 60, 'AI', 'Gender-Neutral'

predicted_institute = predict_institute(rank=rank, quota=quota, gender=gender)
print(f"Predicted College: {predicted_institute}")

<h1>Accuracy Scores</h1>

In [None]:
# Recompute each accuracy from the stored test-set predictions so this summary
# does not depend on values left over from earlier cells.
classifier_accuracy = accuracy_score(y_test, y_pred_classifier)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
nn_accuracy = accuracy_score(y_test, y_pred_nn)
decision_tree_accuracy = accuracy_score(y_test, y_pred_decision_tree)

# Every accuracy is printed on the same scale (fraction in [0, 1]). Previously
# four of the six values were multiplied by 10 while the other two were not,
# which made the side-by-side comparison meaningless.
accuracy_by_model = {
    "Random Forest": classifier_accuracy,
    "KNN": knn_accuracy,
    "Naive Bayes": nb_accuracy,
    "SVM": svm_accuracy,
    "Neural Network": nn_accuracy,
    "Decision Tree": decision_tree_accuracy,
}
for model_name, model_accuracy in accuracy_by_model.items():
    print(f"{model_name} Classifier Accuracy: {model_accuracy:.4f}")

# The One-Class SVM is evaluated as a binary problem: "is this row the inlier
# class or not", so it gets a classification report instead of one accuracy.
ocsvm_report = classification_report((y_test == inlier_class).astype(int), ocsvm_binary_predictions)
print("One-Class SVM Report:")
print(ocsvm_report)

<h2>Calculating the Precision, Recall, and F1 Scores</h2>

In [None]:
# Per-class precision / recall / F1 for every multi-class model.
# The original cell referenced `y_pred_rf` and `y_pred_dt`, which do not exist;
# the training cells store those predictions as `y_pred_classifier` and
# `y_pred_decision_tree`. `classification_report` is already imported in the
# notebook's import cell, so it is not re-imported here.
predictions_by_model = {
    "Random Forest": y_pred_classifier,
    "SVM": y_pred_svm,
    "KNN": y_pred_knn,
    "Naive Bayes": y_pred_nb,
    "Decision Tree": y_pred_decision_tree,
    "Neural Network": y_pred_nn,
}
for model_name, model_predictions in predictions_by_model.items():
    print(f"{model_name} Classifier Metrics:")
    print(classification_report(y_test, model_predictions))

# The One-Class SVM is scored against the binary "is inlier class" target.
print("One-Class SVM Metrics:")
print(classification_report((y_test == inlier_class).astype(int), ocsvm_binary_predictions))


<h3>Summary Table of Precision, Recall, and F1 Scores</h3>

In [None]:

# Build the summary table from the actual predictions instead of hand-typed
# numbers: the previous literals were not tied to any model run and disagreed
# with the literals used by the plotting cell.
multiclass_predictions = {
    'Random Forest': y_pred_classifier,
    'SVM': y_pred_svm,
    'KNN': y_pred_knn,
    'Naive Bayes': y_pred_nb,
    'Decision Tree': y_pred_decision_tree,
    'Neural Network': y_pred_nn,
}

# (name, true labels, predicted labels) for every model; the One-Class SVM is
# scored against the binary "is inlier class" target.
evaluation_cases = [
    (model_name, y_test, model_predictions)
    for model_name, model_predictions in multiclass_predictions.items()
]
evaluation_cases.append(
    ('One-Class SVM', (y_test == inlier_class).astype(int), ocsvm_binary_predictions)
)

classifiers, precision, recall, f1_score = [], [], [], []
for model_name, true_labels, predicted_labels in evaluation_cases:
    # Weighted average = the "weighted avg" row of classification_report;
    # zero_division=0 scores classes that were never predicted as 0.
    model_precision, model_recall, model_f1 = precision_recall_fscore_support(
        true_labels, predicted_labels, average='weighted', zero_division=0
    )[:3]
    classifiers.append(model_name)
    precision.append(round(float(model_precision), 2))
    recall.append(round(float(model_recall), 2))
    f1_score.append(round(float(model_f1), 2))

results_df = pd.DataFrame({
    'Classifier': classifiers,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1_score
})

print(results_df)


<h2>Plotting Precision, Recall, and F1 Scores</h2>

In [None]:
# Data: read from results_df (built in the previous cell) so the chart always
# shows the same numbers as the printed table. The cell used to carry its own
# hard-coded lists, and they did not match the table's values.
# matplotlib / numpy are already imported in the notebook's import cell.
classifiers = results_df['Classifier'].tolist()
precision = results_df['Precision'].tolist()
recall = results_df['Recall'].tolist()
f1_score = results_df['F1 Score'].tolist()

# Bar width and one x position per classifier
bar_width = 0.25
x = np.arange(len(classifiers))

# Create the grouped bar graph: three bars (precision, recall, F1) per classifier
plt.figure(figsize=(10, 6))
plt.bar(x - bar_width, precision, width=bar_width, label='Precision', color='skyblue')
plt.bar(x, recall, width=bar_width, label='Recall', color='lightgreen')
plt.bar(x + bar_width, f1_score, width=bar_width, label='F1 Score', color='salmon')

# Add labels and title
plt.xlabel('Classifiers', fontsize=12)
plt.ylabel('Scores', fontsize=12)
plt.title('Comparison of Classifier Performance, MAX Dataset', fontsize=14)
plt.xticks(x, classifiers, rotation=45)
plt.ylim(0, 1)  # all three metrics are fractions in [0, 1]
plt.legend()
plt.tight_layout()

# Show plot
plt.show()
