In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder


In [2]:
df = pd.read_csv("college_data.csv")
df

Unnamed: 0,percentile,branch,gender,category,secondary_seat_type,score_type,city,college_name
0,90.121473,civil engineering,m,nt 2 (nt-c),state level seats,mht-cet,amravati,"government college of engineering, amravati"
1,89.889223,civil engineering,f,sc,state level seats,mht-cet,amravati,"government college of engineering, amravati"
2,89.540152,civil engineering,m,obc,state level seats,mht-cet,amravati,"government college of engineering, amravati"
3,88.241971,civil engineering,m,obc,state level seats,mht-cet,amravati,"government college of engineering, amravati"
4,88.091617,civil engineering,m,dt/vj,state level seats,mht-cet,amravati,"government college of engineering, amravati"
...,...,...,...,...,...,...,...,...
104281,48.939024,computer science and engineering (artificial i...,f,open,economically weaker section seats,mht-cet,solapur,shree siddheshwar women's college of engineeri...
104282,35.659886,computer science and engineering (artificial i...,f,open,economically weaker section seats,mht-cet,solapur,shree siddheshwar women's college of engineeri...
104283,29.937274,computer science and engineering (artificial i...,f,open,economically weaker section seats,mht-cet,solapur,shree siddheshwar women's college of engineeri...
104284,29.937274,computer science and engineering (artificial i...,f,open,economically weaker section seats,mht-cet,solapur,shree siddheshwar women's college of engineeri...


In [3]:
label_encoder = LabelEncoder()

### Encode Categorical features and target variable

In [4]:
encoded_features = {}

for feature in ['branch', 'gender', 'category', 'secondary_seat_type', 'score_type', 'city'] :
    df[feature] = label_encoder.fit_transform(df[feature])
    #storing the mappings in a dictionary
    encoded_features[feature] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [5]:
encoded_features

{'branch': {'aeronautical engineering': 0,
  'agricultural engineering': 1,
  'agriculture engineering': 2,
  'artificial intelligence': 3,
  'artificial intelligence (ai) and data science': 4,
  'artificial intelligence and data science': 5,
  'artificial intelligence and data science university , jalgaon': 6,
  'artificial intelligence and machine learning': 7,
  'automation and robotics': 8,
  'automobile engineering': 9,
  'automotive technology': 10,
  'bio medical engineering': 11,
  'bio technology': 12,
  'chemical engineering': 13,
  'civil and environmental engineering': 14,
  'civil and infrastructure engineering': 15,
  'civil engineering': 16,
  'computer engineering': 17,
  'computer engineering (regional language)': 18,
  'computer science and business systems': 19,
  'computer science and design': 20,
  'computer science and engineering': 21,
  'computer science and engineering (artificial intelligence and data science)': 22,
  'computer science and engineering (artific

In [6]:
#encode target variable
df['college_name'] = label_encoder.fit_transform(df['college_name'])

#storing the mappings in a dictionary
encoded_features['college_name'] = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [7]:
encoded_features

{'branch': {'aeronautical engineering': 0,
  'agricultural engineering': 1,
  'agriculture engineering': 2,
  'artificial intelligence': 3,
  'artificial intelligence (ai) and data science': 4,
  'artificial intelligence and data science': 5,
  'artificial intelligence and data science university , jalgaon': 6,
  'artificial intelligence and machine learning': 7,
  'automation and robotics': 8,
  'automobile engineering': 9,
  'automotive technology': 10,
  'bio medical engineering': 11,
  'bio technology': 12,
  'chemical engineering': 13,
  'civil and environmental engineering': 14,
  'civil and infrastructure engineering': 15,
  'civil engineering': 16,
  'computer engineering': 17,
  'computer engineering (regional language)': 18,
  'computer science and business systems': 19,
  'computer science and design': 20,
  'computer science and engineering': 21,
  'computer science and engineering (artificial intelligence and data science)': 22,
  'computer science and engineering (artific

In [8]:
#Define feature and target variable

X = df.drop('college_name', axis=1)
y = df['college_name']

In [9]:
#Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
#Initializing Decision Tree Classifier

decision_tree_classifier = DecisionTreeClassifier(random_state=42)

In [11]:
#Fit model to the training data

decision_tree_classifier.fit(X_train, y_train)

In [12]:
#Make Predictions

predictions = decision_tree_classifier.predict(X_test)

In [13]:
#Evaluate the model 

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print(f"Decision Tree Accuracy : {accuracy*100}")

Decision Tree Accuracy : 58.76402339629878


In [14]:
def encode_input_data(input_data, mappings):
    encoded_data = input_data.copy()
    for feature in mappings:
        if feature in encoded_data:
            encoded_data[feature] = encoded_data[feature].map(mappings[feature])
    return encoded_data

In [15]:
input_data = pd.DataFrame({
    'percentile': [99],
    'branch': ['computer science and engineering'],
    'gender': ['m'],
    'category': ['open'],
    'secondary_seat_type': ['all india seats allotted to all india candidature candidates with jee(main) score'],
    'score_type': ['mht-cet'],
    'city': ['pune']
})

In [17]:
encoded_input = encode_input_data(input_data, encoded_features)
encoded_input

Unnamed: 0,percentile,branch,gender,category,secondary_seat_type,score_type,city
0,99,21,1,47,0,2,65


In [18]:
#Make predictions for the encoded_input_data using trained Decision Tree Model
encoded_predictions = decision_tree_classifier.predict(encoded_input)
encoded_predictions

array([203])

In [19]:
#'encoded_predictions' now contains the predicted college labels in their encoded form.
#To decode the predicted labels to their original names, you can use the 'encoded_features' dictionary.

decoded_predictions = [list(encoded_features['college_name'].keys())[list(encoded_features['college_name'].values()).index(pred)] for pred in encoded_predictions]
decoded_predictions

['pune institute of computer technology, dhankavdi, pune']

In [20]:
print(f"Predicted College Name : {decoded_predictions}")

Predicted College Name : ['pune institute of computer technology, dhankavdi, pune']


In [21]:
import pickle

#save the encoded_features dictionary to a file
with open('encoded_features.pkl', 'wb') as file:
    pickle.dump(encoded_features, file)


#save the trained Decision Tree Classifier to a file
with open('decision_tree_classifier.pkl', 'wb') as file:
    pickle.dump(decision_tree_classifier, file)

In [6]:
df['branch'].unique().asc()

AttributeError: 'numpy.ndarray' object has no attribute 'asc'