In [1]:
import os

import pandas as pd
# Location of the combined TOEFL/IELTS admissions CSV.
# Set the ADMITS_DATA_PATH environment variable to point at a copy of the file
# on another machine; when it is unset, the original local path is used.
data_path = os.environ.get(
    'ADMITS_DATA_PATH',
    '/Users/nathanyap/Desktop/DataMining_Project/project/Nathan Findings/TOEFL_IELTS_Combined.csv',
)

# Raw admissions records; later cells only read from this frame.
df_admitsFYI = pd.read_csv(data_path)

## Start by ranking universities by number of records to find the top 5

In [2]:
# Number of records per university, most frequent first.
university_counts = df_admitsFYI.loc[:, 'University'].value_counts()

# Display the counts as the cell output.
university_counts

University of Maryland, College Park           2944
Clemson University                             2350
Carnegie Mellon University                     2120
Georgia Institute of Technology                1878
State University of New York, Buffalo          1736
                                               ... 
South Dakota School of Mines and Technology       4
Jacobs University Bremen                          4
Johnson and Wales University, Providence          2
Hult International Business School, Boston        1
Lewis University                                  1
Name: University, Length: 75, dtype: int64

### The five universities with the most records are:
1. University of Maryland, College Park
2. Clemson University
3. Carnegie Mellon University
4. Georgia Institute of Technology
5. State University of New York, Buffalo

In [5]:
# Names of the five most frequent universities: `university_counts` is already
# ordered by descending frequency, so its first five index labels are the top five.
top_universities = university_counts.index[:5].tolist()

# Keep only the rows that belong to one of those universities.
in_top_universities = df_admitsFYI['University'].isin(top_universities)
filtered_data = df_admitsFYI.loc[in_top_universities]

filtered_data

Unnamed: 0,University,Status,Target Major,GRE Verbal,GRE Quantitative,GRE Writing,GRE Total,UG College,UG Major,GPA,Papers,Work Exp,Season,Year,TOEFL/IELTS
781,Carnegie Mellon University,0,Aerospace Engineering,162.0,152.0,3.5,314.0,Amrita Coimbatore,Mechanical Engineering,3.2000,0.0,0,Fall,2013,8.2
782,Carnegie Mellon University,1,Architecture,160.0,150.0,3.0,310.0,NIT Bhopal,Architecture,3.3200,0.0,0,Fall,2020,8.3
783,Carnegie Mellon University,1,Architecture,154.0,166.0,5.0,320.0,NIT Trichy,Architecture,3.0400,0.0,0,Fall,2019,9.2
784,Carnegie Mellon University,1,Architecture,163.0,153.0,3.0,316.0,JADAVPUR UNIVERSITY,Architecture,3.5440,0.0,30,Fall,2020,8.9
785,Carnegie Mellon University,1,Architecture,156.0,149.0,3.5,305.0,Gujarat Technological University,Architecture,3.7040,0.0,6,Fall,2020,7.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32689,Georgia Institute of Technology,1,Telecommunication Engineering,164.0,161.0,4.0,325.0,K. J. Somaiya College of Engineering,Electronics and Telecommunication,2.6192,0.0,0,Fall,2014,9.7
32690,Georgia Institute of Technology,1,Telecommunication Engineering,161.0,157.0,4.0,318.0,University of Mumbai,Electronics and Telecommunication,3.0400,0.0,0,Fall,2013,9.2
32691,Georgia Institute of Technology,0,Telecommunication Engineering,164.0,154.0,3.5,318.0,University of Mumbai,Electronics and Telecommunication,3.0136,0.0,0,Fall,2012,8.2
32692,Georgia Institute of Technology,0,Telecommunication Engineering,166.0,156.0,3.0,322.0,UNIVERSITY OF MUMBAI,Electronics and Telecommunication,2.6800,0.0,0,Fall,2012,8.9


In [8]:
# Sanity check: show the shape of the filtered frame next to the distinct
# university names it contains (there should be exactly five of them).
(filtered_data.shape, pd.unique(filtered_data['University']))

((11028, 15),
 array(['Carnegie Mellon University',
        'University of Maryland, College Park', 'Clemson University',
        'State University of New York, Buffalo',
        'Georgia Institute of Technology'], dtype=object))

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Per-university evaluation metrics, keyed by university name.
results = {}

# Predictor columns and the label column; later cells reuse both names.
features = ['GPA', 'GRE Total', 'TOEFL/IELTS', 'Work Exp', 'Papers']
target = 'Status'

# NOTE(review): the split below is not stratified and rows are not de-duplicated
# before splitting. If the CSV contains repeated rows, the same record can land in
# both the training and the test set and inflate these scores -- confirm with
# `filtered_data.duplicated(subset=features + [target]).sum()` before relying on them.

# Train and evaluate one decision tree per top-5 university.
for university in top_universities:
    # Restrict the data to the current university.
    uni_data = filtered_data[filtered_data['University'] == university]

    # Separate predictors from the label.
    X = uni_data[features]
    y = uni_data[target]

    # 70/30 hold-out split; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit the tree on the training part and predict the held-out part.
    dt_model = DecisionTreeClassifier(random_state=42)
    dt_model.fit(X_train, y_train)
    y_pred = dt_model.predict(X_test)

    # average='binary' scores a single positive class (label 1 by scikit-learn's default).
    results[university] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='binary'),
        'Recall': recall_score(y_test, y_pred, average='binary'),
    }

# Report every metric as a percentage with exactly two decimals, the same
# formatting the per-major report further down uses.
for university, metrics in results.items():
    print(university)
    for metric, value in metrics.items():
        print(f"{metric} is {value * 100:.2f}%")
    print()

University of Maryland, College Park
Accuracy is 95.81%
Precision is 97.19%
Recall is 96.71%

Clemson University
Accuracy is 98.58%
Precision is 98.47%
Recall is 98.97%

Carnegie Mellon University
Accuracy is 86.95%
Precision is 92.26%
Recall is 82.04%

Georgia Institute of Technology
Accuracy is 87.23%
Precision is 79.41%
Recall is 78.49%

State University of New York, Buffalo
Accuracy is 91.75%
Precision is 93.67%
Recall is 95.77%



In [22]:
# For each of the top-5 universities, count the rows per target major and keep
# the five most frequent majors (a Series mapping major -> row count).
top_majors_per_university = {
    uni_name: (
        filtered_data
        .loc[filtered_data['University'] == uni_name, 'Target Major']
        .value_counts()
        .head(5)
    )
    for uni_name in top_universities
}

# Print one block per university, one bullet per major, then a blank line.
for uni_name, major_tally in top_majors_per_university.items():
    print(f"{uni_name}:")
    for major_name, n_rows in major_tally.items():
        print(f"- {major_name}: {n_rows} instances")
    print()

University of Maryland, College Park:
- Telecommunications Engineering: 300 instances
- Mechanical Engineering: 300 instances
- MIS: 292 instances
- Robotics: 280 instances
- Software Engineering: 260 instances

Clemson University:
- Civil Engineering: 312 instances
- Computer Science: 304 instances
- Mechanical Engineering: 296 instances
- Industrial Engineering: 296 instances
- Electrical Engineering: 292 instances

Carnegie Mellon University:
- Chemical Engineering: 156 instances
- Data Science: 154 instances
- Cyber Security: 148 instances
- Mechanical Engineering: 146 instances
- Robotics: 146 instances

Georgia Institute of Technology:
- Cyber Security: 166 instances
- Aerospace Engineering: 152 instances
- Industrial Engineering: 152 instances
- EECS: 150 instances
- Civil Engineering: 150 instances

State University of New York, Buffalo:
- Robotics: 170 instances
- Civil Engineering: 152 instances
- Data Science: 150 instances
- Computer Science: 148 instances
- MIS: 146 instan

In [24]:
# Minimum number of rows a (university, major) subset needs before it is split
# and modelled; smaller subsets are reported as "N/A".
MIN_SCENARIO_ROWS = 10

# Evaluation metrics per "<university> - <major>" scenario.
scenario_results = {}

# Fitted model per scenario, under the same keys as `scenario_results`, so a
# specific model can be selected by name afterwards instead of relying on
# whichever model the loop happened to fit last.
scenario_models = {}


def format_pct(score):
    """Render a 0-1 score as a percentage with two decimals, or "N/A" when it is None."""
    return "N/A" if score is None else f"{score * 100:.2f}%"


# Train and evaluate one decision tree per (university, top-5 major) pair.
for university, majors in top_majors_per_university.items():
    for major in majors.index:
        scenario_key = f"{university} - {major}"

        # Rows for this university AND this target major.
        scenario_mask = (
            (filtered_data['University'] == university)
            & (filtered_data['Target Major'] == major)
        )
        scenario_data = filtered_data[scenario_mask]

        if len(scenario_data) >= MIN_SCENARIO_ROWS:
            # Separate predictors from the label.
            X = scenario_data[features]
            y = scenario_data[target]

            # 70/30 hold-out split; the fixed seed keeps the split reproducible.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

            # `dt_model` stays bound to the most recently fitted tree, as before;
            # every fitted tree is additionally kept in `scenario_models`.
            dt_model = DecisionTreeClassifier(random_state=42)
            dt_model.fit(X_train, y_train)
            scenario_models[scenario_key] = dt_model

            y_pred = dt_model.predict(X_test)

            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='binary')
            recall = recall_score(y_test, y_pred, average='binary')
        else:
            # Too few rows to split: record the scenario without scores.
            accuracy, precision, recall = None, None, None

        scenario_results[scenario_key] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall
        }

# Print the scores as percentages with two decimals ("N/A" where no model was fitted).
for scenario, metrics in scenario_results.items():
    print(f"{scenario}:")
    print(f"  Accuracy: {format_pct(metrics['Accuracy'])}")
    print(f"  Precision: {format_pct(metrics['Precision'])}")
    print(f"  Recall: {format_pct(metrics['Recall'])}")
    print()

University of Maryland, College Park - Telecommunications Engineering:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

University of Maryland, College Park - Mechanical Engineering:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

University of Maryland, College Park - MIS:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

University of Maryland, College Park - Robotics:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

University of Maryland, College Park - Software Engineering:
  Accuracy: 94.87%
  Precision: 84.62%
  Recall: 100.00%

Clemson University - Civil Engineering:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

Clemson University - Computer Science:
  Accuracy: 95.65%
  Precision: 100.00%
  Recall: 93.55%

Clemson University - Mechanical Engineering:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

Clemson University - Industrial Engineering:
  Accuracy: 100.00%
  Precision: 100.00%
  Recall: 100.00%

Clemson

In [25]:
import joblib
# Persist the fitted decision tree to disk.
# NOTE(review): `dt_model` is whatever tree the preceding training loop fitted
# last, i.e. a model for one single university/major subset, not a general
# model as the file name may suggest -- confirm this is the model meant to be saved.
joblib.dump(value=dt_model, filename='dt_model_university_major.joblib')

['dt_model_university_major.joblib']

In [29]:
# Reload the persisted decision tree from disk.
loaded_model = joblib.load('dt_model_university_major.joblib')

# One hypothetical applicant; the keys follow the column order the model was trained on.
student_record = {
    'GPA': 3.5,
    'GRE Total': 320,
    'TOEFL/IELTS': 7.8,
    'Work Exp': 10,
    'Papers': 1,
}
student_df = pd.DataFrame([student_record])

# Predicted `Status` label for that applicant (array with a single element).
prediction = loaded_model.predict(student_df)

In [30]:
# Display the predicted `Status` label for the example student.
# NOTE(review): presumably 1 means admitted and 0 rejected -- confirm against
# the dataset's `Status` encoding.
prediction

array([1])