In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import explained_variance_score, SCORERS
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from pandas_profiling import ProfileReport
from sklearn.ensemble import IsolationForest
from lightgbm import LGBMRegressor

- Goal: We are trying to predict chance of admission based on GRE score, TOEFL score, university ranking, Statement of Purpose and Letter of Recommendation Strength, Undergraduate GPA and Research Experience.

- Scenario: Students who are graduating high school and have to decide which universities to apply to would be interested in this and would find it useful to know their chances of admission.

- Decision: ML is going to help them decide whether or not they should apply to a university based on the predicted chance.

- Split data into train/test

In [58]:
df = pd.read_csv("Admission_Predict_Ver1.1.csv")
df = df.drop(columns=["Serial No."])
df_train, df_test = train_test_split(df, random_state=123)
df_train.head()



Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
455,305,102,2,1.5,2.5,7.64,0,0.59
384,340,113,4,5.0,5.0,9.74,1,0.96
293,312,98,1,3.5,3.0,8.18,1,0.64
421,321,112,3,3.0,4.5,8.95,1,0.77
374,315,105,2,2.0,2.5,7.65,0,0.39


- Perform exploratory data analysis, including outlier detection. 

In [59]:
df_train.shape

(375, 8)

In [60]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375 entries, 455 to 365
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GRE Score          375 non-null    int64  
 1   TOEFL Score        375 non-null    int64  
 2   University Rating  375 non-null    int64  
 3   SOP                375 non-null    float64
 4   LOR                375 non-null    float64
 5   CGPA               375 non-null    float64
 6   Research           375 non-null    int64  
 7   Chance of Admit    375 non-null    float64
dtypes: float64(4), int64(4)
memory usage: 26.4 KB


ProfileReport(df_train)

- From profiling it looks like the data is uniform.

In [61]:
df_train['Chance of Admit '].value_counts()

0.71    18
0.64    13
0.73    13
0.79    13
0.78    12
        ..
0.51     1
0.43     1
0.37     1
0.39     1
0.38     1
Name: Chance of Admit , Length: 61, dtype: int64

- Target columns appears to be uniform as well.

- Overall it looks like there are no outliers to be dealt with.



- Drop features that would not actually be available during deployment.
- Drop features for other reasons, e.g. they would not be useful for predicting the target.
- Provide a brief justification explaining the features you dropped.

- The column 'Serial No' was dropped. All others features for training would be available during deployment as they are 
all necessary for university applications. 'Serial No' was dropped as it serves no purpose in finding the chances of application.

#### Build a preprocessing pipeline.

In [62]:
passthrough_columns = ['University Rating', 'Research']
numeric_columns = ['GRE Score', 'TOEFL Score', 'SOP', 'LOR ', 'CGPA']
preprocessor = make_column_transformer(
    ('passthrough', passthrough_columns),
    (StandardScaler(), numeric_columns)
)


- Try these models:
    1. `DummyRegressor` or `DummyClassifer`, as appropriate
    2. `Ridge` or `LogisticRegression`, as appropriate
    3. `LGBMRegressor` or `LGBMClassifier`, as appropriate 


In [63]:
X_train = df_train.drop(columns=['Chance of Admit '])
X_test = df_test.drop(columns=['Chance of Admit '])

y_train = df_train['Chance of Admit ']
y_test = df_test['Chance of Admit ']

In [64]:
dr = DummyRegressor(strategy='mean')
dr.fit(X_train, y_train)
dr.score(X_train, y_train)

0.0

- ridge pipeline

In [65]:
ridge_pipe = make_pipeline(preprocessor, Ridge(max_iter=1000))
ridge_pipe.fit(X_train, y_train)
ridge_pipe.score(X_train, y_train)

0.8270029998469806

In [66]:
"""from lecture 10"""
def cross_validate_std(*args, **kwargs):
    """Like cross_validate, except also gives the standard deviation of the score"""
    res = pd.DataFrame(cross_validate(*args, **kwargs))
    res_mean = res.mean()

    res_mean["std_test_score"] = res["test_score"].std()
    if "train_score" in res:
        res_mean["std_train_score"] = res["train_score"].std()
    return res_mean

In [67]:
cross_validate_std(ridge_pipe, X_train, y_train, return_train_score=True)

fit_time           0.009385
score_time         0.003118
test_score         0.809094
train_score        0.827622
std_test_score     0.079190
std_train_score    0.019523
dtype: float64

- LGBMRegressor pipeline

In [68]:
lgbm_pipe = make_pipeline(preprocessor, LGBMRegressor())
lgbm_pipe.fit(X_train, y_train)
lgbm_pipe.score(X_train, y_train)

0.9251163737074904

In [69]:
cross_validate_std(lgbm_pipe, X_train, y_train, return_train_score=True)

fit_time           0.020034
score_time         0.007964
test_score         0.743162
train_score        0.924291
std_test_score     0.100543
std_train_score    0.008037
dtype: float64

#### 2(e)
rubric={points:5}

- Hyperparameter tuning.

- Ridge hyperparameter

In [77]:
param_grid = {
    'ridge__alpha': [0.01, 0.1, 1.0, 10, 100]
}
ridge_grid_search = GridSearchCV(ridge_pipe, param_grid, n_jobs=-1, verbose=2, return_train_score=True)
ridge_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    0.1s finished


GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('passthrough',
                                                                         'passthrough',
                                                                         ['University '
                                                                          'Rating',
                                                                          'Research']),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         ['GRE '
                                                                          'Score',
                                                                          'TOEFL '
                                                                          

In [78]:
ridge_grid_search.best_params_

{'ridge__alpha': 10}

- LGBMRegressor hyperparameter

In [79]:
lgbm_grid = {
    'lgbmregressor__num_leaves': [5, 10, 15, 20, 25],
    'lgbmregressor__max_depth': [5, 15, 30],
    'lgbmregressor__n_estimators': [25, 50, 75, 100]
}
lgbm_grid_search = GridSearchCV(lgbm_pipe, lgbm_grid, n_jobs=-1, verbose=2, return_train_score=True)
lgbm_grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.1s finished


GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('passthrough',
                                                                         'passthrough',
                                                                         ['University '
                                                                          'Rating',
                                                                          'Research']),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         ['GRE '
                                                                          'Score',
                                                                          'TOEFL '
                                                                          

In [80]:
lgbm_grid_search.best_params_

{'lgbmregressor__max_depth': 5,
 'lgbmregressor__n_estimators': 50,
 'lgbmregressor__num_leaves': 5}

- comparison:

In [84]:
ridge_grid_search_columns = [
    'mean_test_score', 'mean_train_score', 'mean_fit_time', 'rank_test_score', 'param_ridge__alpha'
]
pd.DataFrame(ridge_grid_search.cv_results_)[ridge_grid_search_columns].sort_values(by=['rank_test_score'])

Unnamed: 0,mean_test_score,mean_train_score,mean_fit_time,rank_test_score,param_ridge__alpha
3,0.809793,0.826888,0.015627,1,10.0
2,0.809094,0.827622,0.015108,2,1.0
1,0.80893,0.827631,0.016931,3,0.1
0,0.808912,0.827632,0.006251,4,0.01
4,0.798007,0.809817,0.012815,5,100.0


In [85]:
lgbm_grid_search_columns = [
    'mean_test_score', 'mean_train_score', 'mean_fit_time', 'rank_test_score', 'param_lgbmregressor__max_depth', 'param_lgbmregressor__n_estimators', 'param_lgbmregressor__num_leaves'
]
pd.DataFrame(lgbm_grid_search.cv_results_)[lgbm_grid_search_columns].sort_values(by=['rank_test_score']).head()



Unnamed: 0,mean_test_score,mean_train_score,mean_fit_time,rank_test_score,param_lgbmregressor__max_depth,param_lgbmregressor__n_estimators,param_lgbmregressor__num_leaves
5,0.778603,0.867774,0.015724,1,5,50,5
45,0.778603,0.867774,0.016118,1,30,50,5
25,0.778603,0.867774,0.014087,1,15,50,5
0,0.776068,0.84434,0.018229,4,5,25,5
20,0.776068,0.84434,0.019319,4,15,25,5


- though LGBMRegressor has a higher mean train score for the hyper-parameters, Ridge has a better mean test score and a smaller difference between the mean train and mean test scores.

- Look at the sub-scores from the different folds of cross-validation.

In [86]:
alpha_ridge_pipe = make_pipeline(preprocessor, Ridge(alpha=10))
cross_validate_std(alpha_ridge_pipe, X_train, y_train, return_train_score=True)

fit_time           0.009383
score_time         0.000000
test_score         0.809793
train_score        0.826888
std_test_score     0.077049
std_train_score    0.019464
dtype: float64

- The train and cross validation scores are very close and the standard deviations for the test and train score are also very low.
Given this we can trust the scores.

In [87]:
alpha_ridge_pipe.fit(X_train, y_train)
explained_variance_score(y_train, alpha_ridge_pipe.predict(X_train))

0.8265163700272344

In [88]:
cross_val_score(alpha_ridge_pipe, X_train, y_train, scoring='explained_variance').mean()

0.8125502871224197

- The explained variance train and cross validation score are also very close.

- Evaluate model on the test set assessment of your model's deployment performance.

In [89]:
alpha_ridge_pipe.score(X_test, y_test)

0.8012355846266312

In [90]:
coefficients = pd.DataFrame(data=alpha_ridge_pipe[1].coef_, index=X_train.columns, columns=["Coefficient"])
coefficients.sort_values(by="Coefficient", ascending=False)

Unnamed: 0,Coefficient
Research,0.06666
University Rating,0.027087
TOEFL Score,0.022123
SOP,0.019784
CGPA,0.014933
GRE Score,0.004265
LOR,0.000519


- The test score is very close to the train and cross validation scores, so we can trust this score.

- the feature coefficients also make sense as research experience and university rating are being marked as the most important features and then the test scores come next.

- Though the scores are very close and reliable, the dataset was very small and there only seven features were used to train the model.
If the model is used in a similarly simplistic manner, it should give accurate deployment performance, however real-world decision making should not solely rely on the model's output.