## Contact Codes

This notebook tries to predict the likelihood of enrollment based solely on a student's previous contacts with the college.
These contacts include things such as: 
* campus visits
* meeting at a college fair
* talking with a recruiter at their high school

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('fivethirtyeight') 

import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.insert(0, '../src/visualization/')
import visualize as vis

from tqdm import tqdm_notebook

Load Contact Codes Data Sets and the Application Data Set.

The Contact Codes Data Set was altered such that all date-like attributes are recorded solely by the total days since 0 A.D.  Both files are included in the analysis.

Fields of Interest:
* Admission_application_date
* Admission_date
* Enrolled?
* Unique_student_ID

In [None]:
# Contact-code cross-tabs: one file with the original date values and one in
# which the date-like fields are stored as day counts.
codes_dates = pd.read_csv('../data/processed/Qry_Contacts_xtab.csv')
codes_days = pd.read_csv('../data/processed/Qry_Contacts_xtab_days.csv')

# Application data: discard the saved index column, then keep only the fields
# this notebook works with.
application_fields = [
    "Unique_student_ID",
    "Enrolled",
    "Admission_application_date",
    "WeightatAcpt",
    "TotalWeight",
]
df = pd.read_csv('../data/processed/CriticalPath_Data_EM_Confidential_lessNoise.csv')
df = df.drop(columns='Unnamed: 0')
df = df[application_fields]

# Parse the application date so the .dt accessors are available later on.
df['Admission_application_date'] = pd.to_datetime(df['Admission_application_date'])

# Days that precede the first of each month in a non-leap year
# (January -> 0, February -> 31, March -> 59, ...).
months_to_days = {
    1: 0, 2: 31, 3: 59, 4: 90, 5: 120, 6: 151,
    7: 181, 8: 212, 9: 243, 10: 273, 11: 304, 12: 334,
}

# Collapse the application date into one day count: 365 days per year, plus the
# days before the month, plus the day of the month. Leap days are not counted.
# NOTE(review): presumably this matches how the *_days contact file was encoded — confirm.
application_date = df['Admission_application_date']
df['Admission_application_date_asDays'] = (
    application_date.dt.year * 365
    + application_date.dt.month.map(months_to_days)
    + application_date.dt.day
)

Take only the students that are in the `CriticalPath_Data` file.

In [None]:
# Keep only the contact rows whose student also appears in the application data.
# The membership test is evaluated on codes_days itself, so the boolean mask
# lines up row-for-row with the frame being filtered (a mask built on df's rows
# would be matched by index label, i.e. by row position, not by student).
# Rows without a student ID are dropped as well.
student_ids = codes_days['Unique_student_ID']
codes_days = codes_days[student_ids.notna() & student_ids.isin(df['Unique_student_ID'])]

Merge the files together such that whether or not a student enrolls is linkable to all of the contact codes.

In [None]:
# Attach the application fields to every contact row (contact rows are all kept).
data = codes_days.merge(df, how='left', on="Unique_student_ID")

# Blank out every contact value that is not strictly earlier than the
# application day, so only contacts made before applying remain.
application_day = data['Admission_application_date_asDays']
contact_columns = [name for name in codes_days.columns.values if name != 'Unique_student_ID']
for contact in contact_columns:
    data[contact] = data[contact].where(data[contact] < application_day)

## Create a shadow matrix of the data.

In [None]:
# Indicator ("shadow") matrix of the contact columns: 1 where a value is
# present, 0 where it is missing. The identifier, date, target and weight
# columns are left out.
non_contact_columns = [
    "Unique_student_ID",
    "Admission_application_date",
    "Enrolled",
    "WeightatAcpt",
    "Admission_application_date_asDays",
    "TotalWeight",
]
shadow_matrix = data.drop(columns=non_contact_columns).notna().astype(int)

Ned had previously said that the contact code weights in the `WeightatAcpt` column were determined by the first letter of the contact code.

Therefore, take the sum of interactions for each letter, and use these columns in place of the original contact codes.

In [None]:
# Replace the individual contact codes with one column per leading letter that
# holds the row-wise total of every code starting with that letter.
# (The letter string lists 'V' before 'U'; this only affects column order.)
for letter in "ABCDEFGHIJKLMNOPQRSTVUWXYZ":
    filter_col = [col for col in shadow_matrix if col.startswith(letter)]
    if filter_col:
        letter_total = shadow_matrix[filter_col].sum(axis=1)
        # Drop the source columns *before* writing the total: if a column is
        # already named by the bare letter (a one-letter code, or this cell
        # being run a second time) writing first and dropping afterwards would
        # delete the total that was just stored.
        shadow_matrix = shadow_matrix.drop(columns=filter_col)
        shadow_matrix[letter] = letter_total

Some contact codes rarely or never appear and should be dropped.  Keep only the fields for which more than 1000 students have at least one interaction.

In [None]:
# Number of rows with at least one interaction, per column; keep the columns
# where that number exceeds 1000.
rows_with_interaction = (shadow_matrix > 0).sum()
frequent_columns = shadow_matrix.columns[rows_with_interaction > 1000]
shadow_matrix = shadow_matrix[frequent_columns]

In [None]:
# Disabled export: when uncommented, this joins the student IDs onto
# shadow_matrix by index and writes the result to
# ../data/processed/contact_codes_at_application.csv.
# pd.merge(shadow_matrix,data['Unique_student_ID'],left_index=True,right_index=True).to_csv('../data/processed/contact_codes_at_application.csv')

## PCA

In [None]:
# Standardise every retained column before PCA
from sklearn.preprocessing import scale
standardized_values = scale(shadow_matrix)
scaled = pd.DataFrame(standardized_values, columns=shadow_matrix.columns.values)

# Fit a six-component PCA, then project the standardised rows onto it
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(scaled)
pca_samples = pca.transform(scaled)

In [None]:
# Summarise the fitted PCA with the project helper module loaded from ../src/visualization/.
# NOTE(review): the helper's return value is not visible here — presumably a per-component table; confirm.
pca_results = vis.pca_results(scaled, pca)

In [None]:
# Display the running totals of pca_results (accumulated down the rows if it is a DataFrame — TODO confirm type).
pca_results.cumsum()

## Scree Plot

In [None]:
# Explained variance: scree plot drawn by the project helper from the fitted PCA
vis.plot_explained_variance_ratio(pca);

## Biplot

In [None]:
# Biplot drawn by the project helper; it is given the unscaled matrix, its standardised copy and the fitted PCA.
vis.biplot(shadow_matrix, scaled, pca);

### Linear Regression: Target of `WeightatAcpt`

In [None]:
from sklearn.model_selection import train_test_split
# The legacy `sklearn.preprocessing.Imputer` import was dropped: nothing in this
# notebook uses it and the class no longer exists in scikit-learn 0.22+, so the
# import alone made this cell fail.

# Regression target: the WeightatAcpt column. Rows without a target value are
# excluded from both the feature matrix and the target vector.
y = data['WeightatAcpt']
has_target = ~y.isna()
X = shadow_matrix[has_target]
y = y[has_target]

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Least-squares fit of WeightatAcpt on the per-letter contact columns;
# the fitted estimator is shown as the cell output.
reg = LinearRegression().fit(X_train, y_train)
reg

#### Feature Importance

In [None]:
# One fitted coefficient per retained column, labelled with the column name
feature_importance = pd.Series(reg.coef_, index=shadow_matrix.columns.values)

# Horizontal bars: coefficient value along x, column name along y
ax = feature_importance.plot.barh()
ax.set_xlabel("Feature Weight")
ax.set_ylabel("Feature");

In [None]:
# Residual plot from the project helper; whatever it returns replaces `reg`
# and is used for the predictions below.
reg = vis.residual_error(X_train, X_test, y_train, y_test)
plt.ylim(-3000, 3000)
plt.xlim(1500)
plt.xticks(rotation=70);

# Each row's share of the mean absolute percentage error, computed over every
# row of X (training and test rows alike); the shares are summed when printing.
predicted = reg.predict(X.values)
absolute_error = abs(y - predicted)
MAPE = 1/len(y) * absolute_error/y

print("The Mean Absolute Percentage Error is: %.2f percent" % (MAPE.sum()*100))

### How good a predictor of enrollment is weight?

First plot the distributions against each other.

In [None]:
# WeightatAcpt by enrollment outcome, drawn twice side by side with the same
# y-limits: every row as a point on the left, a box plot without fliers on the right.
f, (strip_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 6))

sns.stripplot(data=data, x="Enrolled", y="WeightatAcpt", size=1, ax=strip_ax)
strip_ax.set_ylim(800, 3000)

sns.boxplot(data=data, x="Enrolled", y="WeightatAcpt", showfliers=False, ax=box_ax)
box_ax.set_ylim(800, 3000);

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_gaussian_quantiles

In [None]:
# Target: Enrolled cast to integers (0/1 for boolean values); missing values
# are treated as not enrolled.
y = data['Enrolled'].fillna(False).astype(int).values
# Single feature: WeightatAcpt as a column vector, missing weights filled with 0.
X = np.array(data['WeightatAcpt'].fillna(0)).reshape(-1,1)

plot_colors = 'br'
status = ['Unlikely to Enroll','Likely to Enroll']

# Create and fit an AdaBoosted ensemble of depth-1 decision trees.
# random_state is pinned so repeated runs build the same ensemble.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200,
                         random_state=0)

bdt.fit(X, y)

# Decision score of every row (scored on the same rows the model was fit on)
twoclass_output = bdt.decision_function(X)
plot_range = (twoclass_output.min(), twoclass_output.max())

# Overlay one normalised histogram of scores per actual class (0, then 1)
fig, ax = plt.subplots(figsize=(10,6))
for i, c in enumerate(plot_colors):
    ax.hist(twoclass_output[y == i],
            bins=20,
            range=plot_range,
            facecolor=c,
            label=status[i],
            alpha=.5,
            edgecolor='k',
            density=True)

# Stretch the upper y-limit by 20% to leave room for the legend
x1, x2, y1, y2 = ax.axis()
ax.axis((x1, x2, y1, y2 * 1.2))
ax.legend(loc='upper right')
# density=True normalises each histogram, so the axis shows density rather
# than a raw sample count.
ax.set_ylabel('Density')
ax.set_xlabel('Score')
ax.set_title('Decision Scores')

fig.tight_layout()
# Zoom in on a fixed score window and hide the x tick marks
ax.set_xlim(-0.25,-0.05)
ax.set_xticks([]);