There are four relationships we are interested in modelling:

 1. The number of accepted students, expressed as a function of the amount charged for room and board

 2. The number of accepted students, expressed as a function of the number of enrolled students per college
 3. The number of accepted students, expressed as a function of the number of full-time undergraduate students per college
 4. The number of accepted students, expressed as a function of the amount charged for room and board coupled with the number of
    enrolled students.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Shell out to `ls` and echo the names found in the input directory.
print(str(check_output(["ls", "../input"]), "utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
# Read the college data set, using the first CSV column as the row index.
X = pd.read_csv(filepath_or_buffer="../input/College.csv", index_col=0)
# Print a summary of the columns that were loaded.
X.info()

In [None]:
# Preview the first five rows of the data set.
X.head(n=5)

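The .map() method is like .apply(), but instead of passing in a lambda / function, you can simply provide a dict that maps existing values (keys) to their replacements (values). Any value that is missing from the dict becomes NaN.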

In [None]:
# Encode the Yes/No flag as 1/0 so the column is numeric.
X['Private'] = X['Private'].map({'Yes': 1, 'No': 0})

# 1. Accepted students by room charge

The first relationship we're interested in is the number of accepted students, as a function of the amount charged for room and board.

In [None]:
from sklearn.model_selection import train_test_split

# Single predictor (room-and-board charge) and the target (accepted
# students); `y` is reused by the experiments further down.
X_rb = X['Room.Board']  # Series
y = X['Accept']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_rb, y, test_size=0.3, random_state=7
)

In [None]:
from sklearn import linear_model
# Single LinearRegression estimator; every later cell refits this same object.
model = linear_model.LinearRegression()

In [None]:
# fit(), score() and predict() all want a 2-D (samples x features) array,
# so the single-feature Series is turned into a one-column matrix first.
model.fit(X_train.values[:, None], y_train)
X_test = X_test.values[:, None]
score = model.score(X_test, y_test)
score

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

plt.style.use('ggplot')  # apply the ggplot look to the figures drawn below


def drawLine(model, X_test, y_test, title, R2):
    """Scatter the test observations and overlay the model's predictions.

    The plot title, which is also printed, is ``title`` followed by the
    R2 value; the model's intercept is printed as well.  ``X_test`` is
    passed unchanged to ``model.predict``.
    """
    axes = plt.figure().add_subplot(111)

    # Observations as green dots, predictions as a thin orange line.
    axes.scatter(X_test, y_test, c='g', marker='o')
    predicted = model.predict(X_test)
    axes.plot(X_test, predicted, color='orange', linewidth=1, alpha=0.7)

    heading = title + " R2: " + str(R2)
    axes.set_title(heading)
    print(heading)
    print("Intercept(s): ", model.intercept_)

    plt.show()


In [None]:
# Plot the held-out observations against the fitted line.
drawLine(model=model, X_test=X_test, y_test=y_test,
         title="Accept(Room&Board)", R2=score)

# 2. Enrolled vs. accepted students

Model the number of accepted students, as a function of the number of enrolled students per college.

In [None]:
# Predictor: the Enroll column; the target `y` is unchanged.
X_en = X['Enroll']  # Series
X_train, X_test, y_train, y_test = train_test_split(
    X_en, y, test_size=0.3, random_state=7
)

# Turn the 1-D feature into a single column before fitting and scoring.
model.fit(X_train.values[:, None], y_train)
X_test = X_test.values[:, None]
score = model.score(X_test, y_test)

drawLine(model, X_test, y_test, "Accept(Enroll)", score)

# 3. Accepted vs. full-time undergraduate students

In [None]:
# Predictor: the F.Undergrad column; the target `y` is unchanged.
X_fu = X['F.Undergrad']  # Series
X_train, X_test, y_train, y_test = train_test_split(
    X_fu, y, test_size=0.3, random_state=7
)

# Turn the 1-D feature into a single column before fitting and scoring.
model.fit(X_train.values[:, None], y_train)
X_test = X_test.values[:, None]
score = model.score(X_test, y_test)

drawLine(model, X_test, y_test, "Accept(F.Undergrad)", score)


# 4. accepted students by enrolled students AND room charge

In [None]:
# Two predictors this time, so the feature matrix is already 2-D and
# needs no reshape.
X_rb_en = X[['Room.Board', 'Enroll']]  # DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    X_rb_en, y, test_size=0.3, random_state=7
)

# Fit and score on the bare NumPy values, as the single-feature cells do.
# Fitting on a DataFrame makes scikit-learn record the column names and
# then warn when drawPlane() predicts on a plain ndarray grid.
# X_test itself stays a DataFrame.
model.fit(X_train.values, y_train)
score = model.score(X_test.values, y_test)
score

In [None]:
import numpy as np

def drawPlane(model, X_test, y_test, title, R2):
    """Plot two-feature test observations against the regression plane.

    Args:
        model: Fitted estimator exposing ``predict`` and ``intercept_``.
        X_test: Two-column test features (DataFrame, ndarray or nested
            list); column 0 is drawn on the x axis, column 1 on the y axis.
        y_test: Observed target values, drawn on the z axis.
        title: Plot title; the R2 value is appended to it.
        R2: Score shown in the title.

    The final title and the model's intercept are also printed.
    """
    fig = plt.figure()
    # Request the 3-D axes from the figure rather than calling Axes3D(fig):
    # recent matplotlib releases no longer attach an Axes3D built that way
    # to the figure, which leaves the plot empty.
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel('prediction')

    # You might have passed in a DataFrame, a Series (slice),
    # an NDArray, or a Python List... so let's keep it simple:
    X_test = np.array(X_test)
    col1 = X_test[:, 0]
    col2 = X_test[:, 1]

    # Set up a coarse grid over the observed range of both features.
    # Predicting on the actual col1/col2 values would produce a mesh far
    # too fine to read.
    x_min, x_max = col1.min(), col1.max()
    y_min, y_max = col2.min(), col2.max()
    x = np.arange(x_min, x_max, (x_max - x_min) / 10)
    y = np.arange(y_min, y_max, (y_max - y_min) / 10)
    x, y = np.meshgrid(x, y)

    # Predict on every grid node, then fold the flat result back into the
    # grid's shape so it can be drawn as a surface.
    z = model.predict(np.c_[x.ravel(), y.ravel()])
    z = z.reshape(x.shape)

    ax.scatter(col1, col2, y_test, c='g', marker='o')
    ax.plot_wireframe(x, y, z, color='orange', alpha=0.7)

    title += " R2: " + str(R2)
    ax.set_title(title)
    print(title)
    print("Intercept(s): ", model.intercept_)

    plt.show()


In [None]:
# Plot the held-out observations against the fitted plane.
drawPlane(model=model, X_test=X_test, y_test=y_test,
          title="Accept(Room&Board,Enroll)", R2=score)