In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/chance-of-admission-data/Admission_Predict.csv


In [3]:
data = pd.read_csv('/kaggle/input/chance-of-admission-data/Admission_Predict.csv')

In [4]:
data.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,3.114,3.374,3.484,8.57644,0.56,0.72174
std,144.481833,11.295148,6.081868,1.143512,0.991004,0.92545,0.604813,0.496884,0.14114
min,1.0,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,125.75,308.0,103.0,2.0,2.5,3.0,8.1275,0.0,0.63
50%,250.5,317.0,107.0,3.0,3.5,3.5,8.56,1.0,0.72
75%,375.25,325.0,112.0,4.0,4.0,4.0,9.04,1.0,0.82
max,500.0,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


In [5]:
correlation = data.corr()

In [7]:
correlation['Chance of Admit']

Serial No.           0.008505
GRE Score            0.810351
TOEFL Score          0.792228
University Rating    0.690132
SOP                  0.684137
LOR                  0.645365
CGPA                 0.882413
Research             0.545871
Chance of Admit      1.000000
Name: Chance of Admit, dtype: float64

In [8]:
def lr(X, y) :  
  # Split the data into training (80%) and testing (20%) sets
  train_size = int(0.8 * len(data))
  X_train, X_test = X[:train_size], X[train_size:]
  y_train, y_test = y[:train_size], y[train_size:]

  # Add a bias term to the features (X)
  X_train = np.c_[np.ones(X_train.shape[0]), X_train]
  X_test = np.c_[np.ones(X_test.shape[0]), X_test]

  # Initialize the coefficients (weights) with zeros
  theta = np.zeros(X_train.shape[1])

  # Define the learning rate and number of iterations
  learning_rate = 0.0001
  iterations = 100

  # Gradient Descent to optimize the model
  for i in range(iterations):
      # Calculate predictions
      predictions = np.dot(X_train, theta)
    
      # Calculate the error
      error = predictions - y_train
    
      # Calculate the gradient
      gradient = np.dot(X_train.T, error) / len(y_train)
    
      # Update the coefficients using gradient descent
      theta -= learning_rate * gradient

  # Predict on the test set
  y_pred = np.dot(X_test, theta)

  # Calculate SSE (Sum of Squared Errors)
  sse = np.sum((y_test - y_pred) ** 2)

  # Calculate MSE (Mean Squared Error)
  mse = sse / len(y_test)

  # Calculate R-squared (R2) score
  y_mean = np.mean(y_test)
  sst = np.sum((y_test - y_mean) ** 2)
  r2 = 1 - (sse / sst)

  # Print the coefficients
  print("Coefficients (Theta):", theta)

  # Print SSE, MSE, and R2 scores for the test set
  print("SSE:", sse)
  print("MSE:", mse)
  print("R2 Score:", r2)

In [11]:
X = data.drop(['Chance of Admit'], axis=1)
y = data['Chance of Admit']

In [12]:
lr(X, y)

Coefficients (Theta): [-1.18851170e+111 -2.60141716e+113 -3.76679809e+113 -1.27656159e+113
 -3.65733467e+111 -4.02903778e+111 -4.10454267e+111 -1.02309291e+112
 -6.53890575e+110]
SSE: 6.237825600282133e+234
MSE: 6.237825600282133e+232
R2 Score: -3.4422597484851455e+234


In [13]:
dataset = pd.get_dummies(data, columns=['University Rating', 'SOP',
                                       'LOR', 'Research'])
dataset.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,CGPA,Chance of Admit,University Rating_1,University Rating_2,University Rating_3,University Rating_4,University Rating_5,...,LOR_1.5,LOR_2.0,LOR_2.5,LOR_3.0,LOR_3.5,LOR_4.0,LOR_4.5,LOR_5.0,Research_0,Research_1
0,1,337,118,9.65,0.92,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
1,2,324,107,8.87,0.76,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
2,3,316,104,8.0,0.72,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
3,4,322,110,8.67,0.8,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,True
4,5,314,103,8.21,0.65,False,True,False,False,False,...,False,False,False,True,False,False,False,False,True,False


In [14]:
dataset.describe()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,CGPA,Chance of Admit
count,500.0,500.0,500.0,500.0,500.0
mean,250.5,316.472,107.192,8.57644,0.72174
std,144.481833,11.295148,6.081868,0.604813,0.14114
min,1.0,290.0,92.0,6.8,0.34
25%,125.75,308.0,103.0,8.1275,0.63
50%,250.5,317.0,107.0,8.56,0.72
75%,375.25,325.0,112.0,9.04,0.82
max,500.0,340.0,120.0,9.92,0.97


In [15]:
correlation = dataset.corr()

In [16]:
correlation['Chance of Admit']

Serial No.             0.008505
GRE Score              0.810351
TOEFL Score            0.792228
CGPA                   0.882413
Chance of Admit        1.000000
University Rating_1   -0.305903
University Rating_2   -0.393661
University Rating_3   -0.092499
University Rating_4    0.292087
University Rating_5    0.487790
SOP_1.0               -0.143354
SOP_1.5               -0.285291
SOP_2.0               -0.287613
SOP_2.5               -0.207673
SOP_3.0               -0.133841
SOP_3.5               -0.031776
SOP_4.0                0.201548
SOP_4.5                0.345386
SOP_5.0                0.350635
LOR_1.0               -0.095800
LOR_1.5               -0.182682
LOR_2.0               -0.346484
LOR_2.5               -0.191821
LOR_3.0               -0.187668
LOR_3.5                0.004148
LOR_4.0                0.144724
LOR_4.5                0.296658
LOR_5.0                0.356645
Research_0            -0.545871
Research_1             0.545871
Name: Chance of Admit, dtype: float64

In [19]:
X = dataset[['GRE Score', 'TOEFL Score', 'CGPA']]
y = dataset['Chance of Admit']

In [20]:
lr(X, y)

Coefficients (Theta): [-5.37116109e+95 -1.70392114e+98 -5.77989457e+97 -4.62881655e+96]
SSE: 3.589975901689675e+203
MSE: 3.5899759016896747e+201
R2 Score: -1.981079680050546e+203
