# Multivariate Linear Regression to predict UG Score

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Acquiring Data

The csv file contains student's details and academic credentials upto the 6<sup>th</sup> semester for the 2019 batch.

In [5]:
data = pd.read_csv("batch2019.csv", index_col=["S.No"])

In [6]:
data.head()

Unnamed: 0_level_0,Register Number,Student Name,Degree,Branch,Campus,Mobile Number,E mail ID,10 th %,12 th / Diploma %,UG %,Number of Backlogs,No. of History of Arrears
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,RA1511004010385,ANINDYA BASU,B. TECH,ECE,SRM IST,8240479070,anindyabasu_ama@srmuniv.edu.in,89.6,91.75,83.67,0,0
2,RA1511004010398,SHREYA CHAUDHARY,B. TECH,ECE,SRM IST,9566185403,shreyachaudhary_sud@srmuniv.edu.in,93.1,80.0,80.11,0,0
3,RA1511005010147,SHAYAN BANIK,B. Tech,EEE,SRM IST,8697353681,Shayanbanik_ni@srmuniv.edu.in,68.4,75.2,86.98,0,0
4,RA1511005010306,Anant Tiwari,B. Tech,EEE,SRM IST,7092928861,ananttiwari_vi@gmail.com,71.0,84.6,81.73,0,0
5,RA1511005010081,R.Ashwin,B. Tech,EEE,SRM IST,9940638271,rashwin_sr@srmuniv.edu.in,71.4,92.4,75.87,0,0


In [7]:
data.shape

(2619, 12)

## Data Preprocessing

The UG Percentage varies depending upon curriculum and department size, so I'll evaluate for my department in particular. 

In [8]:
data_IT = data.loc[data.Branch=='IT']

In [10]:
data_IT.head(10)

Unnamed: 0_level_0,Register Number,Student Name,Degree,Branch,Campus,Mobile Number,E mail ID,10 th %,12 th / Diploma %,UG %,Number of Backlogs,No. of History of Arrears
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1191,RA1511008020089,PRINCE KUMAR SINGH,B.Tech,IT,SRM IST,7358329889,125princekumar@gmail.com,7.0,63.0,82.91,0,NIL
1192,RA1511008020182,Shubham singh,B.Tech,IT,SRM IST,9198216360,ss329222@gmail.com,7.4,6.7,82.09,0,NIL
1193,RA1511008020181,Soumyajyoti dutta,B.Tech,IT,SRM IST,8902292248,soumyajyoti.sd@gmail.com,7.8,63.0,82.42,0,NIL
1194,RA1511008020144,Ambica Gupta,B.Tech,IT,SRM IST,9840375623,deepaguptarke@gmail.com,8.2,62.0,90.86,0,NIL
1195,RA1511008020075,Mayank Kumar,B.Tech,IT,SRM IST,7004219751,mayankkumar4ever@gmail.com,8.4,73.0,82.37,0,NIL
1196,RA1511008020134,Varsha Singh,B.Tech,IT,SRM IST,9840373128,singhvarsha1342019@gmail.com,8.4,73.0,86.75,0,NIL
1197,RA1511008020128,R.Meera Ranjani,B.Tech,IT,SRM IST,9840377842,rmeeraranjani@gmail.com,8.4,86.0,82.18,0,NIL
1198,RA1511008020023,Rishabh Pathak,B.Tech,IT,SRM IST,9840341156,rishabh97able@gmail.com,8.8,63.0,81.93,0,NIL
1199,RA1511008020038,Shraddha Bajaj,B.Tech,IT,SRM IST,7358559989,shraddhabajaj1997@gmail.com,8.8,73.0,90.47,0,NIL
1200,RA1511008020150,Richa Sharma,B.Tech,IT,SRM IST,9840331064,richasharma5912@gmail.com,8.8,76.0,84.39,0,NIL


In [11]:
data_IT.columns

Index(['Register Number', 'Student Name', 'Degree', 'Branch', 'Campus',
       'Mobile Number', 'E mail ID', '10 th %', '12 th / Diploma %', 'UG %',
       'Number of Backlogs', 'No. of History of Arrears '],
      dtype='object')

## Feature Selection

I'll use the input features as marks in class 10<sup>th</sup> and 12<sup>th</sup> to predict student's score in Undergraduate studies.

In [12]:
X = data_IT[data_IT.columns[7:9]].values

In [13]:
y = data_IT[data_IT.columns[9]].values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.10, random_state=42)

In [15]:
X.shape

(387, 2)

In [16]:
y.shape

(387,)

In [22]:
from keras.layers import Dense, Input
from keras.models import Model

from sklearn.preprocessing import OneHotEncoder

In [63]:
in_layer = Input((2,))
d1 = Dense(200, activation="sigmoid", name="layer_1")(in_layer)
d2 = Dense(100, activation="sigmoid", name="layer_2")(d1)
d3 = Dense(10, activation="sigmoid", name="layer_3")(d2)

In [64]:
model = Model(in_layer, d3)

In [65]:
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 2)                 0         
_________________________________________________________________
layer_1 (Dense)              (None, 200)               600       
_________________________________________________________________
layer_2 (Dense)              (None, 100)               20100     
_________________________________________________________________
layer_3 (Dense)              (None, 10)                1010      
Total params: 21,710
Trainable params: 21,710
Non-trainable params: 0
_________________________________________________________________


In [66]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [67]:
ohe = OneHotEncoder(categories="auto")

In [68]:
y_hot = ohe.fit_transform(y.reshape(-1, 1))

In [69]:
y_oe = (y % 2)

In [70]:
model.fit(X, y_hot, epochs=5)

ValueError: Error when checking target: expected layer_3 to have shape (10,) but got array with shape (313,)