# Linear Regression

## Part 1 - Data Preprocessing

### Importing the dataset

In [125]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('Admission_Predict.csv')

In [126]:
dataset.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


### Check Duplicated

In [127]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         400 non-null    int64  
 1   GRE Score          400 non-null    int64  
 2   TOEFL Score        400 non-null    int64  
 3   University Rating  400 non-null    int64  
 4   SOP                400 non-null    float64
 5   LOR                400 non-null    float64
 6   CGPA               400 non-null    float64
 7   Research           400 non-null    int64  
 8   Chance of Admit    400 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 28.3 KB


In [128]:
dataset.duplicated().sum()

np.int64(0)

### Check Null Values

In [129]:
dataset.isnull().sum()

Serial No.           0
GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

### Getting the inputs and output

In [130]:
X = dataset.iloc[:,:-1].values

In [131]:
X

array([[  1.  , 337.  , 118.  , ...,   4.5 ,   9.65,   1.  ],
       [  2.  , 324.  , 107.  , ...,   4.5 ,   8.87,   1.  ],
       [  3.  , 316.  , 104.  , ...,   3.5 ,   8.  ,   1.  ],
       ...,
       [398.  , 330.  , 116.  , ...,   4.5 ,   9.45,   1.  ],
       [399.  , 312.  , 103.  , ...,   4.  ,   8.78,   0.  ],
       [400.  , 333.  , 117.  , ...,   4.  ,   9.66,   1.  ]],
      shape=(400, 8))

In [132]:
y = dataset.iloc[:,-1]

In [133]:
y

0      0.92
1      0.76
2      0.72
3      0.80
4      0.65
       ... 
395    0.82
396    0.84
397    0.91
398    0.67
399    0.95
Name: Chance of Admit , Length: 400, dtype: float64

### Creating the Training Set and the Test Set

In [134]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Normalazation Data 

In [135]:
from sklearn.preprocessing import MinMaxScaler
min_max  = MinMaxScaler()
X_train = min_max.fit_transform(X_train)
X_test = min_max.transform(X_test)

In [136]:
X_train

array([[0.00502513, 0.64      , 0.64285714, ..., 0.375     , 0.59935897,
        1.        ],
       [0.04271357, 0.56      , 0.64285714, ..., 0.5       , 0.64102564,
        0.        ],
       [0.50502513, 1.        , 1.        , ..., 0.875     , 0.99679487,
        1.        ],
       ...,
       [0.6758794 , 0.32      , 0.46428571, ..., 0.5       , 0.45512821,
        1.        ],
       [0.8718593 , 0.24      , 0.25      , ..., 0.25      , 0.14423077,
        0.        ],
       [0.25376884, 0.48      , 0.5       , ..., 0.625     , 0.46474359,
        0.        ]], shape=(320, 8))

In [137]:
X_test

array([[ 0.52261307,  0.22      ,  0.42857143,  0.5       ,  0.625     ,
         0.75      ,  0.42307692,  1.        ],
       [ 0.70100503,  0.42      ,  0.35714286,  0.5       ,  0.875     ,
         0.75      ,  0.58974359,  1.        ],
       [ 0.08040201,  1.        ,  0.78571429,  1.        ,  0.75      ,
         0.75      ,  0.8974359 ,  1.        ],
       [ 0.52512563,  0.7       ,  0.57142857,  0.75      ,  0.875     ,
         0.75      ,  0.72435897,  1.        ],
       [ 0.23115578,  0.22      ,  0.17857143,  0.25      ,  0.5       ,
         0.5       ,  0.34615385,  1.        ],
       [ 0.20854271,  1.        ,  0.82142857,  1.        ,  0.875     ,
         0.875     ,  0.84935897,  1.        ],
       [ 0.8241206 ,  0.14      ,  0.14285714,  0.25      ,  0.375     ,
         0.125     ,  0.34935897,  0.        ],
       [ 0.23366834,  0.26      ,  0.25      ,  0.5       ,  0.25      ,
         0.375     ,  0.27564103,  0.        ],
       [ 0.66582915,  0.44      

## Part 2 - Building and training the model

### Building the model

In [138]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

### Training the model

In [139]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Inference

Making the predictions of the data points in the test set

In [140]:
y_pred = model.predict(X_test)

In [141]:
y_pred

array([0.65574435, 0.73430228, 0.90436809, 0.82259969, 0.55788233,
       0.91160429, 0.54264953, 0.52628146, 0.66340936, 0.81851662,
       0.68593422, 0.93401326, 0.53815533, 0.86728918, 0.69609912,
       0.65125774, 0.67311052, 0.51406476, 0.69792256, 1.00910305,
       0.55736174, 0.62087043, 0.74014935, 0.53946085, 0.91009173,
       0.83559979, 0.67767011, 0.56291224, 0.68639694, 0.78926028,
       0.83817553, 0.95031015, 0.66204304, 0.48865961, 0.66414965,
       0.64822431, 0.70045985, 0.673208  , 0.60509982, 0.85898307,
       0.77063988, 0.58089016, 0.75263348, 0.93347497, 0.83261751,
       0.86772458, 0.93092666, 0.63943982, 0.89463172, 0.85382072,
       0.872162  , 0.75161614, 0.82011878, 0.9320531 , 0.60990902,
       0.56969517, 0.68880011, 0.83945734, 0.56622967, 0.84683954,
       0.65260813, 0.66365908, 0.72015857, 0.48817044, 0.6102768 ,
       0.68964218, 0.64532972, 0.85752904, 0.89112601, 0.77430894,
       0.74441559, 0.79456207, 0.84202392, 0.80971924, 0.55237

Making the prediction of a single data point with AT = 15, V = 40, AP = 1000, RH = 75

## Part 3: Evaluating the model

### intercept (c)

In [142]:
model.intercept_

np.float64(0.32588992548355555)

### coef(M)

In [143]:
model.coef_

array([0.07198146, 0.09791145, 0.09937819, 0.04328157, 0.00777268,
       0.06629466, 0.30773998, 0.02168741])

### R-Score

In [144]:
model.score(X_train,y_train)

0.8157123222604523

In [145]:
model.score(X_test,y_test)

0.8212241793299222

### R-Squared

In [146]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred)

In [147]:
r2

0.8212241793299222

### Adjusted R-Squared

In [148]:
k = X_test.shape[1]
n = X_test.shape[0]
adjusted_r2 = 1 - (1-r2) * (n-1)/(n-k-1)

In [149]:
adjusted_r2

0.8010804248882233