# model training

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the dataset from the Excel workbook (path is relative to the working directory)
DATASET_PATH = "student_CGPA_prediction_dataset (1).xlsx"
df = pd.read_excel(DATASET_PATH)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.isnull().sum()             # dataset has no null values

In [None]:
# Encoding
# Replace Grade with a binary flag: 0 when the label contains 'fail', 1 otherwise.
# The guard makes the cell safe to re-run: once Grade holds integers the .str
# accessor would raise, so the encoding is only applied while Grade is still text.
if not pd.api.types.is_numeric_dtype(df['Grade']):
    df['Grade']=np.where(df['Grade'].str.contains('fail'),0,1)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df['Grade'].value_counts()

In [None]:
# Separate the dependent variable (y) from the independent features (x).
# CGPA is the regression target; Total, Percentage and Grade are removed from the
# inputs as well (presumably because they are derived from the marks - confirm).
y = df['CGPA']
x = df.drop(columns=['Total', 'Percentage', 'Grade', 'CGPA'])

In [None]:
x.head()

In [None]:
y

In [None]:
# train test split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
X_train.shape,X_test.shape

## Feature selection based on correlation

In [None]:
X_train.corr()

## feature selection

In [None]:
# Check for multicollinearity: annotated heatmap of the pairwise
# correlations between the training features.
corr = X_train.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr, annot=True)

In [None]:
# Here we do feature selection based on correlation: if the correlation between two features is above the threshold, we drop one of them and keep only the important features.


In [None]:
X_train.corr()

In [None]:
def correlation(dataset,threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Feature table. Only numeric/boolean columns take part in the check;
        any other column is ignored.
    threshold : float
        A column is flagged when the absolute value of its correlation with
        any column positioned before it is strictly greater than this value.

    Returns
    -------
    set
        Names of the flagged columns. For every correlated pair it is the
        later column (by position) that is reported, so dropping the returned
        columns keeps the first member of each pair.
    """
    col_corr=set()
    # Restrict to columns that corr() can handle so a stray text column does not
    # break the computation.
    corr_matrix=dataset.select_dtypes(include=["number","bool"]).corr()
    # Walk the strict lower triangle of the matrix itself; its size can be
    # smaller than len(dataset.columns) when non-numeric columns were left out.
    for i in range(corr_matrix.shape[0]):
        for j in range(i):
            if abs(corr_matrix.iloc[i,j])>threshold:
                col_corr.add(corr_matrix.columns[i])
                break  # one hit is enough to flag this column
    return col_corr

In [None]:
## The threshold is decided by domain expertise
corr_features = correlation(dataset=X_train, threshold=0.75)

In [None]:
corr_features

In [None]:
# Here we can see that there is no multicollinearity among our input features, i.e. the subject columns,
# so we get an empty set.
# No pair of features is correlated above the threshold.
# So we will use all features for training our model.

In [None]:
# Since we are using all the features in the dataset for training, we do not drop any feature.

## Feature scaling or standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both the training and the test split.
scaler = StandardScaler()
scaler.fit(X_train)
x_train_scl = scaler.transform(X_train)
x_test_scl = scaler.transform(X_test)

In [None]:
x_train_scl

## Box plots to understand effects of standard scaler

In [None]:
# Side-by-side box plots of the training features before and after scaling.
# Both panels are created by a single plt.subplots call: mixing plt.subplots()
# with plt.subplot(1,2,k) leaves an extra full-figure axes behind the panels on
# newer matplotlib releases, which no longer delete overlapping axes.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(15,5))
sns.boxplot(data=X_train, ax=ax_before)
ax_before.set_title("X_train before scalling")
# The scaler returns a plain array; re-attach the column names so the scaled
# panel is labelled by feature instead of by positional index.
x_train_scl_df = pd.DataFrame(x_train_scl, columns=X_train.columns)
sns.boxplot(data=x_train_scl_df, ax=ax_after)
ax_after.set_title('X_train after scalling')

## Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# Fit ordinary least squares on the scaled training data.
legreg = LinearRegression()
legreg.fit(x_train_scl, y_train)

# Evaluate on the held-out test split.
y_pred_test = legreg.predict(x_test_scl)
mae = mean_absolute_error(y_test, y_pred_test)
score = r2_score(y_test, y_pred_test)

print("Mean Absolute error:", mae)
print("r2 Score:", score)

## Lasso Regression

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score

# Fit a Lasso model (library default settings) on the scaled training data.
lasso = Lasso()
lasso.fit(x_train_scl, y_train)

# Evaluate on the held-out test split.
y_pred_test = lasso.predict(x_test_scl)
mae = mean_absolute_error(y_test, y_pred_test)
score = r2_score(y_test, y_pred_test)

print("Mean Absolute error:", mae)
print("r2 Score:", score)

## Ridge regression model

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, r2_score

# Fit a Ridge model (library default settings) on the scaled training data.
ridge = Ridge()
ridge.fit(x_train_scl, y_train)

# Evaluate on the held-out test split.
y_pred_test = ridge.predict(x_test_scl)
mae = mean_absolute_error(y_test, y_pred_test)
score = r2_score(y_test, y_pred_test)

print("Mean Absolute error:", mae)
print("r2 score", score)

## Elasticnet regression

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, r2_score

# Fit an ElasticNet model (library default settings) on the scaled training data.
elastic = ElasticNet()
elastic.fit(x_train_scl, y_train)

# Evaluate on the held-out test split.
y_pred_test = elastic.predict(x_test_scl)
mae = mean_absolute_error(y_test, y_pred_test)
score = r2_score(y_test, y_pred_test)

print("Mean Absolute error:", mae)
print("r2 score", score)

In [None]:
## Making a pickle file of our ridge model

In [None]:
# The ridge model is selected instead of the linear one, as the linear model may lead to overfitting

In [None]:
import pickle

# Persist the fitted scaler and the ridge model to disk.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, even if pickling raises.
with open("student_CGPA_prediction_project_scaler_model.pkl",'wb') as scaler_file:
    pickle.dump(scaler,scaler_file)
with open("student_CGPA_prediction_project_ridge_model.pkl",'wb') as ridge_file:
    pickle.dump(ridge,ridge_file)