In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

In [None]:
from warnings import filterwarnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings raised by the plotting and modelling calls below.
filterwarnings('ignore')

In [None]:
data = pd.read_csv(r'../input/student-grade-prediction/student-mat.csv')

In [None]:
data.head().transpose()

In [None]:
cardinality = {'columns' : data.columns,
               'cardinal' : []}

In [None]:
for i in cardinality['columns']:
    cardinality['cardinal'].append(data[i].nunique())

In [None]:
len(data)

MISSING VALUES

In [None]:
# Draw the nullity matrix of the frame to check for missing values per column.
missingno.matrix(data)

NO MISSING VALUES !

### EDA

1. Dependent Variable

In [None]:
# Horizontal bar chart of how often each final grade occurs, rarest first.
plt.figure(figsize=(10, 10))
grade_counts = data['G3'].value_counts().sort_values()
grade_counts.plot.barh(width=0.8, color=sns.color_palette("RdBu", 40))

In [None]:
# Count of students per final grade.
# The series is passed as `x=` explicitly: on seaborn >= 0.12 the first
# positional argument is `data`, so a bare positional Series is no longer
# interpreted as the x variable.
b = sns.countplot(x=data['G3'])
b.set_xlabel('Final Grade')
b.set_ylabel('Count')

There seems to be an unusually high number of 0s. These could be placeholders for missing grades, but since we saw above that the dataset has no null values, we leave them as they are.

#### Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns only.
# `data` still holds string columns at this point; pandas >= 2.0 no longer
# drops them silently inside `corr()`, so select the numeric ones explicitly.
corr = data.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix on a 12x12 inch canvas.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, cmap='coolwarm', annot=True, ax=ax)

Lets see the Distribution of Age Variable, and its relation with gender

In [None]:
# Histogram of student ages (counts, no density curve).
# `distplot` is deprecated; `histplot` is its replacement (seaborn >= 0.11).
sns.histplot(x=data['age'], color='r')

In [None]:
# Smoothed density of student ages.
# `fill=True` replaces the deprecated `shade=True` (seaborn >= 0.11).
sns.kdeplot(x=data['age'], fill=True, color='r')

In [None]:
# Number of students at each age, split by sex.
# Variables are given by column name with `data=` so the call does not depend
# on the meaning of the first positional argument, which changed in seaborn 0.12.
plt.figure(figsize=(7, 5))
sns.countplot(x='age', hue='sex', data=data)

In [None]:
# Distinct codes used in the `sex` column.
pd.unique(data['sex'])

Lets see relation between sex and grades

In [None]:
# Final-grade density for female vs. male students.
# `fill=True` replaces the deprecated `shade=True`, and the legend is drawn
# explicitly because seaborn >= 0.11 does not show `label=` entries on its own.
for code, label in (('F', 'Female'), ('M', 'Male')):
    sns.kdeplot(x=data.loc[data['sex'] == code, 'G3'], label=label, fill=True)
plt.title('Does gender affect your grades?', fontsize=20)
plt.legend()
plt.show()

Lets see how relationships affect grades

In [None]:
# Final-grade density for students in a relationship vs. single students.
# `fill=True` replaces the deprecated `shade=True`, and the legend is drawn
# explicitly because seaborn >= 0.11 does not show `label=` entries on its own.
for code, label in (('yes', 'Relationship'), ('no', 'Single')):
    sns.kdeplot(x=data.loc[data['romantic'] == code, 'G3'], label=label, fill=True)
plt.title('Does relationship affect studies?', fontsize=20)
plt.legend()
plt.show()

In [None]:
# Final-grade density for urban vs. rural students.
# `fill=True` replaces the deprecated `shade=True`, and the legend is drawn
# explicitly because seaborn >= 0.11 does not show `label=` entries on its own.
for code, label in (('U', 'Urban'), ('R', 'Rural')):
    sns.kdeplot(x=data.loc[data['address'] == code, 'G3'], label=label, fill=True)
plt.title('Do urban students score higher than rural students?', fontsize=20)
plt.xlabel('Grade', fontsize=20)
plt.ylabel('Density', fontsize=20)
plt.legend()
plt.show()

In [None]:
# Age density for urban vs. rural students.
# `fill=True` replaces the deprecated `shade=True`, and the legend is drawn
# explicitly because seaborn >= 0.11 does not show `label=` entries on its own.
for code, label in (('U', 'Urban'), ('R', 'Rural')):
    sns.kdeplot(x=data.loc[data['address'] == code, 'age'], label=label, fill=True)
plt.title('Do urban students attend more years of school?', fontsize=20)
plt.legend()
plt.show()

In [None]:
# Snapshot of the frame taken before the encoding steps below modify `data`.
copy_set = data.copy()

### LABEL ENCODING:


In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [None]:
# Names of all columns that take exactly two distinct values.
binary_cols = [name for name in data.columns if data[name].nunique() == 2]

In [None]:
for i in binary_cols:
    data[i] = LabelBinarizer().fit_transform(data[i])

In [None]:
data.head().transpose()

In [None]:
categorical_cols = [column for column in data.columns if (data[column].nunique()<=30)]

In [None]:
str_col = [col for col in categorical_cols if data[col].dtype =='O']

In [None]:
str_col

In [None]:
for i in str_col:
    print(i, ' :', data[i].unique())

We can one hot encode these features

In [None]:
# Snapshot of the frame after binarization and before the one-hot step below.
data_ = data.copy()

In [None]:
# One-hot encode every column listed in `str_col` in a single call, dropping
# the first level of each to avoid redundant dummy columns.
# Encoding starts from the `data_` snapshot rather than from `data` itself, so
# re-running this cell gives the same result instead of failing on columns
# that a previous run already removed.
data = pd.get_dummies(data_, columns=str_col, drop_first=True)

### Classification - 
    - If the G3 score is at least 10 (G3 >= 10), the student has passed; otherwise the student has failed

In [None]:
# Start the target column as a copy of the final grade; it is converted to a
# 0/1 pass flag in a later cell.
data['pass'] = data['G3'].copy()

In [None]:
def classify(x):
    """Return 1 when the grade ``x`` is 10 or higher, otherwise 0."""
    return int(x >= 10)

# Turn the copied grade into the binary pass/fail target, element by element.
data['pass'] = data['pass'].map(classify)

In [None]:
# Classification frame: all three grade columns are removed.
data_clf = data.drop(columns=['G1', 'G2', 'G3'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
# The target column is dropped with `columns=`: the positional form
# `drop('pass', 1)` was removed in pandas 2.0 and raises a TypeError there.
X_train, X_test, y_train, y_test = train_test_split(
    data_clf.drop(columns='pass'),
    data_clf['pass'],
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm  import SVC
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.linear_model import LogisticRegression

In [None]:
# Accumulator for the baseline results: one independent list per column.
scores_df = {key: [] for key in ('name', 'train_score', 'test_score')}

In [None]:
def base_score(model_info):
    """Fit one ``(estimator, name)`` pair and record its scores.

    The estimator is trained on the global ``X_train``/``y_train`` split, and
    its name, train score and test score are appended to the global
    ``scores_df`` accumulator.
    """
    estimator, label = model_info
    estimator.fit(X_train, y_train)

    scores_df['name'].append(label)
    train_result = estimator.score(X_train, y_train)
    scores_df['train_score'].append(train_result)
    test_result = estimator.score(X_test, y_test)
    scores_df['test_score'].append(test_result)

In [None]:
# Baseline candidates as (estimator, short name) pairs.
# The tree ensembles are seeded so the baseline score table is reproducible
# from one run to the next.
models = [
    (RandomForestClassifier(random_state=42), 'rf'),
    (GradientBoostingClassifier(random_state=42), 'gbc'),
    (LogisticRegression(), 'lr'),
    (BernoulliNB(), 'naive_b'),
    (GaussianNB(), 'naive_g'),
    (SVC(), 'svc'),
    (xgb.XGBClassifier(random_state=42), 'xgb'),
]

In [None]:
# Fit and score every baseline candidate in turn.
for model_info in models:
    base_score(model_info)

In [None]:
# Turn the accumulated results into a table indexed by model name.
scores_df = pd.DataFrame(scores_df).set_index('name')
scores_df

In [None]:
from sklearn.metrics import classification_report, accuracy_score

### HyperParameter Optimization :

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base estimators handed to the hyper-parameter search below.
# The stochastic ones are seeded so the search results are reproducible.
model_rf = RandomForestClassifier(random_state=42)
model_gbc = GradientBoostingClassifier(random_state=42)
model_xgb = xgb.XGBClassifier(random_state=42)
model_lr = LogisticRegression()

In [None]:
# Search space for each estimator, keyed by the estimator instance itself.
params = {
    model_rf: {
        'n_estimators': np.arange(10, 100, 10),
        # Floats are fractions of the features; the original integer 1 meant
        # "one feature per split", not "all features".
        'max_features': [0.2, 0.5, 1.0],
        'max_depth': [2, 3, 5, 7],
    },
    model_gbc: {
        'n_estimators': np.arange(10, 100, 10),
        # np.arange(0.01, 0.05, 1) produced the single value 0.01 because the
        # step was 1; linspace gives an actual range of learning rates.
        'learning_rate': np.linspace(0.01, 0.05, 5),
        'subsample': [0.2, 0.5, 0.8, 1.0],
        'max_depth': [2, 3, 5],
    },
    model_xgb: {
        'max_depth': [2, 3, 5],
        'subsample': [0.2, 0.5, 1.0],
        'n_estimators': np.arange(40, 150, 10),
        # Same step-of-1 problem as above: only 0.01 was ever tried.
        'learning_rate': np.linspace(0.01, 0.5, 8),
    },
    model_lr: {
        'penalty': ['l2', 'l1'],
        'C': np.arange(0.1, 1, 0.1),
        # The default lbfgs solver rejects the l1 penalty; liblinear supports
        # both penalties listed above.
        'solver': ['liblinear'],
    },
}

In [None]:
# Holds the best estimator found for each entry of `params`, in dict order.
best_estimators = []

In [None]:
# Run a 3-fold randomized search for every estimator and keep the best
# configuration of each, in the order the estimators appear in `params`.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell does not accumulate duplicate entries.
best_estimators = [
    RandomizedSearchCV(model, search_space, cv=3, n_jobs=-1, random_state=42)
    .fit(X_train, y_train)
    .best_estimator_
    for model, search_space in params.items()
]

In [None]:
def scoring(estimator):
    """Refit ``estimator`` on the training split and print its test-set score."""
    estimator.fit(X_train, y_train)
    test_result = estimator.score(X_test, y_test)
    print(test_result)


# Show each tuned estimator followed by its test-set score.
for candidate in best_estimators:
    print(candidate)
    scoring(candidate)

Thus we can conclude that Logistic Regression performs best among the models compared above.

### ENSEMBLE

In [None]:
# Random forest member of the ensemble with fixed hyper-parameters.
# NOTE(review): presumably the values reported by the random search above — confirm.
model_rf = RandomForestClassifier(max_depth=5, max_features=0.5, n_estimators=50)

In [None]:
# Logistic regression member of the ensemble (regularisation strength C=0.1).
model_lr = LogisticRegression(C=0.1)

In [None]:
# XGBoost member of the ensemble, spelled out with a full explicit parameter list.
# NOTE(review): this looks like a pasted estimator repr; several of these keyword
# arguments may not be accepted by other xgboost versions — verify against the
# installed release before trimming or upgrading.
model_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
                              importance_type='gain', interaction_constraints='',
                              learning_rate=0.01, max_delta_step=0, max_depth=5,
                              min_child_weight=1, monotone_constraints='()',
                              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.2,
                              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Train the three ensemble members on the full feature set.
for member in (model_rf, model_xgb):
    member.fit(X_train, y_train)
model_lr.fit(X_train, y_train)

In [None]:
def Voting(data, models=None):
    """Predict by strict majority vote over several fitted binary classifiers.

    Parameters
    ----------
    data : array-like
        Feature matrix passed unchanged to each model's ``predict``.
    models : iterable of fitted estimators, optional
        Classifiers whose ``predict`` returns 0/1 labels. When omitted, the
        notebook-level ``model_lr``, ``model_xgb`` and ``model_rf`` are used,
        which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        One 0/1 label per row: 1 where more than half of the models predicted
        1, otherwise 0 (so a tie resolves to 0).

    Raises
    ------
    ValueError
        If ``models`` is given but empty.
    """
    if models is None:
        # Resolved at call time so rebinding the globals is picked up.
        models = (model_lr, model_xgb, model_rf)
    models = list(models)
    if not models:
        raise ValueError("Voting requires at least one model")

    # Number of models voting for class 1, per sample.
    votes = np.sum([np.asarray(model.predict(data)) for model in models], axis=0)

    # Strict majority; for three models this is exactly "sum >= 2".
    return (2 * votes > len(models)).astype(int)

In [None]:
preds = Voting(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, preds)

The ensemble model works better than each of the individual classifiers!

### MODEL INTERPRETATION :

In [None]:
rf_imps = model_rf.feature_importances_

In [None]:
xgb_imps = model_xgb.feature_importances_

In [None]:
cols = X_train.columns

In [None]:
df = {'columns' : cols,
      'rf_imp': rf_imps,
      'xgb_imps': xgb_imps}

In [None]:
df = pd.DataFrame(df)

In [None]:
df['mean_importance'] = (df['rf_imp'] + df['xgb_imps'])/2

In [None]:
df = df.sort_values(by=['mean_importance'], ascending = False)

In [None]:
df_copy = df.copy()

Let's take only the 15 most important features (`num_cols = 15`) and see if our model performs better

In [None]:
num_cols = 15

In [None]:
df = df_copy.copy()

In [None]:
df = df.head(num_cols)

In [None]:
df.shape

In [None]:
columns = df['columns']

In [None]:
X_train_2 = X_train[columns]

In [None]:
X_test_2 = X_test[columns]

In [None]:
# Fresh, unfitted random forest with the same hyper-parameters as the earlier
# ensemble member; rebinding the name makes Voting() use the new model.
model_rf = RandomForestClassifier(max_depth=5, max_features=0.5, n_estimators=50)

In [None]:
# Fresh, unfitted logistic regression (C=0.1), rebinding the earlier name.
model_lr = LogisticRegression(C=0.1)

In [None]:
# Fresh, unfitted XGBoost model with the same explicit parameter list as the
# earlier ensemble member.
# NOTE(review): several of these keyword arguments may not be accepted by other
# xgboost versions — verify against the installed release.
model_xgb = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
                              importance_type='gain', interaction_constraints='',
                              learning_rate=0.01, max_delta_step=0, max_depth=5,
                              min_child_weight=1, monotone_constraints='()',
                              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.2,
                              tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
X_train_2.shape

In [None]:
model_rf.fit(X_train_2, y_train)
model_xgb.fit(X_train_2, y_train)
model_lr.fit(X_train_2, y_train)

In [None]:
# Majority-vote predictions of the ensemble retrained on the reduced features.
preds = Voting(X_test_2)

from sklearn.metrics import accuracy_score
# Share of held-out rows the reduced-feature ensemble labels correctly.
accuracy_score(y_true=y_test, y_pred=preds)

This is close enough to the accuracy we got with 39 features

### MOST IMPORTANT FEATURES IN DETERMINING IF A STUDENT PASSES ARE: 
    - 'failures', 'higher', 'absences', 'goout', 'age', 'schoolsup', 'Walc',
    - 'Medu', 'Fjob_teacher', 'studytime', 'traveltime', 'reason_reputation',
    - 'freetime', 'Dalc', 'Mjob_services'