# Import Packages

In [None]:
import warnings
# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# modelling libraries used below.
warnings.filterwarnings('ignore')


import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score


from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier



# Default size applied to every matplotlib figure that does not set its own.
plt.rcParams['figure.figsize'] = (10,6)


# Load Dataset

In [None]:
# The maths-course file is left disabled; only the Portuguese-course file is used.
#mat = pd.read_csv('/kaggle/input/student-alcohol-consumption/student-mat.csv')
# Read the Portuguese-course student records into `data`.
data =pd.read_csv('/kaggle/input/student-alcohol-consumption/student-por.csv')

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Let's go through the dataset columns:
- school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
- sex - student's sex (binary: 'F' - female or 'M' - male)
- age - student's age (numeric: from 15 to 22)
- address - student's home address type (binary: 'U' - urban or 'R' - rural)
- famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
- Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
- Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
- Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education)
- Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
- reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
- guardian - student's guardian (nominal: 'mother', 'father' or 'other')
- traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
- studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
- failures - number of past class failures (numeric: n if 1<=n<3, else 4)
- schoolsup - extra educational support (binary: yes or no)
- famsup - family educational support (binary: yes or no)
- paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
- activities - extra-curricular activities (binary: yes or no)
- nursery - attended nursery school (binary: yes or no)
- higher - wants to take higher education (binary: yes or no)
- internet - Internet access at home (binary: yes or no)
- romantic - with a romantic relationship (binary: yes or no)
- famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
- freetime - free time after school (numeric: from 1 - very low to 5 - very high)
- goout - going out with friends (numeric: from 1 - very low to 5 - very high)
- Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
- Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
- health - current health status (numeric: from 1 - very bad to 5 - very good)
- absences - number of school absences (numeric: from 0 to 93)

These grades are related with the course Portuguese:
- G1 - first period grade (numeric: from 0 to 20)
- G2 - second period grade (numeric: from 0 to 20)
- G3 - final grade (numeric: from 0 to 20, output target)

# Preprocessing

In [None]:
# Count the missing values in each column.
data.isnull().sum()

**In our dataset we have 0 Null values**

In [None]:
# Print the dtype and non-null count of every column.
data.info()

**Most columns have the `object` dtype; the remaining ones are `int64`.**

In [None]:
# Statistical summary of the numeric columns (one row per column), with a bar
# on the mean and colour gradients on the std, median and max.
(
    data.describe()
    .T.style.bar(subset=['mean'])
    .background_gradient(subset=['std'])
    .background_gradient(subset=['50%'])
    .background_gradient(subset=['max'])
)

**We observe the following points:**
1. Age has the highest mean value and also the highest median (50%) value.
2. Absences has the highest standard deviation, which means its distribution is the most spread out.
3. Absences also has the largest maximum value.

In [None]:
# Grand total: the three period grades summed into a single score per student.
data['G_Total'] = data['G1'].add(data['G2']).add(data['G3'])

In [None]:
# Fold weekend consumption (Walc) into Dalc: from here on 'Dalc' holds the
# combined workday + weekend score instead of the workday score alone.
# NOTE(review): Dalc is overwritten in place, so re-running this cell adds Walc
# a second time; the 'Walc' column itself stays in the frame.
data['Dalc'] = data['Dalc']+data['Walc']

In [None]:
# List the distinct values present in the 'Dalc' column.
data['Dalc'].unique()

# EDA

In [None]:
# Separate the categorical (non-numeric) columns from the numerical ones.
# Selecting "everything that is not a number" rather than comparing against the
# object dtype keeps text columns in cat_col even when pandas stores them with
# its dedicated string dtype. Column order is preserved in both lists.
cat_col = data.select_dtypes(exclude='number').columns.tolist()
num_col = [x for x in data.columns if x not in cat_col]


In [None]:
# Distribution of the target: number of students at each total grade, drawn as
# unfilled bars with thick coloured outlines.
plt.figure(figsize=(12, 6))
sns.countplot(
    data=data,
    x='G_Total',
    edgecolor=sns.color_palette("dark", 10),
    linewidth=5,
    facecolor=(0, 0, 0, 0),
)


In [None]:
# Students per age, split by the 'Dalc' consumption score.
plt.figure(figsize=(10, 6))
sns.countplot(x='age', hue='Dalc', data=data)
plt.tight_layout()
plt.show()

**Students aged 15-19 are the ones most engaged in alcohol consumption, but most of them are still at an initial drinking stage.**

**The 15-19 age group shows the most interest in drinking and contains every type of drinker, although most of them are beginners.**

In [None]:
# Examine every numeric feature (except the target) against the 'Dalc' score,
# one count plot per feature on a 5x4 grid.
plt.figure(figsize=(30,30))
feature = [x for x in num_col if 'G_Total' not in x]
for position, column in enumerate(feature, start=1):
    plt.subplot(5,4,position)
    # The column has to be passed as x=...: seaborn's categorical plots accept
    # only `data` positionally, so a bare column name is not read as the x variable.
    sns.countplot(x=column,hue='Dalc',data=data)
    plt.title(column+' vs workday alcohol Consumption (Dalc)')
    plt.xticks(rotation=45)

**We can observe the following things:**
- Most of the drinkers belong to the 15-18 age group, but their alcohol consumption is not very high.
- There is no clear pattern in Medu and Fedu, although consumption is very low when Medu or Fedu is 0. There is still a lot to explore in Medu and Fedu.
- Students who live far from school consume less alcohol; in other words, as travel time decreases, alcohol consumption increases.
- studytime: it is not conclusive, but more study time appears to go with less alcohol consumption.
- failures: students with 0 failures show the highest consumption.
- absences: fewer absences go together with the highest consumption.



**But still lot to analyse**




In [None]:
# Students per school, split by the 'Dalc' consumption score.
sns.countplot(x='school', hue='Dalc', data=data)

**So alcohol consumption is higher in GP than in MS, and most of those students are at the lowest consumption level.**

In [None]:
# Students per sex (both F and M), split by the 'Dalc' consumption score.
sns.countplot(data=data,x='sex',hue='Dalc',palette='Set3')

In [None]:
# Number of students at each 'Dalc' consumption score.
# The column is passed as x=... because seaborn's categorical plots accept only
# `data` positionally; a bare column name is not read as the x variable.
sns.countplot(x='Dalc',data=data,palette='winter')

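**No student has a score of 0 or 1. `Dalc` now holds workday + weekend consumption, and each of those scales starts at 1 (very low), so 2 is the lowest possible combined level. The values are consumption levels, not the number of times a student drinks per week.**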

In [None]:
# Total grade against the 'Dalc' score, one point per student, coloured by sex.
sns.catplot(data=data, x="Dalc", y="G_Total", hue='sex', kind="swarm")

**Very few students have a combined consumption score above 8, and most of them are male.**

In [None]:
# Label every student as below or not below the mean total grade, then plot
# total grade against the 'Dalc' score coloured by that label.
average = data['G_Total'].mean()
data['average'] = np.where(data['G_Total'] < average, 'under average', 'above average')
sns.swarmplot(
    data=data,
    x='Dalc',
    y='G_Total',
    hue='average',
    palette={'above average':'Red','under average':'green'},
)

**As the level of consumption increases, the average grade decreases. The students with the highest grades are at the lowest combined consumption level (2).**


**Let's try to find more information about grades and consumption**

In [None]:
# Mean total grade per 'Dalc' score, coloured by school, one panel per sex.
sns.catplot(data=data, kind='bar', x='Dalc', y='G_Total', hue='school', col='sex')

**Here we can clearly see that students of school GP are more engaged in alcohol consumption than those of MS, and that males slightly outnumber females.**

In [None]:
# Students per relationship status, split by the 'Dalc' consumption score.
sns.catplot(data=data, kind='count', x='romantic', hue='Dalc')

**As we can see, being in a romantic relationship has an impact on alcohol consumption: students who are not in a relationship consume more alcohol than those who are.**

**Let's check the effect of a relationship on alcohol consumption and total grade.**

In [None]:

# Mean total grade per 'Dalc' score, one panel per relationship status.
sns.catplot(data=data, kind='bar', x='Dalc', y='G_Total', col='romantic', palette='summer')

**For students who are in a relationship, alcohol consumption does not have much impact on their grades.**

In [None]:
# One count plot per categorical column, split by the 'Dalc' score, on a 5x4 grid.
plt.figure(figsize=(30, 30))
for slot, column in enumerate(cat_col, start=1):
    plt.subplot(5, 4, slot)
    sns.countplot(data=data, x=column, hue='Dalc', palette='nipy_spectral')
    
    

**From the above figures we observe the following:**
1. A large number of females are at the lowest combined consumption level (2), while some males reach the highest level (10).
2. Students from urban areas consume more alcohol than students from rural areas.
3. Students whose parents live apart show very low consumption compared to students whose parents live together.
4. The student's guardian also affects drinking: students whose guardian is someone other than their mother or father show very low alcohol consumption.
5. There is still a lot to explore.


In [None]:
# Compare the two schools: density of the total grade (left panel) and of the
# 'Dalc' consumption score (right panel).
GP = data[data.school == 'GP']
MS = data[data.school == 'MS']

fig,ax=plt.subplots(1,2,figsize=(12,6))
sns.kdeplot(GP.G_Total,label="GP",ax=ax[0])
sns.kdeplot(MS.G_Total,label="MS",ax=ax[0])

sns.kdeplot(GP.Dalc,label='GP',ax=ax[1],)
sns.kdeplot(MS.Dalc,label='MS',ax=ax[1])
ax[0].set_ylabel('')
ax[1].set_ylabel('')
# The label= values only show up once a legend is requested; without it the two
# curves in each panel cannot be told apart.
ax[0].legend()
ax[1].legend()

plt.show()

- **Students of GP have better overall grades than students of MS.**
- **In the right-hand figure we can see that students of GP also have higher alcohol consumption than students of MS.**

In [None]:
# Number of students per parental cohabitation status.
sns.countplot(data=data, x='Pstatus')

- **There are far more 'T' than 'A' values: fewer than 100 students have parents who live apart.**

In [None]:
# Total grade of every student, grouped by parental cohabitation status.
sns.catplot(data=data, x='Pstatus', y='G_Total')

- **Students with Pstatus 'T' achieve better grades.**

In [None]:
# Total grade of every student, grouped by guardian.
sns.catplot(data=data, x='guardian', y='G_Total')

**Students whose guardian is their mother show good results in grades.**

In [None]:
# Total grade against the 'Dalc' score, coloured by guardian.
sns.catplot(data=data, x='Dalc', y='G_Total', hue='guardian')

- **Students whose guardian is their mother perform well in grades, but they also consume alcohol on a large scale.**
- **Students whose guardian is their mother appear at every level of alcohol consumption.**

In [None]:
# Total grade of every student, grouped by home internet access.
sns.catplot(data=data, kind='swarm', x='internet', y='G_Total')

**Students with internet access perform much better than students without it.**


In [None]:
# Mean total grade by internet access, split by relationship status.
sns.barplot(data=data, x='internet', y='G_Total', hue='romantic')

In [None]:
# Mean total grade by internet access, split by the 'Dalc' consumption score.
sns.catplot(data=data, kind='bar', x='internet', y='G_Total', hue='Dalc')

In [None]:
# Overlay the distribution of each period grade with the total grade (one panel
# per period), then show the total grade on its own against a fitted normal curve.
# histplot(kde=True, stat='density') replaces distplot, which seaborn has
# deprecated; the bin choices may differ slightly from the old plots.
fig,ax=plt.subplots(1,3,figsize=(20,5))
sns.histplot(data['G1'],kde=True,stat='density',ax=ax[0])
sns.histplot(data['G_Total'],kde=True,stat='density',ax=ax[0],color='red')
ax[0].set_xlabel("G1 vs G_Total")

sns.histplot(data['G2'],kde=True,stat='density',ax=ax[1])
sns.histplot(data['G_Total'],kde=True,stat='density',ax=ax[1],color='green')
ax[1].set_xlabel("G2 vs G_Total")

sns.histplot(data['G3'],kde=True,stat='density',ax=ax[2])
sns.histplot(data['G_Total'],kde=True,stat='density',ax=ax[2])
ax[2].set_xlabel("G3 vs G_Total")
plt.show()

total_ax = sns.histplot(data['G_Total'],kde=True,stat='density',color='violet')
# distplot's fit=norm has no histplot equivalent, so the normal curve is fitted
# and drawn explicitly over the visible x range.
mu, sigma = norm.fit(data['G_Total'])
grid = np.linspace(*total_ax.get_xlim(), 200)
total_ax.plot(grid, norm.pdf(grid, mu, sigma), color='black')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# The frame still contains text columns at this point, and DataFrame.corr()
# raises on non-numeric data in current pandas, so restrict it to numeric columns.
plt.figure(figsize=(16,6))
sns.heatmap(data.select_dtypes(include='number').corr(),annot=True,cmap='cividis')

# Feature Engineering

In [None]:
# Encode the yes/no columns as 1 ('yes') / 0 (anything else).
for column in ('romantic', 'internet', 'higher', 'nursery',
               'activities', 'paid', 'famsup', 'schoolsup'):
    data[column] = np.where(data[column].values == 'yes', 1, 0)

In [None]:
# Encode the two-valued categorical columns: the listed value becomes 1,
# everything else becomes 0.
positive_value = {
    'school': 'GP',
    'sex': 'M',
    'address': 'U',
    'famsize': 'GT3',
    'Pstatus': 'A',
}
for column, value in positive_value.items():
    data[column] = np.where(data[column].values == value, 1, 0)

In [None]:
# Display the 'health' column.
data['health']

### One-Hot Encoding

In [None]:
# One-hot encode the multi-valued nominal columns, dropping the first level of
# each so the dummy columns are not redundant.
Mjob, Fjob, reason, guardian = (
    pd.get_dummies(data[column], prefix=prefix, drop_first=True)
    for column, prefix in (
        ('Mjob', 'M_job'),
        ('Fjob', 'F_job'),
        ('reason', 'reason'),
        ('guardian', 'guardian'),
    )
)

In [None]:
# Preview the dummy columns created for the mother's job.
Mjob.head()

In [None]:
# Remove the nominal columns that were one-hot encoded, the individual grades
# (already summed into G_Total) and the helper 'average' label.
data.drop(
    columns=['Mjob', 'Fjob', 'reason', 'guardian', 'G1', 'G2', 'G3', 'average'],
    inplace=True,
)

In [None]:
# Final modelling frame: the encoded base columns plus the dummy columns, side by side.
df_1 = pd.concat([data, Mjob, Fjob, reason, guardian], axis='columns')
df_1.head()

In [None]:
# If you want to Convert your data into 3 categories

#data['G_Total']=np.where(data['G_Total'].between(1,21),1,data['G_Total'])
#data['G_Total']=np.where(data['G_Total'].between(21,40),2,data['G_Total'])
#data['G_Total']=np.where(data['G_Total'] >39,3,data['G_Total'])
#data['G_Total'].plot.hist()
#data['G_Total'].value_counts()

In [None]:
# Target is the total grade; every other column is a feature.
y = df_1['G_Total']
X = df_1.drop(columns='G_Total')

In [None]:
# Show the feature-matrix and target shapes, one per line.
print(X.shape, y.shape, sep='\n')


# Data Split
**We have very few rows and a high number of columns, so the way the data is split matters: below we use both a single train/test split and cross-validation.**

In [None]:

# Hold out 30% of the rows for testing (fixed seed for reproducibility) and
# print the shape of each resulting piece.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=20, test_size=0.3
)
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Models
**There are a lot of columns in our dataset, so we can either use dimensionality reduction or choose classification algorithms that are robust to high dimensionality and multicollinearity.**


**Wondering what these heavy words mean? Let me explain:**
- **_Dimensionality reduction:_ the transformation of data from a high-dimensional space into a low-dimensional space so that the low-dimensional representation retains some meaningful properties of the original data.**

- **_High dimensionality:_ the number of dimensions is staggeringly high — so high that calculations become extremely difficult. With high-dimensional data, the number of features can exceed the number of observations.**

- **_Multicollinearity:_ it exists whenever an independent variable is highly correlated with one or more of the other independent variables in a multiple regression equation. Multicollinearity is a problem because it undermines the statistical significance of an independent variable.**




**NOTE :**
- ***Curse of Dimensionality :* Curse of Dimensionality refers to a set of problems that arise when working with high-dimensional data**

**MODELS :**
- **Decision Tree.**
- **SVC**
- **Random Forest.**
- **Adaboost.**
- **CatBoost.**
- **XGB classifier**

In [None]:
# Score collectors shared by the model cells below: `models` holds the hold-out
# accuracy and `cv_models` the mean 5-fold cross-validation score, both in the
# same order as `names`.
models=[]
names=['Decision Tree','SVC','Random Forest','Adaboost','XGB classifier']
cv_models=[]
# Decision Tree (the placeholder `dt=[]` assignment was dead code and is removed)
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt_pred=dt.predict(X_test)
models.append(accuracy_score(y_test,dt_pred))


cv_models.append(cross_val_score(dt,X,y,cv=5).mean())


In [None]:
# Support vector classifier: hold-out accuracy plus mean 5-fold CV score.
svc=SVC()
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_test)
# accuracy_score already returns a single number, so no .mean() is applied
# (calling it fails when that number is a plain Python float).
models.append(accuracy_score(y_test,svc_pred))
cv_models.append(cross_val_score(svc,X,y,cv=5).mean())

In [None]:
# Random forest: fit on the training split, score on the hold-out split, then
# record the mean 5-fold cross-validation score.
rf = RandomForestClassifier().fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_holdout_accuracy = accuracy_score(y_test, rf_pred)
models.append(rf_holdout_accuracy)
rf_fold_scores = cross_val_score(rf, X, y, cv=5)
cv_models.append(rf_fold_scores.mean())

In [None]:
# AdaBoost: hold-out accuracy plus mean 5-fold CV score.
ab = AdaBoostClassifier()
ab.fit(X_train,y_train)
ab_pred = ab.predict(X_test)
models.append(accuracy_score(y_test,ab_pred))
# Run cross-validation once and reuse the result, so the printed score is
# exactly the stored one and the models are not refitted a second time.
ab_cv_score = cross_val_score(ab,X,y,cv=5).mean()
cv_models.append(ab_cv_score)
print(ab_cv_score)

In [None]:
# XGBoost: hold-out accuracy plus mean 5-fold CV score.
# NOTE(review): recent xgboost releases appear to require class labels encoded
# as 0..n_classes-1; y holds raw G_Total sums, so confirm this fits on the
# installed version.
xgb =XGBClassifier()
xgb.fit(X_train,y_train)
xgb_pred =xgb.predict(X_test)
models.append(accuracy_score(y_test,xgb_pred))
# Run cross-validation once and reuse the result, so the printed score is
# exactly the stored one and the models are not refitted a second time.
xgb_cv_score = cross_val_score(xgb,X,y,cv=5).mean()
cv_models.append(xgb_cv_score)
print(xgb_cv_score)

In [None]:
# Convert the 0-1 scores into percentages. The factor is 100 (not 1000) because
# the results table labels these columns "in %".
cv_models=[i*100 for i in cv_models]
models = [i*100 for i in models]

In [None]:
# Results table: one row per model with its hold-out score and its
# cross-validation score.
final_df = pd.DataFrame(
    {
        'Model_names': names,
        'Train_test_split_score in %': models,
        'CV_Score in %': cv_models,
    }
)
final_df

# Future Updates:
- **Hyperparameter Optimization of models.**
- **Use other classification algorithms.**
- **Use of Dimensional Reduction techniques**

**I hope you enjoyed this notebook. If you liked it, please give it an upvote.**