In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px



import os
import warnings

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Silence every warning raised by the libraries for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Load the student test-score CSV into `df` and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/predict-test-scores-of-students/test_scores.csv'
)
df.head(n=5)

In [None]:
# Drop the columns the analysis does not use.
# Re-assigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['classroom', 'student_id'], errors='ignore')

# EXPLORATORY DATA ANALYSIS

In [None]:
# Colour palettes shared by every chart below (ordered dark -> light).
colors_blue = ["#132C33", "#264D58", '#17869E', '#51C4D3', '#B4DBE9']
colors_dark = ["#1F1F1F", "#313131", '#636363', '#AEAEAE', '#DADADA']
colors_green = ['#01411C', '#4B6F44', '#4F7942', '#74C365', '#D0F0C0']

# Preview the palettes in the same order as before: blue, green, dark.
for palette in (colors_blue, colors_green, colors_dark):
    sns.palplot(palette)

### GENDER

In [None]:
# Count students per gender. The slice labels are taken from the data itself:
# value_counts() orders by frequency, so a hard-coded ['Male', 'Female'] list
# would silently mislabel the slices whenever 'Female' is the larger group.
gender_counts = (
    df['gender']
    .value_counts()
    .rename_axis('gender')
    .reset_index(name='count')
)

fig = px.pie(
    gender_counts,
    values='count',
    names='gender',
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
    hole=0.6,
)
fig.update_traces(textposition='outside', textinfo='percent+label')
# Caption placed in the middle of the donut hole.
fig.add_annotation(text='Gender', x=0.5, y=0.5, showarrow=False)
fig.update_layout(
    font_family='monospace',
    title=dict(
        text='Gender Distribution',
        x=0.5,
        y=0.98,
        font=dict(size=20, color=colors_dark[2]),
    ),
)

### SCHOOL TYPE

In [None]:
# Count students per school type. Labels come from the data rather than a
# hard-coded ['Public', 'Non Public'] list, which only matched the slices as
# long as 'Public' happened to be the most frequent category.
school_type_counts = (
    df['school_type']
    .value_counts()
    .rename_axis('school_type')
    .reset_index(name='count')
)

fig = px.pie(
    school_type_counts,
    values='count',
    names='school_type',
    hole=0.6,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
)
# Caption placed in the middle of the donut hole.
fig.add_annotation(text='School Type', x=0.5, y=0.5, showarrow=False)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(
    font_family='monospace',
    title=dict(
        text='School Type Distribution',
        x=0.5,
        y=0.98,
        font=dict(color=colors_dark[2], size=20),
    ),
)

### TEACHING METHOD

In [None]:
# Count students per teaching method once and feed the tidy table to plotly,
# so values and labels are guaranteed to stay aligned.
teaching_method_counts = (
    df['teaching_method']
    .value_counts()
    .rename_axis('teaching_method')
    .reset_index(name='count')
)

fig = px.pie(
    teaching_method_counts,
    values='count',
    names='teaching_method',
    hole=0.6,
    color_discrete_sequence=[colors_green[3], colors_blue[3]],
)
# Caption placed in the middle of the donut hole.
fig.add_annotation(text='Teaching Method', x=0.5, y=0.5, showarrow=False)
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.update_layout(
    font_family='monospace',
    title=dict(
        # Title typo fixed ("Distributuion" -> "Distribution").
        text='Teaching Method Distribution',
        x=0.5,
        y=0.98,
        font=dict(color=colors_dark[2], size=20),
    ),
)

### LUNCH

In [None]:
# Donut chart of how many students fall into each lunch category.
lunch_counts = df['lunch'].value_counts()
slice_colours = [colors_green[3], colors_blue[3]]

fig = px.pie(
    df,
    values=lunch_counts,
    names=lunch_counts.index,
    hole=0.6,
    color_discrete_sequence=slice_colours,
)
# Caption placed in the middle of the donut hole.
fig.add_annotation(text='Lunch', x=0.5, y=0.5, showarrow=False)
fig.update_traces(textposition='outside', textinfo='percent+label')

title_style = dict(
    text='Eligible For Lunch ?',
    x=0.5,
    y=0.98,
    font=dict(color=colors_dark[2], size=20),
)
fig.update_layout(font_family='monospace', title=title_style)

## BAR PLOTS (IMPACT OF DIFFERENT FIELDS ON SCORES)

In [None]:
# Mean pretest / posttest score per gender.
# The columns are selected with a list: the tuple form
# groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
gender_means = df.groupby('gender')[['pretest', 'posttest']].mean()

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(gender_means))
bar_width = 0.35

fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(positions, gender_means['pretest'], bar_width,
       label='Pretest', color=colors_green[3])
ax.bar(positions + bar_width, gender_means['posttest'], bar_width,
       label='Posttest', color=colors_blue[3])

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(gender_means.index)
ax.set_xlabel('Gender')
ax.set_ylabel('Scores')
ax.set_title('Gender Wise Comparison Of Test Scores')
ax.legend(loc='best')
plt.show()

In [None]:
# Mean pretest / posttest score per school type.
# The columns are selected with a list: the tuple form
# groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
school_type_means = df.groupby('school_type')[['pretest', 'posttest']].mean()

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(school_type_means))
bar_width = 0.3

fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(positions, school_type_means['pretest'], bar_width,
       color=colors_green[3], label='Pretest')
ax.bar(positions + bar_width, school_type_means['posttest'], bar_width,
       color=colors_blue[3], label='Posttest')

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(school_type_means.index)
ax.set_title('Effect of School Type on Scores')
ax.set_xlabel('School Type')
ax.set_ylabel('Scores')
ax.legend(loc='best')
plt.show()

In [None]:
# Mean pretest / posttest score per teaching method.
# The columns are selected with a list: the tuple form
# groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
teaching_method_means = df.groupby('teaching_method')[['pretest', 'posttest']].mean()

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(teaching_method_means))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(positions, teaching_method_means['pretest'], bar_width,
       color=colors_green[3], label='Pretest')
ax.bar(positions + bar_width, teaching_method_means['posttest'], bar_width,
       color=colors_blue[3], label='Posttest')

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(teaching_method_means.index)
ax.set_title('Effect of Teaching Method on Scores')
ax.set_xlabel('Teaching Method')
ax.set_ylabel('Scores')
# The bars carry labels, so draw the legend that explains the two colours.
ax.legend(loc='best')
plt.show()

In [None]:
# Mean pretest / posttest score per school setting.
# The columns are selected with a list: the tuple form
# groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
school_setting_means = df.groupby('school_setting')[['pretest', 'posttest']].mean()

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(school_setting_means))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(13, 7))
ax.bar(positions, school_setting_means['pretest'], bar_width,
       color=colors_green[3], label='Pretest')
ax.bar(positions + bar_width, school_setting_means['posttest'], bar_width,
       color=colors_blue[3], label='Posttest')

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(school_setting_means.index)
ax.set_title('Effect of School Setting on Scores')
ax.set_xlabel('School Setting')
ax.set_ylabel('Scores')
# The bars carry labels, so draw the legend that explains the two colours.
ax.legend(loc='best')
plt.show()

In [None]:
# Mean pretest / posttest score per lunch category.
# The columns are selected with a list: the tuple form
# groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
lunch_means = df.groupby('lunch')[['pretest', 'posttest']].mean()

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(lunch_means))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(12, 7))
ax.bar(positions, lunch_means['pretest'], bar_width,
       color=colors_green[3], label='Pretest')
ax.bar(positions + bar_width, lunch_means['posttest'], bar_width,
       color=colors_blue[3], label='Posttest')

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
# NOTE(review): these short labels assume exactly two lunch categories whose
# sorted order is "not qualified" first, then "qualified" (groupby sorts its
# keys) -- confirm against lunch_means.index.
ax.set_xticklabels(['Not Qualified', 'Qualified'])
ax.set_title('Effect of Being Qualified for Lunch on Scores')
ax.set_xlabel('Lunch')
ax.set_ylabel('Scores')
# The bars carry labels, so draw the legend that explains the two colours.
ax.legend(loc='best')
plt.show()

In [None]:
# Mean pretest / posttest score per school, best posttest first.
# - The columns are selected with a list: the tuple form
#   groupby(...)['pretest', 'posttest'] was removed in pandas 2.0.
# - head(10) keeps only the ten best schools so the chart matches its title;
#   previously every school was plotted under a "Top 10" heading.
top_schools = (
    df.groupby('school')[['pretest', 'posttest']]
    .mean()
    .sort_values(['posttest', 'pretest'], ascending=False)
    .head(10)
)

sns.set_style('darkgrid')
plt.rcParams['font.size'] = 15

positions = np.arange(len(top_schools))
bar_width = 0.35

fig, ax = plt.subplots(figsize=(12, 7))
ax.bar(positions, top_schools['pretest'], bar_width,
       label='Pretest', color=colors_green[3])
ax.bar(positions + bar_width, top_schools['posttest'], bar_width,
       label='Posttest', color=colors_blue[3])

# Bars are centred on `positions` and `positions + bar_width`, so the middle
# of each pair lies half a bar width to the right of `positions`.
ax.set_xticks(positions + bar_width / 2)
ax.set_xticklabels(top_schools.index, rotation=90)
ax.set_title('Top 10 Schools with Best Posttest Result')
ax.set_xlabel('School')
ax.set_ylabel('Scores')
ax.legend(loc='best')
plt.show()

### PAIR PLOT

In [None]:
# Use a smaller font for the pair plot, then draw pairwise relationships of
# the columns in `df`, coloured by gender.
plt.rcParams.update({'font.size': 10})
sns.pairplot(data=df, hue='gender')

# PREPROCESSING

In [None]:
# Total number of missing cells across the whole frame.
df.isna().to_numpy().sum()

**The dataset has no null values.**

In [None]:
# Numeric predictors: every numeric column except the target. The target is
# excluded by name rather than by position ([:-1]) so that it can never leak
# into the features if the column order of the data changes.
numeric_cols = [
    col for col in df.select_dtypes(np.number).columns.tolist()
    if col != 'posttest'
]

# Categorical predictors: every object column except the first one.
# NOTE(review): the skipped first object column is presumably the
# high-cardinality 'school' identifier -- confirm against df.columns.
categorical_cols = df.select_dtypes('object').columns.tolist()[1:]

# Explicit copy: `x` is assigned into later on, and writing to a slice of `df`
# triggers pandas' SettingWithCopy behaviour.
x = df[numeric_cols + categorical_cols].copy()
y = df['posttest']

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# The feature matrix is rebuilt from `df` (not by mutating `x`) so this cell
# gives the same result however many times it is run.

# Scale the numeric predictors to the [0, 1] range.
# NOTE: the scaler is fitted on the full data set, i.e. before the train/test
# split performed later, so held-out rows influence the scaling range. Fit on
# the training rows only if a strictly leak-free evaluation is required.
scaler = MinMaxScaler().fit(df[numeric_cols])
scaled_numeric = pd.DataFrame(
    scaler.transform(df[numeric_cols]),
    columns=numeric_cols,
    index=df.index,
)

# One-hot encode the categorical predictors into a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (the old name was later
# removed); fall back to the old keyword on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(df[categorical_cols])

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; support both.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = encoder.get_feature_names_out(categorical_cols).tolist()
else:
    encoded_cols = encoder.get_feature_names(categorical_cols).tolist()

encoded_categorical = pd.DataFrame(
    encoder.transform(df[categorical_cols]),
    columns=encoded_cols,
    index=df.index,
)

# Final design matrix: encoded categorical columns first, then scaled numerics.
x = pd.concat([encoded_categorical, scaled_numeric], axis=1)

# TRAINING

In [None]:
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV,RepeatedStratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import explained_variance_score

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluating the baseline models.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Candidate regressors. Stochastic models are seeded so the ranking is
# reproducible between runs.
# LogisticRegression is intentionally not part of the comparison: it is a
# classifier and would treat every distinct posttest score as a separate class
# instead of modelling a continuous target.
models = [
    ('LR', LinearRegression()),
    ('DT', DecisionTreeRegressor(random_state=42)),
    ('RF', RandomForestRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42)),
    ('SVR', SVR()),
]
scores = []

for name, model in models:
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # explained_variance_score expects (y_true, y_pred); the metric is not
    # symmetric, so swapping the arguments yields a different value.
    score = explained_variance_score(y_test, preds)
    scores.append([name, model, score])


In [None]:
# Tabulate the baseline results and show them best score first.
score_columns = ['Name', 'Model', 'Score']
scores_df = pd.DataFrame(data=scores, columns=score_columns)
scores_df.sort_values(by='Score', ascending=False)

**LR, XGB and RF outperform the other models, so only these three are considered for further hyperparameter tuning.**

# HYPERPARAMETER TUNING

In [None]:
# Search space per model: the estimator to tune ('model') and the candidate
# values for each hyperparameter ('params').
param_grid = {
    'LR': {
        'model': LinearRegression(n_jobs=-1),
        'params': {
            # `normalize` was removed from LinearRegression in scikit-learn
            # 1.2 and would make the search fail with an invalid-parameter
            # error, so it is no longer part of the grid.
            'fit_intercept': [True, False],
        },
    },

    'RF': {
        'model': RandomForestRegressor(n_jobs=-1, random_state=42),
        'params': {
            'n_estimators': [10, 20, 100],
            'max_depth': [4, 8, 12, 16, 20],
            # 1.0 (use all features) replaces 'auto', which meant the same for
            # regressors and was removed in scikit-learn 1.3.
            'max_features': [1.0, 'sqrt', 'log2'],
        },
    },

    'XGB': {
        # Seeded so repeated searches are comparable.
        'model': XGBRegressor(random_state=42),
        'params': {
            'n_estimators': [10, 20, 100],
            'max_depth': [4, 8, 12, 16, 20],
            'gamma': [0, 0.1, 0.001, 0.2, 1],
            'reg_alpha': [1, 0, 0.1, 0.001, 2],
        },
    },
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV, RepeatedKFold

# Plain repeated K-fold: stratification is a classification concept and, on a
# regression target, only groups rows by exact score value. Seeded so the
# folds (and therefore the chosen parameters) are reproducible.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

scoress = []
for model_name, spec in param_grid.items():
    rv = RandomizedSearchCV(
        spec['model'],
        spec['params'],
        cv=cv,
        n_iter=20,
        # Same metric as the rest of the notebook, so the tuned scores can be
        # compared with the baseline and final-model scores.
        scoring='explained_variance',
        random_state=42,
    )
    rv.fit(x_train, y_train)
    scoress.append([model_name, dict(rv.best_params_), rv.best_score_])

# One row per tuned model: its best parameters and mean cross-validated score.
data = pd.DataFrame(scoress, columns=['Model', 'Parameters', 'Score'])
data

**Random Forest and XGBoost score higher after hyperparameter tuning, so we will ensemble the two.**

# FINAL MODEL

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import RepeatedKFold

# Look the tuned settings up by model name instead of by row position, so the
# right parameters are used even if the order of the results table changes.
params = data.set_index('Model')['Parameters']
model = VotingRegressor(estimators=[
    ('RF', RandomForestRegressor(n_jobs=-1, random_state=42, **params['RF'])),
    ('XGB', XGBRegressor(random_state=42, **params['XGB'])),
])

accuracy = []
# Plain repeated K-fold (stratification does not apply to a regression
# target), seeded for reproducible folds.
skf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

for train_index, test_index in skf.split(x, y):
    # split() yields positional indices, so select rows with .iloc; .loc only
    # worked while the frame happened to carry a default 0..n-1 index.
    x_train, x_test = x.iloc[train_index], x.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    # explained_variance_score expects (y_true, y_pred); the metric is not
    # symmetric, so the argument order matters.
    score = explained_variance_score(y_test, preds)
    accuracy.append(score)

In [None]:
# Average explained-variance score over all cross-validation folds.
np.asarray(accuracy).mean()

**Ensembling seems to have affected the results negatively, so we should use Random Forest on its own instead.**